v2-0002-Support-loser-tree-for-k-way-merge.patch
application/octet-stream
Filename: v2-0002-Support-loser-tree-for-k-way-merge.patch
Type: application/octet-stream
Part: 1
From 9dbf07f28d257bd6aef8bacb38076f0da9fc8a45 Mon Sep 17 00:00:00 2001
From: ChangAo Chen <cca5507@qq.com>
Date: Fri, 5 Dec 2025 23:48:05 +0800
Subject: [PATCH v2 2/2] Support loser tree for k-way merge.
The loser tree usually has fewer comparisons than the heap during
k-way merge. But the heap maybe better if the cardinality of the
input data is very low. Add a GUC 'enable_loser_tree' to control
the use of loser tree, default to off.
---
doc/src/sgml/config.sgml | 14 ++
src/backend/utils/misc/guc_parameters.dat | 7 +
src/backend/utils/misc/postgresql.conf.sample | 1 +
src/backend/utils/sort/tuplesort.c | 218 ++++++++++++++----
src/include/utils/guc.h | 1 +
src/test/regress/expected/sysviews.out | 3 +-
6 files changed, 202 insertions(+), 42 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 405c9689bd0..031591bb820 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -5617,6 +5617,20 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class="
</listitem>
</varlistentry>
+ <varlistentry id="guc-enable-loser-tree" xreflabel="enable_loser_tree">
+ <term><varname>enable_loser_tree</varname> (<type>boolean</type>)
+ <indexterm>
+ <primary><varname>enable_loser_tree</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Enables or disables the external k-way merge's use of loser tree.
+ The default is <literal>off</literal>.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry id="guc-enable-material" xreflabel="enable_material">
<term><varname>enable_material</varname> (<type>boolean</type>)
<indexterm>
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index 3b9d8349078..cbece911495 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -882,6 +882,13 @@
boot_val => 'true',
},
+{ name => 'enable_loser_tree', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
+ short_desc => 'Enables loser tree for external k-way merge.',
+ flags => 'GUC_EXPLAIN',
+ variable => 'enable_loser_tree',
+ boot_val => 'false',
+},
+
{ name => 'enable_material', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
short_desc => 'Enables the planner\'s use of materialization.',
flags => 'GUC_EXPLAIN',
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index dc9e2255f8a..2ffba918a43 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -412,6 +412,7 @@
#enable_incremental_sort = on
#enable_indexscan = on
#enable_indexonlyscan = on
+#enable_loser_tree = off
#enable_material = on
#enable_memoize = on
#enable_mergejoin = on
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index e28b8cfa868..57c3d0266dc 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -42,16 +42,16 @@
* end of the input is reached, we dump out remaining tuples in memory into
* a final run, then merge the runs.
*
- * When merging runs, we use a heap containing just the frontmost tuple from
- * each source run; we repeatedly output the smallest tuple and replace it
- * with the next tuple from its source tape (if any). When the heap empties,
- * the merge is complete. The basic merge algorithm thus needs very little
- * memory --- only M tuples for an M-way merge, and M is constrained to a
- * small number. However, we can still make good use of our full workMem
- * allocation by pre-reading additional blocks from each source tape. Without
- * prereading, our access pattern to the temporary file would be very erratic;
- * on average we'd read one block from each of M source tapes during the same
- * time that we're writing M blocks to the output tape, so there is no
+ * When merging runs, we use a heap or loser tree containing just the frontmost
+ * tuple from each source run; we repeatedly output the smallest tuple and
+ * replace it with the next tuple from its source tape (if any). When the heap
+ * or loser tree empties, the merge is complete. The basic merge algorithm thus
+ * needs very little memory --- only M tuples for an M-way merge, and M is
+ * constrained to a small number. However, we can still make good use of our
+ * full workMem allocation by pre-reading additional blocks from each source tape.
+ * Without prereading, our access pattern to the temporary file would be very
+ * erratic; on average we'd read one block from each of M source tapes during
+ * the same time that we're writing M blocks to the output tape, so there is no
* sequentiality of access at all, defeating the read-ahead methods used by
* most Unix kernels. Worse, the output tape gets written into a very random
* sequence of blocks of the temp file, ensuring that things will be even
@@ -120,8 +120,12 @@
#define INITIAL_MEMTUPSIZE Max(1024, \
ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1)
+/* LOSER_TREE_EOF is always a loser in comparison */
+#define LOSER_TREE_EOF -1
+
/* GUC variables */
bool trace_sort = false;
+bool enable_loser_tree = false;
#ifdef DEBUG_BOUNDED_SORT
bool optimize_bounded_sort = true;
@@ -206,6 +210,8 @@ struct Tuplesortstate
* space */
TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */
LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */
+ bool useLoserTree; /* use loser tree for k-way merge if true */
+ int16 *losers; /* array of losers, losers[0] is the winner */
/*
* This array holds the tuples now in sort memory. If we are in state
@@ -468,6 +474,8 @@ static void tuplesort_sort_memtuples(Tuplesortstate *state);
static void tuplesort_heap_build(Tuplesortstate *state);
static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple);
static void tuplesort_heap_delete_top(Tuplesortstate *state);
+static void tuplesort_loser_tree_build(Tuplesortstate *state);
+static void tuplesort_loser_tree_adjust(Tuplesortstate *state);
static void reversedirection(Tuplesortstate *state);
static unsigned int getlen(LogicalTape *tape, bool eofOK);
static void markrunend(LogicalTape *tape);
@@ -682,6 +690,10 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt)
state->base.tuples = true;
state->abbrevNext = 10;
+ /* Use loser tree for k-way merge if enable_loser_tree is true */
+ state->useLoserTree = enable_loser_tree;
+ state->losers = NULL;
+
/*
* workMem is forced to be at least 64KB, the current minimum valid value
* for the work_mem GUC. This is a defense against parallel sort callers
@@ -1652,11 +1664,12 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
*/
if (state->memtupcount > 0)
{
- int srcTapeIndex = state->memtuples[0].srctape;
+ int i = (state->useLoserTree ? state->losers[0] : 0);
+ int srcTapeIndex = state->memtuples[i].srctape;
LogicalTape *srcTape = state->inputTapes[srcTapeIndex];
SortTuple newtup;
- *stup = state->memtuples[0];
+ *stup = state->memtuples[i];
/*
* Remember the tuple we return, so that we can recycle its
@@ -1665,16 +1678,17 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
state->lastReturnedTuple = stup->tuple;
/*
- * Pull next tuple from tape, and replace the returned tuple
- * at top of the heap with it.
+ * Pull next tuple from tape, and adjust the heap or loser tree.
*/
if (!mergereadnext(state, srcTape, &newtup))
{
/*
* If no more data, we've reached end of run on this tape.
- * Remove the top node from the heap.
*/
- tuplesort_heap_delete_top(state);
+ if (state->useLoserTree)
+ state->memtuples[i].srctape = LOSER_TREE_EOF;
+ else
+ tuplesort_heap_delete_top(state);
state->nInputRuns--;
/*
@@ -1682,10 +1696,22 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
* anyway, but better to release the memory early.
*/
LogicalTapeClose(srcTape);
- return true;
}
- newtup.srctape = srcTapeIndex;
- tuplesort_heap_replace_top(state, &newtup);
+ else
+ {
+ newtup.srctape = srcTapeIndex;
+ if (state->useLoserTree)
+ state->memtuples[i] = newtup;
+ else
+ tuplesort_heap_replace_top(state, &newtup);
+ }
+
+ if (state->useLoserTree)
+ {
+ tuplesort_loser_tree_adjust(state);
+ if (state->losers[0] == LOSER_TREE_EOF)
+ state->memtupcount = 0;
+ }
return true;
}
return false;
@@ -2066,8 +2092,8 @@ mergeruns(Tuplesortstate *state)
init_slab_allocator(state, 0);
/*
- * Allocate a new 'memtuples' array, for the heap. It will hold one tuple
- * from each input tape.
+ * Allocate a new 'memtuples' array. It will hold one tuple from each
+ * input tape.
*
* We could shrink this, too, between passes in a multi-pass merge, but we
* don't bother. (The initial input tapes are still in outputTapes. The
@@ -2078,6 +2104,14 @@ mergeruns(Tuplesortstate *state)
state->nOutputTapes * sizeof(SortTuple));
USEMEM(state, GetMemoryChunkSpace(state->memtuples));
+ /* Allocate a 'losers' array if we will use loser tree for merge */
+ if (state->useLoserTree && state->losers == NULL)
+ {
+ state->losers = (int16 *) MemoryContextAlloc(state->base.maincontext,
+ state->nOutputTapes * sizeof(int16));
+ USEMEM(state, GetMemoryChunkSpace(state->losers));
+ }
+
/*
* Use all the remaining memory we have available for tape buffers among
* all the input tapes. At the beginning of each merge pass, we will
@@ -2195,62 +2229,72 @@ mergeruns(Tuplesortstate *state)
static void
mergeonerun(Tuplesortstate *state)
{
+ int i;
int srcTapeIndex;
LogicalTape *srcTape;
/*
* Start the merge by loading one tuple from each active source tape into
- * the heap.
+ * the heap or loser tree.
*/
beginmerge(state);
Assert(state->slabAllocatorUsed);
/*
- * Execute merge by repeatedly extracting lowest tuple in heap, writing it
- * out, and replacing it with next tuple from same tape (if there is
- * another one).
+ * Execute merge by repeatedly extracting the tuple in the heap or loser
+ * tree, writing it out, pulling next tuple from same tape (if there is
+ * another one), and adjusting the heap or loser tree.
*/
while (state->memtupcount > 0)
{
SortTuple stup;
/* write the tuple to destTape */
- srcTapeIndex = state->memtuples[0].srctape;
+ i = (state->useLoserTree ? state->losers[0] : 0);
+ srcTapeIndex = state->memtuples[i].srctape;
srcTape = state->inputTapes[srcTapeIndex];
- WRITETUP(state, state->destTape, &state->memtuples[0]);
+ WRITETUP(state, state->destTape, &state->memtuples[i]);
/* recycle the slot of the tuple we just wrote out, for the next read */
- if (state->memtuples[0].tuple)
- RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple);
+ if (state->memtuples[i].tuple)
+ RELEASE_SLAB_SLOT(state, state->memtuples[i].tuple);
- /*
- * pull next tuple from the tape, and replace the written-out tuple in
- * the heap with it.
- */
+ /* pull next tuple from the tape, and adjust the heap or loser tree */
if (mergereadnext(state, srcTape, &stup))
{
stup.srctape = srcTapeIndex;
- tuplesort_heap_replace_top(state, &stup);
+ if (state->useLoserTree)
+ state->memtuples[i] = stup;
+ else
+ tuplesort_heap_replace_top(state, &stup);
}
else
{
- tuplesort_heap_delete_top(state);
+ if (state->useLoserTree)
+ state->memtuples[i].srctape = LOSER_TREE_EOF;
+ else
+ tuplesort_heap_delete_top(state);
state->nInputRuns--;
}
+
+ if (state->useLoserTree)
+ {
+ tuplesort_loser_tree_adjust(state);
+ if (state->losers[0] == LOSER_TREE_EOF)
+ state->memtupcount = 0;
+ }
}
- /*
- * When the heap empties, we're done. Write an end-of-run marker on the
- * output tape.
- */
+ /* Done. Write an end-of-run marker on the output tape. */
markrunend(state->destTape);
}
/*
* beginmerge - initialize for a merge pass
*
- * Fill the merge heap with the first tuple from each input tape.
+ * Pull the first tuple from each input tape, and build a heap
+ * or loser tree for the merge pass.
*/
static void
beginmerge(Tuplesortstate *state)
@@ -2276,7 +2320,10 @@ beginmerge(Tuplesortstate *state)
}
}
- tuplesort_heap_build(state);
+ if (state->useLoserTree)
+ tuplesort_loser_tree_build(state);
+ else
+ tuplesort_heap_build(state);
}
/*
@@ -2826,6 +2873,95 @@ tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
memtuples[i] = *tuple;
}
+/*
+ * Build a valid loser tree from memtuples[].
+ */
+static void
+tuplesort_loser_tree_build(Tuplesortstate *state)
+{
+ SortTuple *memtuples = state->memtuples;
+ int k = state->memtupcount;
+ int16 winners[MAXORDER];
+ int i;
+
+ Assert(state->losers != NULL);
+ Assert(k <= MAXORDER);
+
+ /* this can happen? */
+ if (unlikely(k <= 0))
+ return;
+
+ /* 1-way merge, losers[0] is always 0 */
+ if (k == 1)
+ {
+ state->losers[0] = 0;
+ return;
+ }
+
+ /* fill the losers array */
+ for (i = k - 1; i > 0; i--)
+ {
+ int l = i * 2;
+ int r = l + 1;
+ int ll = (l >= k ? l - k : winners[l]);
+ int rr = (r >= k ? r - k : winners[r]);
+
+ if (COMPARETUP(state, &memtuples[ll], &memtuples[rr]) < 0)
+ {
+ winners[i] = ll;
+ state->losers[i] = rr;
+ }
+ else
+ {
+ winners[i] = rr;
+ state->losers[i] = ll;
+ }
+ }
+ state->losers[0] = winners[1];
+}
+
+/*
+ * Adjust the loser tree to get the new winner. Caller must have already
+ * pulled the next tuple from the last winner's input tape, and placed it
+ * to memtuples[losers[0]].
+ */
+static void
+tuplesort_loser_tree_adjust(Tuplesortstate *state)
+{
+ SortTuple *memtuples = state->memtuples;
+ int k = state->memtupcount;
+ int winner = state->losers[0];
+ int i = (winner + k) / 2;
+
+ Assert(state->losers != NULL);
+ Assert(k > 0 && k <= MAXORDER);
+ Assert(winner >= 0 && winner < k);
+
+ /*
+ * Reach end of run on the tape, set winner to LOSER_TREE_EOF so that
+ * it's always a loser in comparison.
+ */
+ if (memtuples[winner].srctape == LOSER_TREE_EOF)
+ winner = LOSER_TREE_EOF;
+
+ for (; i > 0; i /= 2)
+ {
+ int loser = state->losers[i];
+
+ /* LOSER_TREE_EOF is always a loser */
+ if (loser == LOSER_TREE_EOF)
+ continue;
+
+ if (winner == LOSER_TREE_EOF ||
+ COMPARETUP(state, &memtuples[winner], &memtuples[loser]) > 0)
+ {
+ state->losers[i] = winner;
+ winner = loser;
+ }
+ }
+ state->losers[0] = winner;
+}
+
/*
* Function to reverse the sort direction from its current state
*
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index f21ec37da89..8c22eed716b 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -324,6 +324,7 @@ extern PGDLLIMPORT int tcp_user_timeout;
extern PGDLLIMPORT char *role_string;
extern PGDLLIMPORT bool in_hot_standby_guc;
extern PGDLLIMPORT bool trace_sort;
+extern PGDLLIMPORT bool enable_loser_tree;
#ifdef DEBUG_BOUNDED_SORT
extern PGDLLIMPORT bool optimize_bounded_sort;
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 0411db832f1..b492e9de6fd 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -159,6 +159,7 @@ select name, setting from pg_settings where name like 'enable%';
enable_incremental_sort | on
enable_indexonlyscan | on
enable_indexscan | on
+ enable_loser_tree | off
enable_material | on
enable_memoize | on
enable_mergejoin | on
@@ -173,7 +174,7 @@ select name, setting from pg_settings where name like 'enable%';
enable_seqscan | on
enable_sort | on
enable_tidscan | on
-(25 rows)
+(26 rows)
-- There are always wait event descriptions for various types. InjectionPoint
-- may be present or absent, depending on history since last postmaster start.
--
2.52.0