From 9dbf07f28d257bd6aef8bacb38076f0da9fc8a45 Mon Sep 17 00:00:00 2001
From: ChangAo Chen
Date: Fri, 5 Dec 2025 23:48:05 +0800
Subject: [PATCH v2 2/2] Support loser tree for k-way merge.

A loser tree usually needs fewer comparisons than a heap during a k-way
merge, but the heap may still be better when the cardinality of the
input data is very low.  Add a GUC 'enable_loser_tree' to control the
use of the loser tree, defaulting to off.
---
 doc/src/sgml/config.sgml                      |  14 ++
 src/backend/utils/misc/guc_parameters.dat     |   7 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/backend/utils/sort/tuplesort.c            | 218 ++++++++++++++----
 src/include/utils/guc.h                       |   1 +
 src/test/regress/expected/sysviews.out        |   3 +-
 6 files changed, 202 insertions(+), 42 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 405c9689bd0..031591bb820 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -5617,6 +5617,20 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable
       </para>
      </listitem>
     </varlistentry>
 
+     <varlistentry id="guc-enable-loser-tree" xreflabel="enable_loser_tree">
+      <term><varname>enable_loser_tree</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>enable_loser_tree</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Enables or disables the use of a loser tree for external k-way
+        merges.  The default is <literal>off</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
     <varlistentry id="guc-enable-material" xreflabel="enable_material">
      <term><varname>enable_material</varname> (<type>boolean</type>)
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index 3b9d8349078..cbece911495 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -882,6 +882,13 @@
   boot_val => 'true',
 },
 
+{ name => 'enable_loser_tree', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
+  short_desc => 'Enables the use of a loser tree for external k-way merges.',
+  flags => 'GUC_EXPLAIN',
+  variable => 'enable_loser_tree',
+  boot_val => 'false',
+},
+
 { name => 'enable_material', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
   short_desc => 'Enables the planner\'s use of materialization.',
   flags => 'GUC_EXPLAIN',
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index dc9e2255f8a..2ffba918a43 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -412,6 +412,7 @@
 #enable_incremental_sort = on
 #enable_indexscan = on
 #enable_indexonlyscan = on
+#enable_loser_tree = off
 #enable_material = on
 #enable_memoize = on
 #enable_mergejoin = on
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index e28b8cfa868..57c3d0266dc 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -42,16 +42,16 @@
  * end of the input is reached, we dump out remaining tuples in memory into
  * a final run, then merge the runs.
  *
- * When merging runs, we use a heap containing just the frontmost tuple from
- * each source run; we repeatedly output the smallest tuple and replace it
- * with the next tuple from its source tape (if any).  When the heap empties,
- * the merge is complete.  The basic merge algorithm thus needs very little
- * memory --- only M tuples for an M-way merge, and M is constrained to a
- * small number.  However, we can still make good use of our full workMem
- * allocation by pre-reading additional blocks from each source tape.  Without
- * prereading, our access pattern to the temporary file would be very erratic;
- * on average we'd read one block from each of M source tapes during the same
- * time that we're writing M blocks to the output tape, so there is no
+ * When merging runs, we use a heap or loser tree containing just the frontmost
+ * tuple from each source run; we repeatedly output the smallest tuple and
+ * replace it with the next tuple from its source tape (if any).  When the heap
+ * or loser tree empties, the merge is complete.  The basic merge algorithm thus
+ * needs very little memory --- only M tuples for an M-way merge, and M is
+ * constrained to a small number.  However, we can still make good use of our
+ * full workMem allocation by pre-reading additional blocks from each source tape.
+ * Without prereading, our access pattern to the temporary file would be very
+ * erratic; on average we'd read one block from each of M source tapes during
+ * the same time that we're writing M blocks to the output tape, so there is no
  * sequentiality of access at all, defeating the read-ahead methods used by
  * most Unix kernels.  Worse, the output tape gets written into a very random
  * sequence of blocks of the temp file, ensuring that things will be even
@@ -120,8 +120,12 @@
 #define INITIAL_MEMTUPSIZE	Max(1024, \
 								ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1)
 
+/* LOSER_TREE_EOF always loses comparisons */
+#define LOSER_TREE_EOF	-1
+
 /* GUC variables */
 bool		trace_sort = false;
+bool		enable_loser_tree = false;
 
 #ifdef DEBUG_BOUNDED_SORT
 bool		optimize_bounded_sort = true;
@@ -206,6 +210,8 @@ struct Tuplesortstate
 										 * space */
 	TupSortStatus maxSpaceStatus;	/* sort status when maxSpace was reached */
 	LogicalTapeSet *tapeset;	/* logtape.c object for tapes in a temp file */
+	bool		useLoserTree;	/* use a loser tree for the k-way merge if true */
+	int16	   *losers;			/* array of losers; losers[0] is the winner */
 
 	/*
 	 * This array holds the tuples now in sort memory.  If we are in state
@@ -468,6 +474,8 @@ static void tuplesort_sort_memtuples(Tuplesortstate *state);
 static void tuplesort_heap_build(Tuplesortstate *state);
 static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple);
 static void tuplesort_heap_delete_top(Tuplesortstate *state);
+static void tuplesort_loser_tree_build(Tuplesortstate *state);
+static void tuplesort_loser_tree_adjust(Tuplesortstate *state);
 static void reversedirection(Tuplesortstate *state);
 static unsigned int getlen(LogicalTape *tape, bool eofOK);
 static void markrunend(LogicalTape *tape);
@@ -682,6 +690,10 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt)
 	state->base.tuples = true;
 	state->abbrevNext = 10;
 
+	/* Use a loser tree for the k-way merge if enable_loser_tree is set */
+	state->useLoserTree = enable_loser_tree;
+	state->losers = NULL;
+
 	/*
 	 * workMem is forced to be at least 64KB, the current minimum valid value
 	 * for the work_mem GUC.  This is a defense against parallel sort callers
@@ -1652,11 +1664,12 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 			 */
 			if (state->memtupcount > 0)
 			{
-				int			srcTapeIndex = state->memtuples[0].srctape;
+				int			i = (state->useLoserTree ? state->losers[0] : 0);
+				int			srcTapeIndex = state->memtuples[i].srctape;
 				LogicalTape *srcTape = state->inputTapes[srcTapeIndex];
 				SortTuple	newtup;
 
-				*stup = state->memtuples[0];
+				*stup = state->memtuples[i];
 
 				/*
 				 * Remember the tuple we return, so that we can recycle its
@@ -1665,16 +1678,17 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 				state->lastReturnedTuple = stup->tuple;
 
 				/*
-				 * Pull next tuple from tape, and replace the returned tuple
-				 * at top of the heap with it.
+				 * Pull next tuple from tape, and adjust the heap or loser tree.
 				 */
 				if (!mergereadnext(state, srcTape, &newtup))
 				{
 					/*
 					 * If no more data, we've reached end of run on this tape.
-					 * Remove the top node from the heap.
 					 */
-					tuplesort_heap_delete_top(state);
+					if (state->useLoserTree)
+						state->memtuples[i].srctape = LOSER_TREE_EOF;
+					else
+						tuplesort_heap_delete_top(state);
 					state->nInputRuns--;
 
 					/*
@@ -1682,10 +1696,22 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 					 * anyway, but better to release the memory early.
 					 */
 					LogicalTapeClose(srcTape);
-					return true;
 				}
-				newtup.srctape = srcTapeIndex;
-				tuplesort_heap_replace_top(state, &newtup);
+				else
+				{
+					newtup.srctape = srcTapeIndex;
+					if (state->useLoserTree)
+						state->memtuples[i] = newtup;
+					else
+						tuplesort_heap_replace_top(state, &newtup);
+				}
+
+				if (state->useLoserTree)
+				{
+					tuplesort_loser_tree_adjust(state);
+					if (state->losers[0] == LOSER_TREE_EOF)
+						state->memtupcount = 0;
+				}
 				return true;
 			}
 			return false;
@@ -2066,8 +2092,8 @@ mergeruns(Tuplesortstate *state)
 		init_slab_allocator(state, 0);
 
 	/*
-	 * Allocate a new 'memtuples' array, for the heap. It will hold one tuple
-	 * from each input tape.
+	 * Allocate a new 'memtuples' array.  It will hold one tuple from each
+	 * input tape.
 	 *
 	 * We could shrink this, too, between passes in a multi-pass merge, but we
 	 * don't bother.  (The initial input tapes are still in outputTapes.  The
@@ -2078,6 +2104,14 @@ mergeruns(Tuplesortstate *state)
 										state->nOutputTapes * sizeof(SortTuple));
 	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
 
+	/* Allocate a 'losers' array if we will use a loser tree for the merge */
+	if (state->useLoserTree && state->losers == NULL)
+	{
+		state->losers = (int16 *) MemoryContextAlloc(state->base.maincontext,
+													 state->nOutputTapes * sizeof(int16));
+		USEMEM(state, GetMemoryChunkSpace(state->losers));
+	}
+
 	/*
 	 * Use all the remaining memory we have available for tape buffers among
 	 * all the input tapes.  At the beginning of each merge pass, we will
@@ -2195,62 +2229,72 @@ mergeruns(Tuplesortstate *state)
 static void
 mergeonerun(Tuplesortstate *state)
 {
+	int			i;
 	int			srcTapeIndex;
 	LogicalTape *srcTape;
 
 	/*
 	 * Start the merge by loading one tuple from each active source tape into
-	 * the heap.
+	 * the heap or loser tree.
 	 */
 	beginmerge(state);
 	Assert(state->slabAllocatorUsed);
 
 	/*
-	 * Execute merge by repeatedly extracting lowest tuple in heap, writing it
-	 * out, and replacing it with next tuple from same tape (if there is
-	 * another one).
+	 * Execute merge by repeatedly extracting the lowest tuple in the heap or
+	 * loser tree, writing it out, pulling the next tuple from the same tape
+	 * (if there is another one), and adjusting the heap or loser tree.
 	 */
 	while (state->memtupcount > 0)
 	{
 		SortTuple	stup;
 
 		/* write the tuple to destTape */
-		srcTapeIndex = state->memtuples[0].srctape;
+		i = (state->useLoserTree ? state->losers[0] : 0);
+		srcTapeIndex = state->memtuples[i].srctape;
 		srcTape = state->inputTapes[srcTapeIndex];
-		WRITETUP(state, state->destTape, &state->memtuples[0]);
+		WRITETUP(state, state->destTape, &state->memtuples[i]);
 
 		/* recycle the slot of the tuple we just wrote out, for the next read */
-		if (state->memtuples[0].tuple)
-			RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple);
+		if (state->memtuples[i].tuple)
+			RELEASE_SLAB_SLOT(state, state->memtuples[i].tuple);
 
-		/*
-		 * pull next tuple from the tape, and replace the written-out tuple in
-		 * the heap with it.
-		 */
+		/* pull next tuple from the tape, and adjust the heap or loser tree */
 		if (mergereadnext(state, srcTape, &stup))
 		{
 			stup.srctape = srcTapeIndex;
-			tuplesort_heap_replace_top(state, &stup);
+			if (state->useLoserTree)
+				state->memtuples[i] = stup;
+			else
+				tuplesort_heap_replace_top(state, &stup);
 		}
 		else
 		{
-			tuplesort_heap_delete_top(state);
+			if (state->useLoserTree)
+				state->memtuples[i].srctape = LOSER_TREE_EOF;
+			else
+				tuplesort_heap_delete_top(state);
 			state->nInputRuns--;
 		}
+
+		if (state->useLoserTree)
+		{
+			tuplesort_loser_tree_adjust(state);
+			if (state->losers[0] == LOSER_TREE_EOF)
+				state->memtupcount = 0;
+		}
 	}
 
-	/*
-	 * When the heap empties, we're done. Write an end-of-run marker on the
-	 * output tape.
-	 */
+	/* Done.  Write an end-of-run marker on the output tape. */
 	markrunend(state->destTape);
 }
 
 /*
  * beginmerge - initialize for a merge pass
  *
- * Fill the merge heap with the first tuple from each input tape.
+ * Pull the first tuple from each input tape, and build a heap or loser
+ * tree for the merge pass.
  */
 static void
 beginmerge(Tuplesortstate *state)
@@ -2276,7 +2320,10 @@ beginmerge(Tuplesortstate *state)
 		}
 	}
 
-	tuplesort_heap_build(state);
+	if (state->useLoserTree)
+		tuplesort_loser_tree_build(state);
+	else
+		tuplesort_heap_build(state);
 }
 
 /*
@@ -2826,6 +2873,95 @@ tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
 	memtuples[i] = *tuple;
}
 
+/*
+ * Build a valid loser tree from memtuples[].
+ */
+static void
+tuplesort_loser_tree_build(Tuplesortstate *state)
+{
+	SortTuple  *memtuples = state->memtuples;
+	int			k = state->memtupcount;
+	int16		winners[MAXORDER];
+	int			i;
+
+	Assert(state->losers != NULL);
+	Assert(k <= MAXORDER);
+
+	/* shouldn't happen, but be defensive */
+	if (unlikely(k <= 0))
+		return;
+
+	/* 1-way merge, losers[0] is always 0 */
+	if (k == 1)
+	{
+		state->losers[0] = 0;
+		return;
+	}
+
+	/* fill the losers array, from the last internal node up to the root */
+	for (i = k - 1; i > 0; i--)
+	{
+		int			l = i * 2;
+		int			r = l + 1;
+		int			ll = (l >= k ? l - k : winners[l]);
+		int			rr = (r >= k ? r - k : winners[r]);
+
+		if (COMPARETUP(state, &memtuples[ll], &memtuples[rr]) < 0)
+		{
+			winners[i] = ll;
+			state->losers[i] = rr;
+		}
+		else
+		{
+			winners[i] = rr;
+			state->losers[i] = ll;
+		}
+	}
+	state->losers[0] = winners[1];
+}
+
+/*
+ * Adjust the loser tree to get the new winner.  Caller must have already
+ * pulled the next tuple from the last winner's input tape, and placed it
+ * in memtuples[losers[0]].
+ */
+static void
+tuplesort_loser_tree_adjust(Tuplesortstate *state)
+{
+	SortTuple  *memtuples = state->memtuples;
+	int			k = state->memtupcount;
+	int			winner = state->losers[0];
+	int			i = (winner + k) / 2;
+
+	Assert(state->losers != NULL);
+	Assert(k > 0 && k <= MAXORDER);
+	Assert(winner >= 0 && winner < k);
+
+	/*
+	 * We've reached the end of the run on the winner's tape; set the winner
+	 * to LOSER_TREE_EOF so that it always loses comparisons.
+	 */
+	if (memtuples[winner].srctape == LOSER_TREE_EOF)
+		winner = LOSER_TREE_EOF;
+
+	for (; i > 0; i /= 2)
+	{
+		int			loser = state->losers[i];
+
+		/* LOSER_TREE_EOF is always a loser */
+		if (loser == LOSER_TREE_EOF)
+			continue;
+
+		if (winner == LOSER_TREE_EOF ||
+			COMPARETUP(state, &memtuples[winner], &memtuples[loser]) > 0)
+		{
+			state->losers[i] = winner;
+			winner = loser;
+		}
+	}
+	state->losers[0] = winner;
+}
+
 /*
  * Function to reverse the sort direction from its current state
  *
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index f21ec37da89..8c22eed716b 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -324,6 +324,7 @@ extern PGDLLIMPORT int tcp_user_timeout;
 extern PGDLLIMPORT char *role_string;
 extern PGDLLIMPORT bool in_hot_standby_guc;
 extern PGDLLIMPORT bool trace_sort;
+extern PGDLLIMPORT bool enable_loser_tree;
 
 #ifdef DEBUG_BOUNDED_SORT
 extern PGDLLIMPORT bool optimize_bounded_sort;
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 0411db832f1..b492e9de6fd 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -159,6 +159,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_incremental_sort        | on
  enable_indexonlyscan           | on
  enable_indexscan               | on
+ enable_loser_tree              | off
  enable_material                | on
  enable_memoize                 | on
  enable_mergejoin               | on
@@ -173,7 +174,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(25 rows)
+(26 rows)
 
 -- There are always wait event descriptions for various types. InjectionPoint
 -- may be present or absent, depending on history since last postmaster start.
-- 
2.52.0
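
For readers new to the data structure, below is a minimal stand-alone sketch
of the loser-tree merge scheme this patch implements, using plain int keys
instead of SortTuples.  It is an illustration only: the names (K, RUNLEN,
EOF_KEY, cur, losers, build, adjust) are invented for the example and do not
appear in tuplesort.c; end-of-run is signalled here by an INT_MAX sentinel
key rather than the patch's LOSER_TREE_EOF srctape marker; and K >= 2 is
assumed (the patch special-cases a 1-way merge).

#include <limits.h>
#include <stdio.h>

#define K       4           /* number of input runs */
#define RUNLEN  3           /* length of each run in this demo */
#define EOF_KEY INT_MAX     /* sentinel key: always loses comparisons */

static int  cur[K];         /* frontmost key of each run */
static int  losers[K];      /* internal nodes 1..K-1; losers[0] = winner */

/* Build the tree bottom-up, mirroring tuplesort_loser_tree_build(). */
static void
build(void)
{
    int     winners[K];     /* winners[i]: winner of the subtree rooted at i */

    for (int i = K - 1; i > 0; i--)
    {
        int     l = i * 2;
        int     r = l + 1;
        /* child indices >= K are leaves and denote runs l - K and r - K */
        int     ll = (l >= K) ? l - K : winners[l];
        int     rr = (r >= K) ? r - K : winners[r];

        if (cur[ll] < cur[rr])
        {
            winners[i] = ll;
            losers[i] = rr;
        }
        else
        {
            winners[i] = rr;
            losers[i] = ll;
        }
    }
    losers[0] = winners[1];
}

/*
 * Replay the single leaf-to-root path after the winner's slot has been
 * refilled, mirroring tuplesort_loser_tree_adjust(): at most one comparison
 * per tree level.
 */
static void
adjust(void)
{
    int     winner = losers[0];

    for (int i = (winner + K) / 2; i > 0; i /= 2)
    {
        if (cur[losers[i]] < cur[winner])
        {
            int     tmp = winner;

            winner = losers[i];
            losers[i] = tmp;
        }
    }
    losers[0] = winner;
}

int
main(void)
{
    int     runs[K][RUNLEN] = {
        {1, 5, 9}, {2, 6, 10}, {3, 7, 11}, {4, 8, 12}
    };
    int     pos[K] = {0};

    for (int i = 0; i < K; i++)
        cur[i] = runs[i][pos[i]++];
    build();

    while (cur[losers[0]] != EOF_KEY)
    {
        int     w = losers[0];

        printf("%d ", cur[w]);  /* emit the current minimum */
        /* refill the winner's slot from its run, or mark the run exhausted */
        cur[w] = (pos[w] < RUNLEN) ? runs[w][pos[w]++] : EOF_KEY;
        adjust();
    }
    putchar('\n');              /* prints: 1 2 3 4 5 6 7 8 9 10 11 12 */
    return 0;
}

The comparison-count argument is visible in adjust(): each extraction walks
one path of length about log2(K) and does at most one comparison per level,
while the binary-heap sift-down in tuplesort_heap_replace_top() can do up to
two per level (the two children against each other, then the new tuple
against the smaller child).  Conversely, on very low-cardinality input a
sift-down over equal keys can stop after a couple of comparisons, whereas
the loser tree always replays the full path; that matches the commit
message's caveat and the off default.  To exercise the patched path, enable
the GUC (SET enable_loser_tree = on) and force an external merge sort, for
example by sorting a large table under a small work_mem.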