From ecee0ec22432041baa830f4d98b4c5fc2d48f1f6 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 20 Oct 2022 13:03:00 +0200 Subject: [PATCH 06/10] wip: multiple watermark steps Allow incrementing the minval watermark faster, by skipping some minval values. This allows sorting more data at once (instead of many tiny sorts, which is inefficient). This also reduces the number of rows we need to spill (and possibly transfer multiple times). To use a different watermark step, use a new GUC: SET brinsort_watermark_step = 16 --- src/backend/commands/explain.c | 3 ++ src/backend/executor/nodeBrinSort.c | 59 ++++++++++++++++++++++++++--- src/backend/utils/misc/guc_tables.c | 12 ++++++ 3 files changed, 68 insertions(+), 6 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 91dcf7d6660..d84b118ac73 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -47,6 +47,7 @@ ExplainOneQuery_hook_type ExplainOneQuery_hook = NULL; /* Hook for plugins to get control in explain_get_index_name() */ explain_get_index_name_hook_type explain_get_index_name_hook = NULL; +extern int brinsort_watermark_step; /* OR-able flags for ExplainXMLTag() */ #define X_OPENING 0 @@ -2457,6 +2458,8 @@ show_brinsort_stats(BrinSortState *sortstate, List *ancestors, ExplainState *es) tuplesort_space_type_name(stats.spaceType)); } + ExplainPropertyInteger("Step", NULL, (int64) brinsort_watermark_step, es); + if (stats->sort_count > 0) { ExplainPropertyInteger("Ranges Processed", NULL, (int64) diff --git a/src/backend/executor/nodeBrinSort.c b/src/backend/executor/nodeBrinSort.c index 324b558ac6f..e96a27b3ea1 100644 --- a/src/backend/executor/nodeBrinSort.c +++ b/src/backend/executor/nodeBrinSort.c @@ -248,6 +248,14 @@ static void ExecInitBrinSortRanges(BrinSort *node, BrinSortState *planstate); bool debug_brin_sort = false; #endif +/* + * How many distinct minval values to look forward for the next watermark? + * + * The smallest step we can do is 1, which means the immediately following + * (while distinct) minval. + */ +int brinsort_watermark_step = 1; + /* do various consistency checks */ static void AssertCheckRanges(BrinSortState *node) @@ -351,11 +359,24 @@ brinsort_end_tidscan(BrinSortState *node) * a separate "first" parameter - "set=false" has the same meaning. */ static void -brinsort_update_watermark(BrinSortState *node, bool asc) +brinsort_update_watermark(BrinSortState *node, bool first, bool asc, int steps) { int cmp; + + /* assume we haven't found a watermark */ bool found = false; + Assert(steps > 0); + + /* + * If the watermark is empty, either this is the first call (in + * which case we just use the first (or rather second) value. + * Otherwise it means we've reached the end, so no point in looking + * for more watermarks. + */ + if (node->bs_watermark_empty && !first) + return; + tuplesort_markpos(node->bs_scan->ranges); while (tuplesort_gettupleslot(node->bs_scan->ranges, true, false, node->bs_scan->slot, NULL)) @@ -381,22 +402,48 @@ brinsort_update_watermark(BrinSortState *node, bool asc) else value = slot_getattr(node->bs_scan->slot, 7, &isnull); + /* + * Has to be the first call (otherwise we would not get here, because we + * terminate after bs_watermark_set gets flipped back to false), so we + * just set the value. But we don't count this as a step, because that + * just picks the first minval value, as we certainly need to do at least + * one more step. + * + * XXX Actually, do we need to make another step? Maybe there are enough + * not-summarized ranges? Although, we don't know what values are in + * those, ranges, and with increasing data we might easily end up just + * writing all of it into the spill tuplestore. So making one more step + * seems like a better idea - we'll at lest be able to produce something + * which is good for LIMIT queries. + */ if (!node->bs_watermark_set) { + Assert(first); node->bs_watermark_set = true; node->bs_watermark = value; + found = true; continue; } cmp = ApplySortComparator(node->bs_watermark, false, value, false, &node->bs_sortsupport); - if (cmp < 0) + /* + * Values should not decrease (or whatever the operator says, might + * be a DESC sort). + */ + Assert(cmp <= 0); + + if (cmp < 0) /* new watermark value */ { node->bs_watermark_set = true; node->bs_watermark = value; found = true; - break; + + steps--; + + if (steps == 0) + break; } } @@ -913,7 +960,7 @@ IndexNext(BrinSortState *node) node->bs_phase = BRINSORT_LOAD_RANGE; /* set the first watermark */ - brinsort_update_watermark(node, asc); + brinsort_update_watermark(node, true, asc, brinsort_watermark_step); } break; @@ -1034,7 +1081,7 @@ IndexNext(BrinSortState *node) { /* updte the watermark and try reading more ranges */ node->bs_phase = BRINSORT_LOAD_RANGE; - brinsort_update_watermark(node, asc); + brinsort_update_watermark(node, false, asc, brinsort_watermark_step); } break; @@ -1059,7 +1106,7 @@ IndexNext(BrinSortState *node) { brinsort_rescan(node); node->bs_phase = BRINSORT_LOAD_RANGE; - brinsort_update_watermark(node, asc); + brinsort_update_watermark(node, true, asc, brinsort_watermark_step); } else node->bs_phase = BRINSORT_FINISHED; diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 775d220fce9..c36b0175344 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -95,6 +95,7 @@ extern char *temp_tablespaces; extern bool ignore_checksum_failure; extern bool ignore_invalid_pages; extern bool synchronize_seqscans; +extern int brinsort_watermark_step; #ifdef DEBUG_BRIN_STATS extern bool debug_brin_stats; @@ -3533,6 +3534,17 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"brinsort_watermark_step", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("sets the step for brinsort watermark increments"), + NULL, + GUC_NOT_IN_SAMPLE + }, + &brinsort_watermark_step, + 1, 1, INT_MAX, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL -- 2.39.1