From 0de44f8226e9f8c56942b1b4aa8ea9fc2faf77db Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Sat, 22 Oct 2022 00:06:28 +0200 Subject: [PATCH 07/11] wip: adjust watermark step Look at available statistics - number of possible watermark values, number of rows, work_mem, etc. and pick a good watermark_step value. To calculate step using statistics, set the GUC to 0: SET brinsort_watermark_step = 0; --- src/backend/commands/explain.c | 6 +++ src/backend/executor/nodeBrinSort.c | 21 ++++---- src/backend/optimizer/plan/createplan.c | 70 +++++++++++++++++++++++++ src/backend/utils/misc/guc_tables.c | 2 +- src/include/nodes/execnodes.h | 5 ++ src/include/nodes/plannodes.h | 3 ++ 6 files changed, 94 insertions(+), 13 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index a89ee03857d..7cf42a7649f 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2440,6 +2440,7 @@ static void show_brinsort_stats(BrinSortState *sortstate, List *ancestors, ExplainState *es) { BrinSortStats *stats = &sortstate->bs_stats; + BrinSort *plan = (BrinSort *) sortstate->ss.ps.plan; if (sortstate->bs_scan != NULL && sortstate->bs_scan->ranges != NULL) @@ -2462,6 +2463,9 @@ show_brinsort_stats(BrinSortState *sortstate, List *ancestors, ExplainState *es) if (stats->sort_count > 0) { + ExplainPropertyInteger("Average Step", NULL, (int64) + stats->watermark_updates_steps / stats->watermark_updates_count, es); + ExplainPropertyInteger("Ranges Processed", NULL, (int64) stats->range_count, es); @@ -2503,6 +2507,8 @@ show_brinsort_stats(BrinSortState *sortstate, List *ancestors, ExplainState *es) ExplainCloseGroup("Sorts", "Sorts", true, es); } } + else + ExplainPropertyInteger("Initial Step", NULL, (int64) plan->watermark_step, es); if (stats->sort_count_in_memory > 0) { diff --git a/src/backend/executor/nodeBrinSort.c b/src/backend/executor/nodeBrinSort.c index 15d15797735..f8356202b77 100644 --- 
a/src/backend/executor/nodeBrinSort.c +++ b/src/backend/executor/nodeBrinSort.c @@ -248,14 +248,6 @@ static void ExecInitBrinSortRanges(BrinSort *node, BrinSortState *planstate); bool debug_brin_sort = false; #endif -/* - * How many distinct minval values to look forward for the next watermark? - * - * The smallest step we can do is 1, which means the immediately following - * (while distinct) minval. - */ -int brinsort_watermark_step = 1; - /* do various consistency checks */ static void AssertCheckRanges(BrinSortState *node) @@ -359,9 +351,11 @@ brinsort_end_tidscan(BrinSortState *node) * a separate "first" parameter - "set=false" has the same meaning. */ static void -brinsort_update_watermark(BrinSortState *node, bool first, bool asc, int steps) +brinsort_update_watermark(BrinSortState *node, bool first, bool asc) { int cmp; + BrinSort *plan = (BrinSort *) node->ss.ps.plan; + int steps = plan->watermark_step; /* assume we haven't found a watermark */ bool found = false; @@ -449,6 +443,9 @@ brinsort_update_watermark(BrinSortState *node, bool first, bool asc, int steps) tuplesort_restorepos(node->bs_scan->ranges); + node->bs_stats.watermark_updates_count++; + node->bs_stats.watermark_updates_steps += plan->watermark_step; + node->bs_watermark_empty = (!found); } @@ -960,7 +957,7 @@ IndexNext(BrinSortState *node) node->bs_phase = BRINSORT_LOAD_RANGE; /* set the first watermark */ - brinsort_update_watermark(node, true, asc, brinsort_watermark_step); + brinsort_update_watermark(node, true, asc); } break; @@ -1081,7 +1078,7 @@ IndexNext(BrinSortState *node) { /* updte the watermark and try reading more ranges */ node->bs_phase = BRINSORT_LOAD_RANGE; - brinsort_update_watermark(node, false, asc, brinsort_watermark_step); + brinsort_update_watermark(node, false, asc); } break; @@ -1106,7 +1103,7 @@ IndexNext(BrinSortState *node) { brinsort_rescan(node); node->bs_phase = BRINSORT_LOAD_RANGE; - brinsort_update_watermark(node, true, asc, brinsort_watermark_step); + 
brinsort_update_watermark(node, true, asc); } else node->bs_phase = BRINSORT_FINISHED; diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index cd2935fa011..c88337bd310 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -18,6 +18,7 @@ #include +#include "access/brin.h" #include "access/genam.h" #include "access/sysattr.h" #include "catalog/pg_class.h" @@ -323,6 +324,14 @@ static GatherMerge *create_gather_merge_plan(PlannerInfo *root, GatherMergePath *best_path); +/* + * How many distinct minval values to look forward for the next watermark? + * + * The smallest explicit step is 1, which means the immediately following + * (distinct) minval; 0 means pick the step from statistics at plan time. + */ +int brinsort_watermark_step = 0; + /* * create_plan * Creates the access plan for a query by recursively processing the @@ -3416,6 +3425,67 @@ create_brinsort_plan(PlannerInfo *root, index_close(indexRel, NoLock); + /* + * Determine the watermark step (how fast to advance the watermark). + * + * If brinsort_watermark_step is set to a non-zero value, we just use + * that value directly. Otherwise we pick a value using some simple + * heuristics - we don't want the rows to exceed work_mem, and we leave + * a bit of slack (because we're adding batches of rows, not row by row). + * + * This has a weakness, because it assumes we incrementally add the same + * number of rows into the "sort" set - but imagine very wide overlapping + * ranges (e.g. random data on the same domain). Most of them will have + * about the same minval, so the sort grows only very slowly. Only the + * very last range removes the watermark, and only then do most of + * the rows get to the tuplesort. + * + * XXX But maybe we can look at the other statistics we have, like number + * of overlaps and average range selectivity (% of tuples matching), and + * deduce something from that? + * + * XXX Could we maybe adjust the watermark step adaptively at runtime? 
+ * That is, when we get to the "sort" step, maybe check how many rows + * are there, and if there are only a few then try increasing the step? + */ + brinsort_plan->watermark_step = brinsort_watermark_step; + + if (brinsort_plan->watermark_step == 0) + { + BrinMinmaxStats *amstats; + + /* planner's row estimate for this node */ + Cardinality rows = brinsort_plan->scan.plan.plan_rows; + + /* estimate rowsize in the tuplesort */ + int width = brinsort_plan->scan.plan.plan_width; + int tupwidth = (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader)); + + /* Don't overflow work_mem (use only half, to absorb variations). */ + int maxrows = (work_mem * 1024L / tupwidth / 2); + + /* If this is a LIMIT query, aim only for the required number of rows. */ + if (root->limit_tuples > 0) + maxrows = Min(maxrows, root->limit_tuples); + + /* Use the attnum calculated above. */ + amstats = (BrinMinmaxStats *) get_attindexam(brinsort_plan->indexid, + brinsort_plan->attnums[0]); + + if (amstats) + { + double pct_per_step = Max(amstats->minval_increment_avg, + amstats->maxval_increment_avg); + double rows_per_step = Max(1.0, pct_per_step * rows); + + brinsort_plan->watermark_step = (int) (maxrows / rows_per_step); + } + + /* clamp the step to a sane range (between 1 and 8192) */ + brinsort_plan->watermark_step = Max(brinsort_plan->watermark_step, 1); + brinsort_plan->watermark_step = Min(brinsort_plan->watermark_step, 8192); + } + return brinsort_plan; } diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index fe8f9c55799..63b164edbeb 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -3542,7 +3542,7 @@ struct config_int ConfigureNamesInt[] = GUC_NOT_IN_SAMPLE }, &brinsort_watermark_step, - 1, 1, INT_MAX, + 0, 0, INT_MAX, NULL, NULL, NULL }, diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 2a98286e11a..06dc6416d99 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1664,6 +1664,10 @@ typedef struct 
BrinSortStats /* time to build ranges (milliseconds) */ int64 ranges_build_ms; + /* sum of watermark steps taken / number of watermark updates */ + int64 watermark_updates_steps; + int64 watermark_updates_count; + } BrinSortStats; typedef struct BrinSortState @@ -1696,6 +1700,7 @@ typedef struct BrinSortState BrinRangeScanDesc *bs_scan; BrinRange *bs_range; ExprState *bs_qual; + int bs_watermark_step; /* NOTE(review): not referenced by any hunk in this patch - confirm it is needed */ Datum bs_watermark; bool bs_watermark_set; bool bs_watermark_empty; diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 341dfc57826..659a6d110ee 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -531,6 +531,9 @@ typedef struct BrinSort /* NULLS FIRST/LAST directions */ bool *nullsFirst pg_node_attr(array_size(numCols)); + /* how many distinct minval values to advance per watermark update */ + int watermark_step; + } BrinSort; /* ---------------- -- 2.39.2