From b62d0d39ceda5b8ea60da800318e81ff62611071 Mon Sep 17 00:00:00 2001
From: James Hunter
Date: Wed, 26 Feb 2025 00:58:28 +0000
Subject: [PATCH 3/5] Add "workmem" estimate to Path and Plan nodes

To allow future optimizers to make decisions at Path time, this commit
aggregates the Path's total working memory onto the Path's "workmem"
field, normalized to a minimum of 64 KB and rounded up to the next whole
KB.

To allow future hooks to override ExecAssignWorkMem(), this commit then
breaks that total working memory into per-data-structure working memory,
on the Plan.
---
 src/backend/executor/nodeHash.c         |  13 +-
 src/backend/nodes/tidbitmap.c           |  18 ++
 src/backend/optimizer/path/costsize.c   | 387 ++++++++++++++++++++++--
 src/backend/optimizer/plan/createplan.c | 215 +++++++++++--
 src/backend/optimizer/prep/prepagg.c    |  12 +
 src/backend/optimizer/util/pathnode.c   |  53 +++-
 src/include/executor/nodeHash.h         |   3 +-
 src/include/nodes/pathnodes.h           |   5 +
 src/include/nodes/plannodes.h           |   7 +-
 src/include/nodes/primnodes.h           |   3 +
 src/include/nodes/tidbitmap.h           |   1 +
 src/include/optimizer/cost.h            |  12 +-
 src/include/optimizer/planmain.h        |   2 +-
 13 files changed, 672 insertions(+), 59 deletions(-)

diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index aee3c9ea67c..3f60f6305bd 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -35,6 +35,7 @@
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
 #include "miscadmin.h"
+#include "optimizer/cost.h"
 #include "port/pg_bitutils.h"
 #include "utils/dynahash.h"
 #include "utils/guc.h"
@@ -454,6 +455,7 @@ ExecHashTableCreate(HashState *state)
 	int			nbuckets;
 	int			nbatch;
 	double		rows;
+	int			workmem;		/* ignored */
 	int			num_skew_mcvs;
 	int			log2_nbuckets;
 	MemoryContext oldcxt;
@@ -483,7 +485,7 @@ ExecHashTableCreate(HashState *state)
 							state->parallel_state->nparticipants - 1 : 0,
 							worker_space_allowed,
 							&space_allowed,
-							&nbuckets, &nbatch, &num_skew_mcvs);
+							&nbuckets, &nbatch, &num_skew_mcvs, &workmem);
 
 	/* nbuckets must be a power of 2 */
 	log2_nbuckets = my_log2(nbuckets);
@@ -669,7 +671,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 						size_t *total_space_allowed,
 						int *numbuckets,
 						int *numbatches,
-						int *num_skew_mcvs)
+						int *num_skew_mcvs,
+						int *workmem)
 {
 	int			tupsize;
 	double		inner_rel_bytes;
@@ -800,6 +803,9 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	 * the required bucket headers, we will need multiple batches.
 	 */
 	bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
+
+	*workmem = normalize_workmem(inner_rel_bytes + bucket_bytes);
+
 	if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
 	{
 		/* We'll need multiple batches */
@@ -820,7 +826,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 								total_space_allowed,
 								numbuckets,
 								numbatches,
-								num_skew_mcvs);
+								num_skew_mcvs,
+								workmem);
 		return;
 	}
diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c
index 3d835024caa..ac4c6b67350 100644
--- a/src/backend/nodes/tidbitmap.c
+++ b/src/backend/nodes/tidbitmap.c
@@ -1554,6 +1554,24 @@ tbm_calculate_entries(Size maxbytes)
 	return (int) nbuckets;
 }
 
+/*
+ * tbm_calculate_bytes
+ *
+ * Estimate number of bytes needed to store maxentries hashtable entries.
+ *
+ * This function is the inverse of tbm_calculate_entries(), and is used to
+ * estimate a work_mem limit, based on cardinality.
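+ *
+ * Each entry is costed as sizeof(PagetableEntry) plus two pointers of
+ * bucket-array overhead, matching the per-entry cost that
+ * tbm_calculate_entries() assumes in the other direction.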
+ */ +double +tbm_calculate_bytes(double maxentries) +{ + maxentries = Min(maxentries, INT_MAX - 1); /* safety limit */ + maxentries = Max(maxentries, 16); /* sanity limit */ + + return maxentries * (sizeof(PagetableEntry) + sizeof(Pointer) + + sizeof(Pointer)); +} + /* * Create a shared or private bitmap iterator and start iteration. * diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 04360f45760..b455721fcb7 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -104,6 +104,7 @@ #include "optimizer/plancat.h" #include "optimizer/restrictinfo.h" #include "parser/parsetree.h" +#include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/selfuncs.h" #include "utils/spccache.h" @@ -200,9 +201,14 @@ static Cost append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers); static void set_rel_width(PlannerInfo *root, RelOptInfo *rel); static int32 get_expr_width(PlannerInfo *root, const Node *expr); -static double relation_byte_size(double tuples, int width); static double page_size(double tuples, int width); static double get_parallel_divisor(Path *path); +static void compute_sort_output_sizes(double input_tuples, int input_width, + double limit_tuples, + double *output_tuples, + double *output_bytes); +static double compute_bitmap_workmem(RelOptInfo *baserel, Path *bitmapqual, + Cardinality max_ancestor_rows); /* @@ -1112,6 +1118,18 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, path->disabled_nodes = enable_bitmapscan ? 0 : 1; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; + + + /* + * Set an overall working-memory estimate for the entire BitmapHeapPath -- + * including all of the IndexPaths and BitmapOrPaths in its bitmapqual. + * + * (When we convert this path into a BitmapHeapScan plan, we'll break this + * overall estimate down into per-node estimates, just as we do for + * AggPaths.) + */ + path->workmem = compute_bitmap_workmem(baserel, bitmapqual, + 0.0 /* max_ancestor_rows */ ); } /* @@ -1587,6 +1605,16 @@ cost_functionscan(Path *path, PlannerInfo *root, path->disabled_nodes = 0; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; + + /* + * Per "XXX" comment above, this workmem estimate is likely to be wrong, + * because the "rows" estimate is pretty phony. Report the estimate + * anyway, for completeness. (This is at least better than saying it won't + * use *any* working memory.) + */ + path->workmem = list_length(rte->functions) * + normalize_workmem(relation_byte_size(path->rows, + path->pathtarget->width)); } /* @@ -1644,6 +1672,16 @@ cost_tablefuncscan(Path *path, PlannerInfo *root, path->disabled_nodes = 0; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; + + /* + * Per "XXX" comment above, this workmem estimate is likely to be wrong, + * because the "rows" estimate is pretty phony. Report the estimate + * anyway, for completeness. (This is at least better than saying it won't + * use *any* working memory.) 
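	 *
	 * The estimate assumes the entire result set is materialized at once, in
	 * a single tuplestore.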
+ */ + path->workmem = + normalize_workmem(relation_byte_size(path->rows, + path->pathtarget->width)); } /* @@ -1740,6 +1778,9 @@ cost_ctescan(Path *path, PlannerInfo *root, path->disabled_nodes = 0; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; + path->workmem = + normalize_workmem(relation_byte_size(path->rows, + path->pathtarget->width)); } /* @@ -1823,7 +1864,7 @@ cost_resultscan(Path *path, PlannerInfo *root, * We are given Paths for the nonrecursive and recursive terms. */ void -cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) +cost_recursive_union(RecursiveUnionPath *runion, Path *nrterm, Path *rterm) { Cost startup_cost; Cost total_cost; @@ -1850,12 +1891,37 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) */ total_cost += cpu_tuple_cost * total_rows; - runion->disabled_nodes = nrterm->disabled_nodes + rterm->disabled_nodes; - runion->startup_cost = startup_cost; - runion->total_cost = total_cost; - runion->rows = total_rows; - runion->pathtarget->width = Max(nrterm->pathtarget->width, - rterm->pathtarget->width); + runion->path.disabled_nodes = nrterm->disabled_nodes + rterm->disabled_nodes; + runion->path.startup_cost = startup_cost; + runion->path.total_cost = total_cost; + runion->path.rows = total_rows; + runion->path.pathtarget->width = Max(nrterm->pathtarget->width, + rterm->pathtarget->width); + + /* + * Include memory for working and intermediate tables. Since we'll + * repeatedly swap the two tables, use 2x whichever is larger as our + * estimate. + */ + runion->path.workmem = + normalize_workmem( + Max(relation_byte_size(nrterm->rows, + nrterm->pathtarget->width), + relation_byte_size(rterm->rows, + rterm->pathtarget->width)) + * 2); + + if (list_length(runion->distinctList) > 0) + { + /* Also include memory for hash table. */ + Size hashentrysize; + + hashentrysize = MAXALIGN(runion->path.pathtarget->width) + + MAXALIGN(SizeofMinimalTupleHeader); + + runion->path.workmem += + normalize_workmem(runion->numGroups * hashentrysize); + } } /* @@ -1895,7 +1961,7 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound */ static void -cost_tuplesort(Cost *startup_cost, Cost *run_cost, +cost_tuplesort(Cost *startup_cost, Cost *run_cost, Cost *nbytes, double tuples, int width, Cost comparison_cost, int sort_mem, double limit_tuples) @@ -1915,17 +1981,8 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost, /* Include the default cost-per-comparison */ comparison_cost += 2.0 * cpu_operator_cost; - /* Do we have a useful LIMIT? */ - if (limit_tuples > 0 && limit_tuples < tuples) - { - output_tuples = limit_tuples; - output_bytes = relation_byte_size(output_tuples, width); - } - else - { - output_tuples = tuples; - output_bytes = input_bytes; - } + compute_sort_output_sizes(tuples, width, limit_tuples, + &output_tuples, &output_bytes); if (output_bytes > sort_mem_bytes) { @@ -1982,6 +2039,7 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost, * counting the LIMIT otherwise. */ *run_cost = cpu_operator_cost * tuples; + *nbytes = output_bytes; } /* @@ -2011,6 +2069,7 @@ cost_incremental_sort(Path *path, input_groups; Cost group_startup_cost, group_run_cost, + group_nbytes, group_input_run_cost; List *presortedExprs = NIL; ListCell *l; @@ -2085,7 +2144,7 @@ cost_incremental_sort(Path *path, * Estimate the average cost of sorting of one group where presorted keys * are equal. 
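+	 * cost_tuplesort() also reports, via "group_nbytes", the estimated
+	 * in-memory size of one such group; we use that below to estimate this
+	 * node's working memory.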
 */
-	cost_tuplesort(&group_startup_cost, &group_run_cost,
+	cost_tuplesort(&group_startup_cost, &group_run_cost, &group_nbytes,
 				   group_tuples, width, comparison_cost, sort_mem,
 				   limit_tuples);
@@ -2126,6 +2185,14 @@ cost_incremental_sort(Path *path,
 
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
+
+	/*
+	 * Incremental sort switches between two Tuplesortstates: one that sorts
+	 * all columns ("full"), and one that sorts only the suffix columns
+	 * ("prefix"). We'll assume they're both around the same size: large
+	 * enough to hold one sort group.
+	 */
+	path->workmem = normalize_workmem(group_nbytes * 2.0);
 }
 
 /*
@@ -2150,8 +2217,9 @@ cost_sort(Path *path, PlannerInfo *root,
 {
 	Cost		startup_cost;
 	Cost		run_cost;
+	Cost		nbytes;
 
-	cost_tuplesort(&startup_cost, &run_cost,
+	cost_tuplesort(&startup_cost, &run_cost, &nbytes,
 				   tuples, width,
 				   comparison_cost, sort_mem,
 				   limit_tuples);
@@ -2162,6 +2230,7 @@ cost_sort(Path *path, PlannerInfo *root,
 	path->disabled_nodes = input_disabled_nodes + (enable_sort ? 0 : 1);
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
+	path->workmem = normalize_workmem(nbytes);
 }
 
 /*
@@ -2522,6 +2591,7 @@ cost_material(Path *path,
 	path->disabled_nodes = input_disabled_nodes + (enable_material ? 0 : 1);
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
+	path->workmem = normalize_workmem(nbytes);
 }
 
 /*
@@ -2592,6 +2662,9 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath,
 	if ((estinfo.flags & SELFLAG_USED_DEFAULT) != 0)
 		ndistinct = calls;
 
+	/* How much working memory would we need to store every distinct tuple? */
+	mpath->path.workmem = normalize_workmem(ndistinct * est_entry_bytes);
+
 	/*
 	 * Since we've already estimated the maximum number of entries we can
 	 * store at once and know the estimated number of distinct values we'll be
@@ -2867,6 +2940,19 @@ cost_agg(Path *path, PlannerInfo *root,
 	path->disabled_nodes = disabled_nodes;
 	path->startup_cost = startup_cost;
 	path->total_cost = total_cost;
+
+	/* Include memory needed to produce output. */
+	path->workmem =
+		compute_agg_output_workmem(root, aggstrategy, numGroups,
+								   aggcosts->transitionSpace, input_tuples,
+								   input_width, false /* cost_sort */ );
+
+	/* Also include memory needed to sort inputs (if needed): */
+	if (aggcosts->numSorts > 0)
+	{
+		path->workmem += (double) aggcosts->numSorts *
+			compute_agg_input_workmem(input_tuples, input_width);
+	}
 }
 
 /*
@@ -3101,7 +3187,7 @@ cost_windowagg(Path *path, PlannerInfo *root,
 			   List *windowFuncs, WindowClause *winclause,
 			   int input_disabled_nodes,
 			   Cost input_startup_cost, Cost input_total_cost,
-			   double input_tuples)
+			   double input_tuples, int width)
 {
 	Cost		startup_cost;
 	Cost		total_cost;
@@ -3183,6 +3269,11 @@ cost_windowagg(Path *path, PlannerInfo *root,
 	if (startup_tuples > 1.0)
 		path->startup_cost += (total_cost - startup_cost) / input_tuples *
 			(startup_tuples - 1.0);
+
+	/* We need to store a window of size "startup_tuples" in a Tuplestore.
*/ + path->workmem = + normalize_workmem(relation_byte_size(startup_tuples, width)); } /* @@ -3337,6 +3428,7 @@ initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace, workspace->total_cost = startup_cost + run_cost; /* Save private data for final_cost_nestloop */ workspace->run_cost = run_cost; + workspace->workmem = 0; } /* @@ -3800,6 +3892,14 @@ initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace, workspace->total_cost = startup_cost + run_cost + inner_run_cost; /* Save private data for final_cost_mergejoin */ workspace->run_cost = run_cost; + + /* + * By itself, Merge Join requires no working memory. If it adds one or + * more Sort or Material nodes, we'll track their working memory when we + * create them, inside createplan.c. + */ + workspace->workmem = 0; + workspace->inner_run_cost = inner_run_cost; workspace->outer_rows = outer_rows; workspace->inner_rows = inner_rows; @@ -4171,6 +4271,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace, double outer_path_rows = outer_path->rows; double inner_path_rows = inner_path->rows; double inner_path_rows_total = inner_path_rows; + int workmem; int num_hashclauses = list_length(hashclauses); int numbuckets; int numbatches; @@ -4229,7 +4330,8 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace, &space_allowed, &numbuckets, &numbatches, - &num_skew_mcvs); + &num_skew_mcvs, + &workmem); /* * If inner relation is too big then we will need to "batch" the join, @@ -4260,6 +4362,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace, workspace->numbuckets = numbuckets; workspace->numbatches = numbatches; workspace->inner_rows_total = inner_path_rows_total; + workspace->workmem = workmem; } /* @@ -4268,8 +4371,8 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace, * * Note: the numbatches estimate is also saved into 'path' for use later * - * 'path' is already filled in except for the rows and cost fields and - * num_batches + * 'path' is already filled in except for the rows and cost fields, + * num_batches, and workmem * 'workspace' is the result from initial_cost_hashjoin * 'extra' contains miscellaneous information about the join */ @@ -4286,6 +4389,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, List *hashclauses = path->path_hashclauses; Cost startup_cost = workspace->startup_cost; Cost run_cost = workspace->run_cost; + int workmem = workspace->workmem; int numbuckets = workspace->numbuckets; int numbatches = workspace->numbatches; Cost cpu_per_tuple; @@ -4512,6 +4616,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, path->jpath.path.startup_cost = startup_cost; path->jpath.path.total_cost = startup_cost + run_cost; + path->jpath.path.workmem = workmem; } @@ -4534,6 +4639,9 @@ cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan) if (subplan->useHashTable) { + long nbuckets; + Size hashentrysize; + /* * If we are using a hash table for the subquery outputs, then the * cost of evaluating the query is a one-time cost. We charge one @@ -4543,6 +4651,37 @@ cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan) sp_cost.startup += plan->total_cost + cpu_operator_cost * plan->plan_rows; + /* + * Estimate working memory needed for the hashtable (and hashnulls, if + * needed). The logic below MUST match the logic in buildSubPlanHash() + * and ExecInitSubPlan(). 
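+		 *
+		 * For example, 1000 rows of width 32 are costed as roughly 1000 *
+		 * (MAXALIGN(32) + MAXALIGN(SizeofMinimalTupleHeader)) bytes, before
+		 * normalize_workmem() applies its 64 KB floor and whole-KB rounding.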
+		 */
+		nbuckets = clamp_cardinality_to_long(plan->plan_rows);
+		if (nbuckets < 1)
+			nbuckets = 1;
+
+		hashentrysize = MAXALIGN(plan->plan_width) +
+			MAXALIGN(SizeofMinimalTupleHeader);
+
+		subplan->hashtab_workmem =
+			normalize_workmem((double) nbuckets * hashentrysize);
+
+		if (!subplan->unknownEqFalse)
+		{
+			/* Also needs a hashnulls table. */
+			if (IsA(subplan->testexpr, OpExpr))
+				nbuckets = 1;	/* there can be only one entry */
+			else
+			{
+				nbuckets /= 16;
+				if (nbuckets < 1)
+					nbuckets = 1;
+			}
+
+			subplan->hashnul_workmem =
+				normalize_workmem((double) nbuckets * hashentrysize);
+		}
+
 		/*
 		 * The per-tuple costs include the cost of evaluating the lefthand
 		 * expressions, plus the cost of probing the hashtable.  We already
@@ -6426,7 +6565,7 @@ get_expr_width(PlannerInfo *root, const Node *expr)
  * Estimate the storage space in bytes for a given number of tuples
 * of a given width (size in bytes).
 */
-static double
+double
 relation_byte_size(double tuples, int width)
 {
 	return tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader));
@@ -6605,3 +6744,197 @@ compute_gather_rows(Path *path)
 
 	return clamp_row_est(path->rows * get_parallel_divisor(path));
 }
+
+/*
+ * compute_sort_output_sizes
+ *	  Estimate the number of rows and amount of memory needed to hold a
+ *	  Sort operator's output
+ */
+static void
+compute_sort_output_sizes(double input_tuples, int input_width,
+						  double limit_tuples,
+						  double *output_tuples, double *output_bytes)
+{
+	/*
+	 * We want to be sure the cost of a sort is never estimated as zero, even
+	 * if passed-in tuple count is zero. Besides, mustn't do log(0)...
+	 */
+	if (input_tuples < 2.0)
+		input_tuples = 2.0;
+
+	/* Do we have a useful LIMIT? */
+	if (limit_tuples > 0 && limit_tuples < input_tuples)
+		*output_tuples = limit_tuples;
+	else
+		*output_tuples = input_tuples;
+
+	*output_bytes = relation_byte_size(*output_tuples, input_width);
+}
+
+/*
+ * compute_agg_input_workmem
+ *	  Estimate memory (in KB) needed to hold a sort buffer for an
+ *	  aggregate's input
+ *
+ * Some aggregates involve DISTINCT or ORDER BY, so they need to sort their
+ * input before they can process it. We need one sort buffer per such
+ * aggregate, and this function returns that sort buffer's (estimated) size
+ * (in KB).
+ */
+int
+compute_agg_input_workmem(double input_tuples, double input_width)
+{
+	/* Account for size of one buffer needed to sort the input. */
+	return normalize_workmem(input_tuples * input_width);
+}
+
+/*
+ * compute_agg_output_workmem
+ *	  Estimate amount of memory needed (in KB) to hold an aggregate's output
+ *
+ * In a Hash aggregate, we need space for the hash table that holds the
+ * aggregated data.
+ *
+ * Sort aggregates require output space only if they are part of a Grouping
+ * Sets chain: the first aggregate writes to its "sort_out" buffer, which the
+ * second aggregate uses as its "sort_in" buffer, and sorts.
+ *
+ * In the latter case, the "Path" code already costs the sort by calling
+ * cost_sort(), so it passes "cost_sort = false" to this function, to avoid
+ * double-counting.
+ */
+int
+compute_agg_output_workmem(PlannerInfo *root, AggStrategy aggstrategy,
+						   double numGroups, uint64 transitionSpace,
+						   double input_tuples, double input_width,
+						   bool cost_sort)
+{
+	/* Account for size of hash table to hold the output. */
+	if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED)
+	{
+		double		hashentrysize;
+
+		hashentrysize = hash_agg_entry_size(list_length(root->aggtransinfos),
+											input_width, transitionSpace);
+		return normalize_workmem(numGroups * hashentrysize);
+	}
+
+	/* Account for the size of the "sort_out" buffer. */
+	if (cost_sort && aggstrategy == AGG_SORTED)
+	{
+		double		output_tuples;	/* ignored */
+		double		output_bytes;
+
+		Assert(aggstrategy == AGG_SORTED);
+
+		compute_sort_output_sizes(numGroups, input_width,
+								  0.0 /* limit_tuples */ ,
+								  &output_tuples, &output_bytes);
+		return normalize_workmem(output_bytes);
+	}
+
+	return 0;
+}
+
+/*
+ * compute_bitmap_workmem
+ *	  Estimate total working memory (in KB) needed by bitmapqual
+ *
+ * Although we don't fill in the workmem or rows fields on the bitmapqual's
+ * paths, we fill them in on the owning BitmapHeapPath. This function
+ * estimates the total work_mem needed by all BitmapOrPaths and IndexPaths
+ * inside bitmapqual.
+ */
+static double
+compute_bitmap_workmem(RelOptInfo *baserel, Path *bitmapqual,
+					   Cardinality max_ancestor_rows)
+{
+	double		workmem = 0.0;
+	Cost		cost;			/* not used */
+	Selectivity selec;
+	Cardinality plan_rows;
+
+	/* How many rows will this node output? */
+	cost_bitmap_tree_node(bitmapqual, &cost, &selec);
+	plan_rows = clamp_row_est(selec * baserel->tuples);
+
+	/*
+	 * At runtime, we'll reuse the left-most child's TID bitmap. Let that
+	 * child know to request enough working memory to hold all its ancestors'
+	 * results.
+	 */
+	max_ancestor_rows = Max(max_ancestor_rows, plan_rows);
+
+	if (IsA(bitmapqual, BitmapAndPath))
+	{
+		BitmapAndPath *apath = (BitmapAndPath *) bitmapqual;
+		ListCell   *l;
+
+		foreach(l, apath->bitmapquals)
+		{
+			workmem +=
+				compute_bitmap_workmem(baserel, (Path *) lfirst(l),
+									   foreach_current_index(l) == 0 ?
+									   max_ancestor_rows : 0.0);
+		}
+	}
+	else if (IsA(bitmapqual, BitmapOrPath))
+	{
+		BitmapOrPath *opath = (BitmapOrPath *) bitmapqual;
+		ListCell   *l;
+
+		foreach(l, opath->bitmapquals)
+		{
+			workmem +=
+				compute_bitmap_workmem(baserel, (Path *) lfirst(l),
									   foreach_current_index(l) == 0 ?
+									   max_ancestor_rows : 0.0);
+		}
+	}
+	else if (IsA(bitmapqual, IndexPath))
+	{
+		/* Working memory needed for 1 TID bitmap. */
+		workmem +=
+			normalize_workmem(tbm_calculate_bytes(max_ancestor_rows));
+	}
+
+	return workmem;
+}
+
+/*
+ * normalize_workmem
+ *	  Convert a working-memory estimate in bytes (double) to a value in KB
+ *	  (int)
+ *
+ * Normalizes to a minimum of 64 (KB), rounding up to the next whole KB.
+ */
+int
+normalize_workmem(double nbytes)
+{
+	double		workmem;
+
+	/*
+	 * We'll assign working memory to SQL operators in 1 KB increments, so
+	 * round up to the next whole KB.
+	 */
+	workmem = ceil(nbytes / 1024.0);
+
+	/*
+	 * Although some components can probably work with < 64 KB of working
+	 * memory, PostgreSQL has imposed a hard minimum of 64 KB on the
+	 * "work_mem" GUC, for a long time; so, by now, some components probably
+	 * rely on this minimum, implicitly, and would fail if we tried to assign
+	 * them < 64 KB.
+	 *
+	 * Perhaps this minimum can be relaxed, in the future; but memory sizes
+	 * keep increasing, and right now the minimum of 64 KB = 1.6 percent of
+	 * the default "work_mem" of 4 MB.
+	 *
+	 * So, even with this (overly?) cautious normalization, with the default
+	 * GUC settings, we can still achieve a working-memory reduction of
+	 * 64-to-1.
+	 */
+	workmem = Max((double) 64, workmem);
+
+	/* And clamp to MAX_KILOBYTES. */
+	workmem = Min(workmem, (double) MAX_KILOBYTES);
+
+	return (int) workmem;
+}
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 816a2b2a576..973b86371ef 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -130,6 +130,7 @@ static BitmapHeapScan *create_bitmap_scan_plan(PlannerInfo *root,
 											   BitmapHeapPath *best_path,
 											   List *tlist, List *scan_clauses);
 static Plan *create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual,
+								   Cardinality max_ancestor_rows,
 								   List **qual, List **indexqual, List **indexECs);
 static void bitmap_subplan_mark_shared(Plan *plan);
 static TidScan *create_tidscan_plan(PlannerInfo *root, TidPath *best_path,
@@ -1853,6 +1854,7 @@ create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags)
 								 groupCollations,
 								 NIL,
 								 NIL,
+								 0, /* numSorts */
 								 best_path->path.rows,
 								 0,
 								 subplan);
@@ -1911,6 +1913,15 @@ create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags)
 	/* Copy cost data from Path to Plan */
 	copy_generic_path_info(plan, &best_path->path);
 
+	if (IsA(plan, Unique))
+	{
+		/*
+		 * We assigned "workmem" to the Sort subplan. Clear it from the
+		 * top-level Unique node, to avoid double-counting.
+		 */
+		plan->workmem = 0;
+	}
+
 	return plan;
 }
 
@@ -2228,6 +2239,13 @@ create_incrementalsort_plan(PlannerInfo *root, IncrementalSortPath *best_path,
 
 	copy_generic_path_info(&plan->sort.plan, (Path *) best_path);
 
+	/*
+	 * IncrementalSort creates two sort buffers, whose sizes the Path's
+	 * "workmem" estimate combined into a single value. Split that value in
+	 * two now. (The Path's estimate is already in KB, so convert it back to
+	 * bytes before re-normalizing.)
+	 */
+	plan->sort.plan.workmem =
+		normalize_workmem(best_path->spath.path.workmem * 1024.0 / 2);
+
 	return plan;
 }
 
@@ -2333,12 +2351,29 @@ create_agg_plan(PlannerInfo *root, AggPath *best_path)
 										subplan->targetlist),
 					NIL,
 					NIL,
+					best_path->numSorts,
 					best_path->numGroups,
 					best_path->transitionSpace,
 					subplan);
 
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
 
+	/*
+	 * Replace the overall workmem estimate that we copied from the Path
+	 * with finer-grained estimates.
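+	 *
+	 * The Plan's "workmem" covers only the output-side data structure (for
+	 * AGG_HASHED, the hash table); memory to sort the inputs is reported
+	 * separately, in "sortWorkMem", below.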
+	 */
+	plan->plan.workmem =
+		compute_agg_output_workmem(root, plan->aggstrategy, plan->numGroups,
+								   plan->transitionSpace, subplan->plan_rows,
+								   subplan->plan_width, false /* cost_sort */ );
+
+	/* Also include estimated memory needed to sort the input: */
+	if (plan->numSorts > 0)
+	{
+		plan->sortWorkMem = compute_agg_input_workmem(subplan->plan_rows,
+													  subplan->plan_width);
+	}
+
 	return plan;
 }
 
@@ -2457,8 +2492,9 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
 		RollupData *rollup = lfirst(lc);
 		AttrNumber *new_grpColIdx;
 		Plan	   *sort_plan = NULL;
-		Plan	   *agg_plan;
+		Agg		   *agg_plan;
 		AggStrategy strat;
+		bool		cost_sort;
 
 		new_grpColIdx = remap_groupColIdx(root, rollup->groupClause);
 
@@ -2480,19 +2516,20 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
 		else
 			strat = AGG_SORTED;
 
-		agg_plan = (Plan *) make_agg(NIL,
-									 NIL,
-									 strat,
-									 AGGSPLIT_SIMPLE,
-									 list_length((List *) linitial(rollup->gsets)),
-									 new_grpColIdx,
-									 extract_grouping_ops(rollup->groupClause),
-									 extract_grouping_collations(rollup->groupClause, subplan->targetlist),
-									 rollup->gsets,
-									 NIL,
-									 rollup->numGroups,
-									 best_path->transitionSpace,
-									 sort_plan);
+		agg_plan = make_agg(NIL,
+							NIL,
+							strat,
+							AGGSPLIT_SIMPLE,
+							list_length((List *) linitial(rollup->gsets)),
+							new_grpColIdx,
+							extract_grouping_ops(rollup->groupClause),
+							extract_grouping_collations(rollup->groupClause, subplan->targetlist),
+							rollup->gsets,
+							NIL,
+							best_path->numSorts,
+							rollup->numGroups,
+							best_path->transitionSpace,
+							sort_plan);
 
 		/*
 		 * Remove stuff we don't need to avoid bloating debug output.
 		 */
 		if (sort_plan)
 		{
 			sort_plan->targetlist = NIL;
 			sort_plan->lefttree = NULL;
 		}
 
-		chain = lappend(chain, agg_plan);
+		/*
+		 * If we're an AGG_SORTED, but not the last, we need to cost the
+		 * working memory needed to produce our "sort_out" buffer.
+		 */
+		cost_sort = foreach_current_index(lc) < list_length(rollups) - 1;
+
+		/*
+		 * Although this side node doesn't need accurate cost estimates, it
+		 * does need an accurate *memory* estimate, since we'll use that
+		 * estimate at runtime to distribute working memory to it.
+		 */
+
+		/* Estimated memory needed to hold the output: */
+		agg_plan->plan.workmem =
+			compute_agg_output_workmem(root, agg_plan->aggstrategy,
+									   agg_plan->numGroups,
+									   agg_plan->transitionSpace,
+									   subplan->plan_rows,
+									   subplan->plan_width, cost_sort);
+
+		/* Also include estimated memory needed to sort the input: */
+		if (agg_plan->numSorts > 0)
+		{
+			agg_plan->sortWorkMem =
+				compute_agg_input_workmem(subplan->plan_rows,
+										  subplan->plan_width);
+		}
+
+		chain = lappend(chain, (Plan *) agg_plan);
 	}
 	}
 
@@ -2514,6 +2580,7 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
 		RollupData *rollup = linitial(rollups);
 		AttrNumber *top_grpColIdx;
 		int			numGroupCols;
+		bool		cost_sort;
 
 		top_grpColIdx = remap_groupColIdx(root, rollup->groupClause);
 
@@ -2529,12 +2596,37 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
 						extract_grouping_collations(rollup->groupClause, subplan->targetlist),
 						rollup->gsets,
 						chain,
+						best_path->numSorts,
 						rollup->numGroups,
 						best_path->transitionSpace,
 						subplan);
 
 		/* Copy cost data from Path to Plan */
 		copy_generic_path_info(&plan->plan, &best_path->path);
+
+		/*
+		 * If we're an AGG_SORTED, but not the last, we need to cost the
+		 * working memory needed to produce our "sort_out" buffer.
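+		 *
+		 * (The last AGG_SORTED rollup has no later rollup to consume a
+		 * "sort_out" buffer, so it needs no such memory.)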
+		 */
+		cost_sort = list_length(rollups) > 1;
+
+		/*
+		 * Replace the overall workmem estimate that we copied from the Path
+		 * with finer-grained estimates.
+		 */
+		plan->plan.workmem =
+			compute_agg_output_workmem(root, plan->aggstrategy, plan->numGroups,
+									   plan->transitionSpace,
+									   subplan->plan_rows, subplan->plan_width,
+									   cost_sort);
+
+		/* Also include estimated memory needed to sort the input: */
+		if (plan->numSorts > 0)
+		{
+			plan->sortWorkMem =
+				compute_agg_input_workmem(subplan->plan_rows,
+										  subplan->plan_width);
+		}
 	}
 
 	return (Plan *) plan;
 }
 
@@ -2783,6 +2875,38 @@ create_recursiveunion_plan(PlannerInfo *root, RecursiveUnionPath *best_path)
 
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
 
+	/*
+	 * Replace our overall "workmem" estimate with estimates at finer
+	 * granularity.
+	 */
+
+	/*
+	 * Include memory for working and intermediate tables. Since we'll
+	 * repeatedly swap the two tables, use the larger of the two as our
+	 * working-memory estimate.
+	 *
+	 * NOTE: The Path's "workmem" estimate is for the whole Path, but the
+	 * Plan's "workmem" estimates are *per data structure*. So, this value is
+	 * half of the corresponding Path's value.
+	 */
+	plan->plan.workmem =
+		normalize_workmem(
+						  Max(relation_byte_size(leftplan->plan_rows,
+												 leftplan->plan_width),
+							  relation_byte_size(rightplan->plan_rows,
+												 rightplan->plan_width)));
+
+	if (plan->numCols > 0)
+	{
+		/* Also include memory for hash table. */
+		Size		entrysize;
+
+		entrysize = sizeof(TupleHashEntryData) + plan->plan.plan_width;
+
+		plan->hashWorkMem =
+			normalize_workmem(plan->numGroups * entrysize);
+	}
+
 	return plan;
 }
 
@@ -3223,6 +3347,7 @@ create_bitmap_scan_plan(PlannerInfo *root,
 
 	/* Process the bitmapqual tree into a Plan tree and qual lists */
 	bitmapqualplan = create_bitmap_subplan(root, best_path->bitmapqual,
+										   0.0 /* max_ancestor_rows */ ,
 										   &bitmapqualorig, &indexquals,
 										   &indexECs);
 
@@ -3309,6 +3434,12 @@ create_bitmap_scan_plan(PlannerInfo *root,
 
 	copy_generic_path_info(&scan_plan->scan.plan, &best_path->path);
 
+	/*
+	 * We assigned "workmem" to the "bitmapqualplan" subplan. Clear it from
+	 * the top-level BitmapHeapScan node, to avoid double-counting.
+	 */
+	scan_plan->scan.plan.workmem = 0;
+
 	return scan_plan;
 }
 
@@ -3334,9 +3465,24 @@ create_bitmap_scan_plan(PlannerInfo *root,
  */
 static Plan *
 create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual,
+					  Cardinality max_ancestor_rows,
 					  List **qual, List **indexqual, List **indexECs)
 {
 	Plan	   *plan;
+	Cost		cost;			/* not used */
+	Selectivity selec;
+	Cardinality plan_rows;
+
+	/* How many rows will this node output? */
+	cost_bitmap_tree_node(bitmapqual, &cost, &selec);
+	plan_rows = clamp_row_est(selec * bitmapqual->parent->tuples);
+
+	/*
+	 * At runtime, we'll reuse the left-most child's TID bitmap. Let that
+	 * child know to request enough working memory to hold all its ancestors'
+	 * results.
+	 */
+	max_ancestor_rows = Max(max_ancestor_rows, plan_rows);
 
 	if (IsA(bitmapqual, BitmapAndPath))
 	{
@@ -3362,6 +3508,8 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual,
 		List	   *subindexEC;
 
 		subplan = create_bitmap_subplan(root, (Path *) lfirst(l),
+										foreach_current_index(l) == 0 ?
+ max_ancestor_rows : 0.0, &subqual, &subindexqual, &subindexEC); subplans = lappend(subplans, subplan); @@ -3373,8 +3521,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, plan = (Plan *) make_bitmap_and(subplans); plan->startup_cost = apath->path.startup_cost; plan->total_cost = apath->path.total_cost; - plan->plan_rows = - clamp_row_est(apath->bitmapselectivity * apath->path.parent->tuples); + plan->plan_rows = plan_rows; plan->plan_width = 0; /* meaningless */ plan->parallel_aware = false; plan->parallel_safe = apath->path.parallel_safe; @@ -3409,6 +3556,8 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, List *subindexEC; subplan = create_bitmap_subplan(root, (Path *) lfirst(l), + foreach_current_index(l) == 0 ? + max_ancestor_rows : 0.0, &subqual, &subindexqual, &subindexEC); subplans = lappend(subplans, subplan); @@ -3437,8 +3586,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, plan = (Plan *) make_bitmap_or(subplans); plan->startup_cost = opath->path.startup_cost; plan->total_cost = opath->path.total_cost; - plan->plan_rows = - clamp_row_est(opath->bitmapselectivity * opath->path.parent->tuples); + plan->plan_rows = plan_rows; plan->plan_width = 0; /* meaningless */ plan->parallel_aware = false; plan->parallel_safe = opath->path.parallel_safe; @@ -3484,8 +3632,9 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, /* and set its cost/width fields appropriately */ plan->startup_cost = 0.0; plan->total_cost = ipath->indextotalcost; - plan->plan_rows = - clamp_row_est(ipath->indexselectivity * ipath->path.parent->tuples); + plan->workmem = + normalize_workmem(tbm_calculate_bytes(max_ancestor_rows)); + plan->plan_rows = plan_rows; plan->plan_width = 0; /* meaningless */ plan->parallel_aware = false; plan->parallel_safe = ipath->path.parallel_safe; @@ -3796,6 +3945,14 @@ create_functionscan_plan(PlannerInfo *root, Path *best_path, copy_generic_path_info(&scan_plan->scan.plan, best_path); + /* + * Replace the path's total working-memory estimate with a per-function + * estimate. + */ + scan_plan->scan.plan.workmem = + normalize_workmem(relation_byte_size(scan_plan->scan.plan.plan_rows, + scan_plan->scan.plan.plan_width)); + return scan_plan; } @@ -4615,6 +4772,9 @@ create_mergejoin_plan(PlannerInfo *root, */ copy_plan_costsize(matplan, inner_plan); matplan->total_cost += cpu_operator_cost * matplan->plan_rows; + matplan->workmem = + normalize_workmem(relation_byte_size(matplan->plan_rows, + matplan->plan_width)); inner_plan = matplan; } @@ -4961,6 +5121,10 @@ create_hashjoin_plan(PlannerInfo *root, copy_generic_path_info(&join_plan->join.plan, &best_path->jpath.path); + /* Display "workmem" on the Hash subnode, not its parent HashJoin node. 
*/ + hash_plan->plan.workmem = join_plan->join.plan.workmem; + join_plan->join.plan.workmem = 0; + return join_plan; } @@ -5458,6 +5622,7 @@ copy_generic_path_info(Plan *dest, Path *src) dest->disabled_nodes = src->disabled_nodes; dest->startup_cost = src->startup_cost; dest->total_cost = src->total_cost; + dest->workmem = (int) Min(src->workmem, (double) MAX_KILOBYTES); dest->plan_rows = src->rows; dest->plan_width = src->pathtarget->width; dest->parallel_aware = src->parallel_aware; @@ -5474,6 +5639,7 @@ copy_plan_costsize(Plan *dest, Plan *src) dest->disabled_nodes = src->disabled_nodes; dest->startup_cost = src->startup_cost; dest->total_cost = src->total_cost; + dest->workmem = src->workmem; dest->plan_rows = src->plan_rows; dest->plan_width = src->plan_width; /* Assume the inserted node is not parallel-aware. */ @@ -5509,6 +5675,7 @@ label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples) limit_tuples); plan->plan.startup_cost = sort_path.startup_cost; plan->plan.total_cost = sort_path.total_cost; + plan->plan.workmem = (int) Min(sort_path.workmem, (double) MAX_KILOBYTES); plan->plan.plan_rows = lefttree->plan_rows; plan->plan.plan_width = lefttree->plan_width; plan->plan.parallel_aware = false; @@ -5540,6 +5707,8 @@ label_incrementalsort_with_costsize(PlannerInfo *root, IncrementalSort *plan, limit_tuples); plan->sort.plan.startup_cost = sort_path.startup_cost; plan->sort.plan.total_cost = sort_path.total_cost; + plan->sort.plan.workmem = (int) Min(sort_path.workmem, + (double) MAX_KILOBYTES); plan->sort.plan.plan_rows = lefttree->plan_rows; plan->sort.plan.plan_width = lefttree->plan_width; plan->sort.plan.parallel_aware = false; @@ -6673,7 +6842,7 @@ Agg * make_agg(List *tlist, List *qual, AggStrategy aggstrategy, AggSplit aggsplit, int numGroupCols, AttrNumber *grpColIdx, Oid *grpOperators, Oid *grpCollations, - List *groupingSets, List *chain, double dNumGroups, + List *groupingSets, List *chain, int numSorts, double dNumGroups, Size transitionSpace, Plan *lefttree) { Agg *node = makeNode(Agg); @@ -6689,6 +6858,8 @@ make_agg(List *tlist, List *qual, node->grpColIdx = grpColIdx; node->grpOperators = grpOperators; node->grpCollations = grpCollations; + node->numSorts = numSorts; + node->sortWorkMem = 0; /* caller will fill this */ node->numGroups = numGroups; node->transitionSpace = transitionSpace; node->aggParams = NULL; /* SS_finalize_plan() will fill this */ diff --git a/src/backend/optimizer/prep/prepagg.c b/src/backend/optimizer/prep/prepagg.c index c0a2f04a8c3..3eba364484d 100644 --- a/src/backend/optimizer/prep/prepagg.c +++ b/src/backend/optimizer/prep/prepagg.c @@ -691,5 +691,17 @@ get_agg_clause_costs(PlannerInfo *root, AggSplit aggsplit, AggClauseCosts *costs costs->finalCost.startup += argcosts.startup; costs->finalCost.per_tuple += argcosts.per_tuple; } + + /* + * How many aggrefs need to sort their input? (Each such aggref gets + * its own sort buffer. The logic here MUST match the corresponding + * logic in function build_pertrans_for_aggref().) 
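+		 *
+		 * For example, "SELECT sum(DISTINCT x), array_agg(y ORDER BY y)"
+		 * counts two sorts, assuming neither input is already presorted.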
+		 */
+		if (!AGGKIND_IS_ORDERED_SET(aggref->aggkind) &&
+			!aggref->aggpresorted &&
+			(aggref->aggdistinct || aggref->aggorder))
+		{
+			++costs->numSorts;
+		}
 	}
 }
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 93e73cb44db..c533bfb9a58 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1709,6 +1709,13 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 	pathnode->path.total_cost = subpath->total_cost + cpu_tuple_cost;
 	pathnode->path.rows = subpath->rows;
 
+	/*
+	 * For now, set workmem to the hash memory limit. Function
+	 * cost_memoize_rescan() will adjust this field later, just as it does
+	 * for the "est_entries" field.
+	 */
+	pathnode->path.workmem = normalize_workmem(get_hash_memory_limit());
+
 	return pathnode;
 }
 
@@ -1937,12 +1944,14 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 		pathnode->path.disabled_nodes = agg_path.disabled_nodes;
 		pathnode->path.startup_cost = agg_path.startup_cost;
 		pathnode->path.total_cost = agg_path.total_cost;
+		pathnode->path.workmem = agg_path.workmem;
 	}
 	else
 	{
 		pathnode->path.disabled_nodes = sort_path.disabled_nodes;
 		pathnode->path.startup_cost = sort_path.startup_cost;
 		pathnode->path.total_cost = sort_path.total_cost;
+		pathnode->path.workmem = sort_path.workmem;
 	}
 
 	rel->cheapest_unique_path = (Path *) pathnode;
@@ -2289,6 +2298,13 @@ create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel,
 	/* Cost is the same as for a regular CTE scan */
 	cost_ctescan(pathnode, root, rel, pathnode->param_info);
 
+	/*
+	 * But the working-memory estimate is 0, since a worktable scan doesn't
+	 * create its own tuplestore -- it just reuses the tuplestore already
+	 * created by the recursive union.
+	 */
+	pathnode->workmem = 0;
+
 	return pathnode;
 }
 
@@ -3283,6 +3299,7 @@ create_agg_path(PlannerInfo *root,
 
 	pathnode->aggstrategy = aggstrategy;
 	pathnode->aggsplit = aggsplit;
+	pathnode->numSorts = aggcosts ? aggcosts->numSorts : 0;
 	pathnode->numGroups = numGroups;
 	pathnode->transitionSpace = aggcosts ? aggcosts->transitionSpace : 0;
 	pathnode->groupClause = groupClause;
@@ -3333,6 +3350,8 @@ create_groupingsets_path(PlannerInfo *root,
 	ListCell   *lc;
 	bool		is_first = true;
 	bool		is_first_sort = true;
+	int			num_sort_nodes = 0;
+	double		max_sort_workmem = 0.0;
 
 	/* The topmost generated Plan node will be an Agg */
 	pathnode->path.pathtype = T_Agg;
@@ -3369,6 +3388,7 @@ create_groupingsets_path(PlannerInfo *root,
 	pathnode->path.pathkeys = NIL;
 
 	pathnode->aggstrategy = aggstrategy;
+	pathnode->numSorts = agg_costs ? agg_costs->numSorts : 0;
 	pathnode->rollups = rollups;
 	pathnode->qual = having_qual;
 	pathnode->transitionSpace = agg_costs ? agg_costs->transitionSpace : 0;
@@ -3432,6 +3452,8 @@ create_groupingsets_path(PlannerInfo *root,
 										  subpath->pathtarget->width);
 			if (!rollup->is_hashed)
 				is_first_sort = false;
+
+			pathnode->path.workmem += agg_path.workmem;
 		}
 		else
 		{
@@ -3444,6 +3466,12 @@ create_groupingsets_path(PlannerInfo *root,
 						  work_mem,
 						  -1.0);
 
+			/*
+			 * We costed sorting the previous "sort" rollup's "sort_out"
+			 * buffer. How much memory did it need?
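+			 *
+			 * Track only the maximum across rollups: the sort buffers are
+			 * reused from one rollup to the next, so we charge for at most
+			 * two of them (see the double-buffering note below).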
+ */ + max_sort_workmem = Max(max_sort_workmem, sort_path.workmem); + /* Account for cost of aggregation */ cost_agg(&agg_path, root, @@ -3457,12 +3485,17 @@ create_groupingsets_path(PlannerInfo *root, sort_path.total_cost, sort_path.rows, subpath->pathtarget->width); + + pathnode->path.workmem += agg_path.workmem; } pathnode->path.disabled_nodes += agg_path.disabled_nodes; pathnode->path.total_cost += agg_path.total_cost; pathnode->path.rows += agg_path.rows; } + + if (!rollup->is_hashed) + ++num_sort_nodes; } /* add tlist eval cost for each output row */ @@ -3470,6 +3503,17 @@ create_groupingsets_path(PlannerInfo *root, pathnode->path.total_cost += target->cost.startup + target->cost.per_tuple * pathnode->path.rows; + /* + * Include working memory needed to sort agg output. If there's only 1 + * sort rollup, then we don't need any memory. If there are 2 sort + * rollups, we need enough memory for 1 sort buffer. If there are >= 3 + * sort rollups, we need only 2 sort buffers, since we're + * double-buffering. + */ + pathnode->path.workmem += num_sort_nodes > 2 ? + max_sort_workmem * 2.0 : + max_sort_workmem; + return pathnode; } @@ -3619,7 +3663,8 @@ create_windowagg_path(PlannerInfo *root, subpath->disabled_nodes, subpath->startup_cost, subpath->total_cost, - subpath->rows); + subpath->rows, + subpath->pathtarget->width); /* add tlist eval cost for each output row */ pathnode->path.startup_cost += target->cost.startup; @@ -3744,7 +3789,11 @@ create_setop_path(PlannerInfo *root, MAXALIGN(SizeofMinimalTupleHeader); if (hashentrysize * numGroups > get_hash_memory_limit()) pathnode->path.disabled_nodes++; + + pathnode->path.workmem = + normalize_workmem(numGroups * hashentrysize); } + pathnode->path.rows = outputRows; return pathnode; @@ -3795,7 +3844,7 @@ create_recursiveunion_path(PlannerInfo *root, pathnode->wtParam = wtParam; pathnode->numGroups = numGroups; - cost_recursive_union(&pathnode->path, leftpath, rightpath); + cost_recursive_union(pathnode, leftpath, rightpath); return pathnode; } diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h index e4e9e0d1de1..6cd9bffbee5 100644 --- a/src/include/executor/nodeHash.h +++ b/src/include/executor/nodeHash.h @@ -63,7 +63,8 @@ extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, size_t *total_space_allowed, int *numbuckets, int *numbatches, - int *num_skew_mcvs); + int *num_skew_mcvs, + int *workmem); extern int ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue); extern void ExecHashEstimate(HashState *node, ParallelContext *pcxt); extern void ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt); diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index fbf05322c75..2285544396d 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -60,6 +60,7 @@ typedef struct AggClauseCosts QualCost transCost; /* total per-input-row execution costs */ QualCost finalCost; /* total per-aggregated-row costs */ Size transitionSpace; /* space for pass-by-ref transition data */ + int numSorts; /* # of required input-sort buffers */ } AggClauseCosts; /* @@ -1696,6 +1697,7 @@ typedef struct Path int disabled_nodes; /* count of disabled nodes */ Cost startup_cost; /* cost expended before fetching any tuples */ Cost total_cost; /* total cost (assuming all tuples fetched) */ + Cost workmem; /* estimated work_mem (in KB) */ /* sort ordering of path's output; a List of PathKey nodes; see above */ List *pathkeys; @@ -2290,6 +2292,7 @@ typedef 
struct AggPath Path *subpath; /* path representing input source */ AggStrategy aggstrategy; /* basic strategy, see nodes.h */ AggSplit aggsplit; /* agg-splitting mode, see nodes.h */ + int numSorts; /* number of inputs that require sorting */ Cardinality numGroups; /* estimated number of groups in input */ uint64 transitionSpace; /* for pass-by-ref transition data */ List *groupClause; /* a list of SortGroupClause's */ @@ -2331,6 +2334,7 @@ typedef struct GroupingSetsPath Path path; Path *subpath; /* path representing input source */ AggStrategy aggstrategy; /* basic strategy */ + int numSorts; /* number of inputs that require sorting */ List *rollups; /* list of RollupData */ List *qual; /* quals (HAVING quals), if any */ uint64 transitionSpace; /* for pass-by-ref transition data */ @@ -3374,6 +3378,7 @@ typedef struct JoinCostWorkspace /* Fields below here should be treated as private to costsize.c */ Cost run_cost; /* non-startup cost components */ + Cost workmem; /* estimated work_mem (in KB) */ /* private for cost_nestloop code */ Cost inner_run_cost; /* also used by cost_mergejoin code */ diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 396f7881420..e2a7a12d2a3 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -167,6 +167,8 @@ typedef struct Plan Cost startup_cost; /* total cost (assuming all tuples fetched) */ Cost total_cost; + /* estimated working memory (in KB) */ + int workmem; /* estimated work_mem (in KB) */ /* (runtime) working memory limit (in KB) */ int workmem_limit; @@ -432,6 +434,8 @@ typedef struct RecursiveUnion /* estimated number of groups in input */ long numGroups; + /* estimated work_mem for hash table (in KB) */ + int hashWorkMem; /* work_mem reserved for hash table */ int hashWorkMemLimit; } RecursiveUnion; @@ -1155,7 +1159,8 @@ typedef struct Agg /* number of inputs that require sorting */ int numSorts; - + /* estimated work_mem needed to sort each input (in KB) */ + int sortWorkMem; /* work_mem limit to sort one input (in KB) */ int sortWorkMemLimit; diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h index b932168237c..5e2e804f455 100644 --- a/src/include/nodes/primnodes.h +++ b/src/include/nodes/primnodes.h @@ -1109,6 +1109,9 @@ typedef struct SubPlan /* Estimated execution costs: */ Cost startup_cost; /* one-time setup cost */ Cost per_call_cost; /* cost for each subplan evaluation */ + /* Estimated working memory (in KB): */ + int hashtab_workmem; /* estimate for hashtable */ + int hashnul_workmem; /* estimate for hashnull */ /* (Runtime) working-memory limits (in KB): */ int hashtab_workmem_limit; /* limit for hashtable */ int hashnul_workmem_limit; /* limit for hashnulls */ diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index e185635c10b..b5c98a39af7 100644 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -108,6 +108,7 @@ extern void tbm_end_shared_iterate(TBMSharedIterator *iterator); extern TBMSharedIterator *tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp); extern int tbm_calculate_entries(Size maxbytes); +extern double tbm_calculate_bytes(double maxentries); extern TBMIterator tbm_begin_iterate(TIDBitmap *tbm, dsa_area *dsa, dsa_pointer dsp); diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 3aa3c16e442..737c553a409 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -106,7 +106,7 @@ extern void cost_namedtuplestorescan(Path *path, PlannerInfo 
*root, RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_resultscan(Path *path, PlannerInfo *root, RelOptInfo *baserel, ParamPathInfo *param_info); -extern void cost_recursive_union(Path *runion, Path *nrterm, Path *rterm); +extern void cost_recursive_union(RecursiveUnionPath *runion, Path *nrterm, Path *rterm); extern void cost_sort(Path *path, PlannerInfo *root, List *pathkeys, int disabled_nodes, Cost input_cost, double tuples, int width, @@ -139,7 +139,7 @@ extern void cost_windowagg(Path *path, PlannerInfo *root, List *windowFuncs, WindowClause *winclause, int input_disabled_nodes, Cost input_startup_cost, Cost input_total_cost, - double input_tuples); + double input_tuples, int width); extern void cost_group(Path *path, PlannerInfo *root, int numGroupCols, double numGroups, List *quals, @@ -217,9 +217,17 @@ extern void set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *re extern void set_result_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern PathTarget *set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target); +extern double relation_byte_size(double tuples, int width); extern double compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual, double loop_count, Cost *cost_p, double *tuples_p); extern double compute_gather_rows(Path *path); +extern int compute_agg_input_workmem(double input_tuples, double input_width); +extern int compute_agg_output_workmem(PlannerInfo *root, + AggStrategy aggstrategy, + double numGroups, uint64 transitionSpace, + double input_tuples, double input_width, + bool cost_sort); +extern int normalize_workmem(double nbytes); #endif /* COST_H */ diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 5a930199611..cf3694a744f 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -55,7 +55,7 @@ extern Sort *make_sort_from_sortclauses(List *sortcls, Plan *lefttree); extern Agg *make_agg(List *tlist, List *qual, AggStrategy aggstrategy, AggSplit aggsplit, int numGroupCols, AttrNumber *grpColIdx, Oid *grpOperators, Oid *grpCollations, - List *groupingSets, List *chain, double dNumGroups, + List *groupingSets, List *chain, int numSorts, double dNumGroups, Size transitionSpace, Plan *lefttree); extern Limit *make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, LimitOption limitOption, int uniqNumCols, -- 2.47.1