From 099366618d3f15f69bd9542d7d31f82148889a11 Mon Sep 17 00:00:00 2001
From: James Hunter
Date: Fri, 24 Jan 2025 20:48:39 +0000
Subject: [PATCH 1/4] EXPLAIN now takes "work_mem" option, to display
 estimated working memory

This commit adds option "WORK_MEM" to the existing EXPLAIN command. When set
to ON, the EXPLAIN output will include text of the form "(work_mem= 5.67 kB)"
on every plan node that uses working memory. The output is an *estimate*,
typically based on the estimated number of input rows for that plan node.

Normalize "working-memory" estimates to a minimum of 64 KB

The minimum possible value of the "work_mem" GUC is 64 KB. This commit
changes the tracking + output for "EXPLAIN (WORK_MEM ON)" so that it reports
a minimum of 64 KB for every node or subcomponent that requires working
memory. It also rounds "nbytes" up to the nearest whole KB (= ceil()), and
changes the EXPLAIN output to report a whole integer, rather than to two
decimal places.

Note that 1 KB = 1.6 percent of the 64 KB minimum.

To allow for future optimizers to make decisions at Path time, this commit
aggregates the Path's total working memory onto the Path's "workmem" field.
To allow the executor to restrict memory usage by individual data structure,
it then breaks that total working memory into per-data structure working
memory, on the Plan.

Also adds a "Total Working Memory" line at the bottom of the plan output.
--- src/backend/commands/explain.c | 207 ++++++++ src/backend/executor/nodeHash.c | 15 +- src/backend/nodes/tidbitmap.c | 18 + src/backend/optimizer/path/costsize.c | 387 ++++++++++++++- src/backend/optimizer/plan/createplan.c | 215 +++++++- src/backend/optimizer/prep/prepagg.c | 12 + src/backend/optimizer/util/pathnode.c | 53 +- src/include/commands/explain.h | 3 + src/include/executor/nodeHash.h | 3 +- src/include/nodes/pathnodes.h | 11 + src/include/nodes/plannodes.h | 11 + src/include/nodes/primnodes.h | 2 + src/include/nodes/tidbitmap.h | 1 + src/include/optimizer/cost.h | 12 +- src/include/optimizer/planmain.h | 2 +- src/test/regress/expected/workmem.out | 631 ++++++++++++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/sql/workmem.sql | 303 ++++++++++++ 18 files changed, 1828 insertions(+), 60 deletions(-) create mode 100644 src/test/regress/expected/workmem.out create mode 100644 src/test/regress/sql/workmem.sql diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index c0d614866a9..e09d7f868c9 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -180,6 +180,8 @@ static void ExplainJSONLineEnding(ExplainState *es); static void ExplainYAMLLineStarting(ExplainState *es); static void escape_yaml(StringInfo buf, const char *str); static SerializeMetrics GetSerializationMetrics(DestReceiver *dest); +static void compute_subplan_workmem(List *plans, double *workmem); +static void compute_agg_workmem(Agg *agg, double *workmem); @@ -235,6 +237,8 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt, } else if (strcmp(opt->defname, "memory") == 0) es->memory = defGetBoolean(opt); + else if (strcmp(opt->defname, "work_mem") == 0) + es->work_mem = defGetBoolean(opt); else if (strcmp(opt->defname, "serialize") == 0) { if (opt->arg) @@ -835,6 +839,12 @@ ExplainOnePlan(PlannedStmt *plannedstmt, CachedPlan *cplan, ExplainPropertyFloat("Execution Time", "ms", 1000.0 * totaltime, 3, es); + if 
(es->work_mem) + { + ExplainPropertyFloat("Total Working Memory", "kB", + es->total_workmem, 0, es); + } + ExplainCloseGroup("Query", NULL, true, es); } @@ -1970,6 +1980,77 @@ ExplainNode(PlanState *planstate, List *ancestors, } } + if (es->work_mem) + { + double plan_workmem = 0.0; + + /* + * Include working memory used by this Plan's SubPlan objects, whether + * they are included on the Plan's initPlan or subPlan lists. + */ + compute_subplan_workmem(planstate->initPlan, &plan_workmem); + compute_subplan_workmem(planstate->subPlan, &plan_workmem); + + /* Include working memory used by this Plan, itself. */ + switch (nodeTag(plan)) + { + case T_Agg: + compute_agg_workmem((Agg *) plan, &plan_workmem); + break; + case T_FunctionScan: + { + FunctionScan *fscan = (FunctionScan *) plan; + + plan_workmem += (double) plan->workmem * + list_length(fscan->functions); + break; + } + case T_IncrementalSort: + + /* + * IncrementalSort creates two Tuplestores, each of + * (estimated) size workmem. + */ + plan_workmem = (double) plan->workmem * 2; + break; + case T_RecursiveUnion: + { + RecursiveUnion *runion = (RecursiveUnion *) plan; + + /* + * RecursiveUnion creates two Tuplestores, each of + * (estimated) size workmem, plus (possibly) a hash table + * of size hashWorkMem. + */ + plan_workmem += (double) plan->workmem * 2 + + runion->hashWorkMem; + break; + } + default: + if (plan->workmem > 0) + plan_workmem += plan->workmem; + break; + } + + /* + * Every parallel worker (plus the leader) gets its own copy of + * working memory. + */ + plan_workmem *= (1 + es->num_workers); + + es->total_workmem += plan_workmem; + + if (plan_workmem > 0.0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfo(es->str, " (work_mem=%.0f kB)", + plan_workmem); + else + ExplainPropertyFloat("Working Memory", "kB", + plan_workmem, 0, es); + } + } + /* * We have to forcibly clean up the instrumentation state because we * haven't done ExecutorEnd yet. This is pretty grotty ... 
@@ -2536,6 +2617,20 @@ ExplainNode(PlanState *planstate, List *ancestors, if (planstate->initPlan) ExplainSubPlans(planstate->initPlan, ancestors, "InitPlan", es); + if (nodeTag(plan) == T_Gather || nodeTag(plan) == T_GatherMerge) + { + /* + * Other than initPlan-s, every node below us gets the # of planned + * workers we specified. + */ + Assert(es->num_workers == 0); + + if (nodeTag(plan) == T_Gather) + es->num_workers = ((Gather *) plan)->num_workers; + else + es->num_workers = ((GatherMerge *) plan)->num_workers; + } + /* lefttree */ if (outerPlanState(planstate)) ExplainNode(outerPlanState(planstate), ancestors, @@ -2592,6 +2687,12 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainCloseGroup("Plans", "Plans", false, es); } + if (nodeTag(plan) == T_Gather || nodeTag(plan) == T_GatherMerge) + { + /* End of parallel sub-tree. */ + es->num_workers = 0; + } + /* in text format, undo whatever indentation we added */ if (es->format == EXPLAIN_FORMAT_TEXT) es->indent = save_indent; @@ -5952,3 +6053,109 @@ GetSerializationMetrics(DestReceiver *dest) return empty; } + +/* + * compute_subplan_work_mem - compute total workmem for a SubPlan object + * + * If a SubPlan object uses a hash table, then that hash table needs working + * memory. We display that working memory on the owning Plan. This function + * increments work_mem counters to include the SubPlan's working-memory. + */ +static void +compute_subplan_workmem(List *plans, double *workmem) +{ + foreach_node(SubPlanState, sps, plans) + { + SubPlan *sp = sps->subplan; + + if (sp->hashtab_workmem > 0) + *workmem += sp->hashtab_workmem; + + if (sp->hashnul_workmem > 0) + *workmem += sp->hashnul_workmem; + } +} + +/* Compute an Agg's working memory estimate. 
*/ +typedef struct AggWorkMem +{ + double input_sort_workmem; + + double output_hash_workmem; + + int num_sort_nodes; + double max_output_sort_workmem; +} AggWorkMem; + +static void +compute_agg_workmem_node(Agg *agg, AggWorkMem * mem) +{ + /* Record memory used for input sort buffers. */ + mem->input_sort_workmem += (double) agg->numSorts * agg->sortWorkMem; + + /* Record memory used for output data structures. */ + switch (agg->aggstrategy) + { + case AGG_SORTED: + + /* We'll have at most two sort buffers alive, at any time. */ + mem->max_output_sort_workmem = + Max(mem->max_output_sort_workmem, agg->plan.workmem); + + ++mem->num_sort_nodes; + break; + case AGG_HASHED: + case AGG_MIXED: + + /* + * All hash tables created by "hash" phases are kept for the + * lifetime of the Agg. + */ + mem->output_hash_workmem += agg->plan.workmem; + break; + default: + + /* + * "Plain" phases don't use working memory (they output a single + * aggregated tuple). + */ + break; + } +} + +/* + * compute_agg_workmem - compute total workmem for an Agg node + * + * An Agg node might point to a chain of additional Agg nodes. When we explain + * the plan, we display only the first, "main" Agg node. However, to make life + * easier for the executor, we stored the estimated working memory ("workmem") + * on each individual Agg node. + * + * This function returns the combined workmem, so that we can display this + * value on the main Agg node. + */ +static void +compute_agg_workmem(Agg *agg, double *workmem) +{ + AggWorkMem mem; + ListCell *lc; + + memset(&mem, 0, sizeof(mem)); + + compute_agg_workmem_node(agg, &mem); + + /* Also include the chain of GROUPING SETS aggs. */ + foreach(lc, agg->chain) + { + Agg *aggnode = (Agg *) lfirst(lc); + + compute_agg_workmem_node(aggnode, &mem); + } + + *workmem = mem.input_sort_workmem + mem.output_hash_workmem; + + /* We'll have at most two sort buffers alive, at any time. */ + *workmem += mem.num_sort_nodes > 2 ? 
+ mem.max_output_sort_workmem * 2.0 : + mem.max_output_sort_workmem; +} diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 8d2201ab67f..d54cfe5fdbe 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -35,6 +35,7 @@ #include "executor/nodeHash.h" #include "executor/nodeHashjoin.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "port/pg_bitutils.h" #include "utils/dynahash.h" #include "utils/lsyscache.h" @@ -452,6 +453,7 @@ ExecHashTableCreate(HashState *state) int nbuckets; int nbatch; double rows; + int workmem; /* ignored */ int num_skew_mcvs; int log2_nbuckets; MemoryContext oldcxt; @@ -477,7 +479,7 @@ ExecHashTableCreate(HashState *state) state->parallel_state != NULL ? state->parallel_state->nparticipants - 1 : 0, &space_allowed, - &nbuckets, &nbatch, &num_skew_mcvs); + &nbuckets, &nbatch, &num_skew_mcvs, &workmem); /* nbuckets must be a power of 2 */ log2_nbuckets = my_log2(nbuckets); @@ -661,7 +663,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, size_t *space_allowed, int *numbuckets, int *numbatches, - int *num_skew_mcvs) + int *num_skew_mcvs, + int *workmem) { int tupsize; double inner_rel_bytes; @@ -792,6 +795,9 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, * the required bucket headers, we will need multiple batches. 
*/ bucket_bytes = sizeof(HashJoinTuple) * nbuckets; + + *workmem = normalize_workmem(inner_rel_bytes + bucket_bytes); + if (inner_rel_bytes + bucket_bytes > hash_table_bytes) { /* We'll need multiple batches */ @@ -811,7 +817,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, space_allowed, numbuckets, numbatches, - num_skew_mcvs); + num_skew_mcvs, + workmem); return; } @@ -929,7 +936,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, nbatch /= 2; nbuckets *= 2; - *space_allowed = (*space_allowed) * 2; + *space_allowed = (*space_allowed) * 2; } Assert(nbuckets > 0); diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 66b3c387d53..43df31cdb21 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -1558,6 +1558,24 @@ tbm_calculate_entries(Size maxbytes) return (int) nbuckets; } +/* + * tbm_calculate_bytes + * + * Estimate number of bytes needed to store maxentries hashtable entries. + * + * This function is the inverse of tbm_calculate_entries(), and is used to + * estimate a work_mem limit, based on cardinality. + */ +double +tbm_calculate_bytes(double maxentries) +{ + maxentries = Min(maxentries, INT_MAX - 1); /* safety limit */ + maxentries = Max(maxentries, 16); /* sanity limit */ + + return maxentries * (sizeof(PagetableEntry) + sizeof(Pointer) + + sizeof(Pointer)); +} + /* * Create a shared or private bitmap iterator and start iteration.
* diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 73d78617009..7c1fdde842b 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -104,6 +104,7 @@ #include "optimizer/plancat.h" #include "optimizer/restrictinfo.h" #include "parser/parsetree.h" +#include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/selfuncs.h" #include "utils/spccache.h" @@ -200,9 +201,14 @@ static Cost append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers); static void set_rel_width(PlannerInfo *root, RelOptInfo *rel); static int32 get_expr_width(PlannerInfo *root, const Node *expr); -static double relation_byte_size(double tuples, int width); static double page_size(double tuples, int width); static double get_parallel_divisor(Path *path); +static void compute_sort_output_sizes(double input_tuples, int input_width, + double limit_tuples, + double *output_tuples, + double *output_bytes); +static double compute_bitmap_workmem(RelOptInfo *baserel, Path *bitmapqual, + Cardinality max_ancestor_rows); /* @@ -1112,6 +1118,18 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, path->disabled_nodes = enable_bitmapscan ? 0 : 1; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; + + + /* + * Set an overall working-memory estimate for the entire BitmapHeapPath -- + * including all of the IndexPaths and BitmapOrPaths in its bitmapqual. + * + * (When we convert this path into a BitmapHeapScan plan, we'll break this + * overall estimate down into per-node estimates, just as we do for + * AggPaths.) 
+ */ + path->workmem = compute_bitmap_workmem(baserel, bitmapqual, + 0.0 /* max_ancestor_rows */ ); } /* @@ -1587,6 +1605,16 @@ cost_functionscan(Path *path, PlannerInfo *root, path->disabled_nodes = 0; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; + + /* + * Per "XXX" comment above, this workmem estimate is likely to be wrong, + * because the "rows" estimate is pretty phony. Report the estimate + * anyway, for completeness. (This is at least better than saying it won't + * use *any* working memory.) + */ + path->workmem = list_length(rte->functions) * + normalize_workmem(relation_byte_size(path->rows, + path->pathtarget->width)); } /* @@ -1644,6 +1672,16 @@ cost_tablefuncscan(Path *path, PlannerInfo *root, path->disabled_nodes = 0; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; + + /* + * Per "XXX" comment above, this workmem estimate is likely to be wrong, + * because the "rows" estimate is pretty phony. Report the estimate + * anyway, for completeness. (This is at least better than saying it won't + * use *any* working memory.) + */ + path->workmem = + normalize_workmem(relation_byte_size(path->rows, + path->pathtarget->width)); } /* @@ -1740,6 +1778,9 @@ cost_ctescan(Path *path, PlannerInfo *root, path->disabled_nodes = 0; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; + path->workmem = + normalize_workmem(relation_byte_size(path->rows, + path->pathtarget->width)); } /* @@ -1823,7 +1864,7 @@ cost_resultscan(Path *path, PlannerInfo *root, * We are given Paths for the nonrecursive and recursive terms. 
*/ void -cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) +cost_recursive_union(RecursiveUnionPath *runion, Path *nrterm, Path *rterm) { Cost startup_cost; Cost total_cost; @@ -1850,12 +1891,37 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) */ total_cost += cpu_tuple_cost * total_rows; - runion->disabled_nodes = nrterm->disabled_nodes + rterm->disabled_nodes; - runion->startup_cost = startup_cost; - runion->total_cost = total_cost; - runion->rows = total_rows; - runion->pathtarget->width = Max(nrterm->pathtarget->width, - rterm->pathtarget->width); + runion->path.disabled_nodes = nrterm->disabled_nodes + rterm->disabled_nodes; + runion->path.startup_cost = startup_cost; + runion->path.total_cost = total_cost; + runion->path.rows = total_rows; + runion->path.pathtarget->width = Max(nrterm->pathtarget->width, + rterm->pathtarget->width); + + /* + * Include memory for working and intermediate tables. Since we'll + * repeatedly swap the two tables, use 2x whichever is larger as our + * estimate. + */ + runion->path.workmem = + normalize_workmem( + Max(relation_byte_size(nrterm->rows, + nrterm->pathtarget->width), + relation_byte_size(rterm->rows, + rterm->pathtarget->width)) + * 2); + + if (list_length(runion->distinctList) > 0) + { + /* Also include memory for hash table. 
*/ + Size hashentrysize; + + hashentrysize = MAXALIGN(runion->path.pathtarget->width) + + MAXALIGN(SizeofMinimalTupleHeader); + + runion->path.workmem += + normalize_workmem(runion->numGroups * hashentrysize); + } } /* @@ -1895,7 +1961,7 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound */ static void -cost_tuplesort(Cost *startup_cost, Cost *run_cost, +cost_tuplesort(Cost *startup_cost, Cost *run_cost, Cost *nbytes, double tuples, int width, Cost comparison_cost, int sort_mem, double limit_tuples) @@ -1915,17 +1981,8 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost, /* Include the default cost-per-comparison */ comparison_cost += 2.0 * cpu_operator_cost; - /* Do we have a useful LIMIT? */ - if (limit_tuples > 0 && limit_tuples < tuples) - { - output_tuples = limit_tuples; - output_bytes = relation_byte_size(output_tuples, width); - } - else - { - output_tuples = tuples; - output_bytes = input_bytes; - } + compute_sort_output_sizes(tuples, width, limit_tuples, + &output_tuples, &output_bytes); if (output_bytes > sort_mem_bytes) { @@ -1982,6 +2039,7 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost, * counting the LIMIT otherwise. */ *run_cost = cpu_operator_cost * tuples; + *nbytes = output_bytes; } /* @@ -2011,6 +2069,7 @@ cost_incremental_sort(Path *path, input_groups; Cost group_startup_cost, group_run_cost, + group_nbytes, group_input_run_cost; List *presortedExprs = NIL; ListCell *l; @@ -2085,7 +2144,7 @@ cost_incremental_sort(Path *path, * Estimate the average cost of sorting of one group where presorted keys * are equal. 
*/ - cost_tuplesort(&group_startup_cost, &group_run_cost, + cost_tuplesort(&group_startup_cost, &group_run_cost, &group_nbytes, group_tuples, width, comparison_cost, sort_mem, limit_tuples); @@ -2126,6 +2185,14 @@ cost_incremental_sort(Path *path, path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; + + /* + * Incremental sort switches between two Tuplesortstates: one that sorts + * all columns ("full"), and that sorts only suffix columns ("prefix"). + * We'll assume they're both around the same size: large enough to hold + * one sort group. + */ + path->workmem = normalize_workmem(group_nbytes * 2.0); } /* @@ -2150,8 +2217,9 @@ cost_sort(Path *path, PlannerInfo *root, { Cost startup_cost; Cost run_cost; + Cost nbytes; - cost_tuplesort(&startup_cost, &run_cost, + cost_tuplesort(&startup_cost, &run_cost, &nbytes, tuples, width, comparison_cost, sort_mem, limit_tuples); @@ -2162,6 +2230,7 @@ cost_sort(Path *path, PlannerInfo *root, path->disabled_nodes = input_disabled_nodes + (enable_sort ? 0 : 1); path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; + path->workmem = normalize_workmem(nbytes); } /* @@ -2522,6 +2591,7 @@ cost_material(Path *path, path->disabled_nodes = input_disabled_nodes + (enable_material ? 0 : 1); path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; + path->workmem = normalize_workmem(nbytes); } /* @@ -2592,6 +2662,9 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, if ((estinfo.flags & SELFLAG_USED_DEFAULT) != 0) ndistinct = calls; + /* How much working memory would we need, to store every distinct tuple? 
*/ + mpath->path.workmem = normalize_workmem(ndistinct * est_entry_bytes); + /* * Since we've already estimated the maximum number of entries we can * store at once and know the estimated number of distinct values we'll be @@ -2866,6 +2939,19 @@ cost_agg(Path *path, PlannerInfo *root, path->disabled_nodes = disabled_nodes; path->startup_cost = startup_cost; path->total_cost = total_cost; + + /* Include memory needed to produce output. */ + path->workmem = + compute_agg_output_workmem(root, aggstrategy, numGroups, + aggcosts->transitionSpace, input_tuples, + input_width, false /* cost_sort */ ); + + /* Also include memory needed to sort inputs (if needed): */ + if (aggcosts->numSorts > 0) + { + path->workmem += (double) aggcosts->numSorts * + compute_agg_input_workmem(input_tuples, input_width); + } } /* @@ -3100,7 +3186,7 @@ cost_windowagg(Path *path, PlannerInfo *root, List *windowFuncs, WindowClause *winclause, int input_disabled_nodes, Cost input_startup_cost, Cost input_total_cost, - double input_tuples) + double input_tuples, int width) { Cost startup_cost; Cost total_cost; @@ -3182,6 +3268,11 @@ cost_windowagg(Path *path, PlannerInfo *root, if (startup_tuples > 1.0) path->startup_cost += (total_cost - startup_cost) / input_tuples * (startup_tuples - 1.0); + + + /* We need to store a window of size "startup_tuples", in a Tuplestore. 
*/ + path->workmem = + normalize_workmem(relation_byte_size(startup_tuples, width)); } /* @@ -3336,6 +3427,7 @@ initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace, workspace->total_cost = startup_cost + run_cost; /* Save private data for final_cost_nestloop */ workspace->run_cost = run_cost; + workspace->workmem = 0; } /* @@ -3799,6 +3891,14 @@ initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace, workspace->total_cost = startup_cost + run_cost + inner_run_cost; /* Save private data for final_cost_mergejoin */ workspace->run_cost = run_cost; + + /* + * By itself, Merge Join requires no working memory. If it adds one or + * more Sort or Material nodes, we'll track their working memory when we + * create them, inside createplan.c. + */ + workspace->workmem = 0; + workspace->inner_run_cost = inner_run_cost; workspace->outer_rows = outer_rows; workspace->inner_rows = inner_rows; @@ -4170,6 +4270,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace, double outer_path_rows = outer_path->rows; double inner_path_rows = inner_path->rows; double inner_path_rows_total = inner_path_rows; + int workmem; int num_hashclauses = list_length(hashclauses); int numbuckets; int numbatches; @@ -4227,7 +4328,8 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace, &space_allowed, &numbuckets, &numbatches, - &num_skew_mcvs); + &num_skew_mcvs, + &workmem); /* * If inner relation is too big then we will need to "batch" the join, @@ -4258,6 +4360,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace, workspace->numbuckets = numbuckets; workspace->numbatches = numbatches; workspace->inner_rows_total = inner_path_rows_total; + workspace->workmem = workmem; } /* @@ -4266,8 +4369,8 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace, * * Note: the numbatches estimate is also saved into 'path' for use later * - * 'path' is already filled in except for the rows and cost fields 
and - * num_batches + * 'path' is already filled in except for the rows and cost fields, + * num_batches, and workmem * 'workspace' is the result from initial_cost_hashjoin * 'extra' contains miscellaneous information about the join */ @@ -4284,6 +4387,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, List *hashclauses = path->path_hashclauses; Cost startup_cost = workspace->startup_cost; Cost run_cost = workspace->run_cost; + int workmem = workspace->workmem; int numbuckets = workspace->numbuckets; int numbatches = workspace->numbatches; Cost cpu_per_tuple; @@ -4510,6 +4614,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, path->jpath.path.startup_cost = startup_cost; path->jpath.path.total_cost = startup_cost + run_cost; + path->jpath.path.workmem = workmem; } @@ -4532,6 +4637,9 @@ cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan) if (subplan->useHashTable) { + long nbuckets; + Size hashentrysize; + /* * If we are using a hash table for the subquery outputs, then the * cost of evaluating the query is a one-time cost. We charge one @@ -4541,6 +4649,37 @@ cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan) sp_cost.startup += plan->total_cost + cpu_operator_cost * plan->plan_rows; + /* + * Estimate working memory needed for the hashtable (and hashnulls, if + * needed). The logic below MUST match the logic in buildSubPlanHash() + * and ExecInitSubPlan(). + */ + nbuckets = clamp_cardinality_to_long(plan->plan_rows); + if (nbuckets < 1) + nbuckets = 1; + + hashentrysize = MAXALIGN(plan->plan_width) + + MAXALIGN(SizeofMinimalTupleHeader); + + subplan->hashtab_workmem = + normalize_workmem((double) nbuckets * hashentrysize); + + if (!subplan->unknownEqFalse) + { + /* Also needs a hashnulls table. 
*/ + if (IsA(subplan->testexpr, OpExpr)) + nbuckets = 1; /* there can be only one entry */ + else + { + nbuckets /= 16; + if (nbuckets < 1) + nbuckets = 1; + } + + subplan->hashnul_workmem = + normalize_workmem((double) nbuckets * hashentrysize); + } + /* * The per-tuple costs include the cost of evaluating the lefthand * expressions, plus the cost of probing the hashtable. We already @@ -6424,7 +6563,7 @@ get_expr_width(PlannerInfo *root, const Node *expr) * Estimate the storage space in bytes for a given number of tuples * of a given width (size in bytes). */ -static double +double relation_byte_size(double tuples, int width) { return tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader)); @@ -6603,3 +6742,197 @@ compute_gather_rows(Path *path) return clamp_row_est(path->rows * get_parallel_divisor(path)); } + +/* + * compute_sort_output_sizes + * Estimate amount of memory and rows needed to hold a Sort operator's output + */ +static void +compute_sort_output_sizes(double input_tuples, int input_width, + double limit_tuples, + double *output_tuples, double *output_bytes) +{ + /* + * We want to be sure the cost of a sort is never estimated as zero, even + * if passed-in tuple count is zero. Besides, mustn't do log(0)... + */ + if (input_tuples < 2.0) + input_tuples = 2.0; + + /* Do we have a useful LIMIT? */ + if (limit_tuples > 0 && limit_tuples < input_tuples) + *output_tuples = limit_tuples; + else + *output_tuples = input_tuples; + + *output_bytes = relation_byte_size(*output_tuples, input_width); +} + +/* + * compute_agg_input_workmem + * Estimate memory (in KB) needed to hold a sort buffer for aggregate's input + * + * Some aggregates involve DISTINCT or ORDER BY, so they need to sort their + * input, before they can process it. We need one sort buffer per such + * aggregate, and this function returns that sort buffer's (estimated) size (in + * KB). 
+ */ +int +compute_agg_input_workmem(double input_tuples, double input_width) +{ + /* Account for size of one buffer needed to sort the input. */ + return normalize_workmem(input_tuples * input_width); +} + +/* + * compute_agg_output_workmem + * Estimate amount of memory needed (in KB) to hold an aggregate's output + * + * In a Hash aggregate, we need space for the hash table that holds the + * aggregated data. + * + * Sort aggregates require output space only if they are part of a Grouping + * Sets chain: the first aggregate writes to its "sort_out" buffer, which the + * second aggregate uses as its "sort_in" buffer, and sorts. + * + * In the latter case, the "Path" code already costs the sort by calling + * cost_sort(), so it passes "cost_sort = false" to this function, to avoid + * double-counting. + */ +int +compute_agg_output_workmem(PlannerInfo *root, AggStrategy aggstrategy, + double numGroups, uint64 transitionSpace, + double input_tuples, double input_width, + bool cost_sort) +{ + /* Account for size of hash table to hold the output. */ + if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED) + { + double hashentrysize; + + hashentrysize = hash_agg_entry_size(list_length(root->aggtransinfos), + input_width, transitionSpace); + return normalize_workmem(numGroups * hashentrysize); + } + + /* Account for the size of the "sort_out" buffer. */ + if (cost_sort && aggstrategy == AGG_SORTED) + { + double output_tuples; /* ignored */ + double output_bytes; + + Assert(aggstrategy == AGG_SORTED); + + compute_sort_output_sizes(numGroups, input_width, + 0.0 /* limit_tuples */ , + &output_tuples, &output_bytes); + return normalize_workmem(output_bytes); + } + + return 0; +} + +/* + * compute_bitmap_workmem + * Estimate total working memory (in KB) needed by bitmapqual + * + * Although we don't fill in the workmem_est or rows fields on the bitmapqual's + * paths, we fill them in on the owning BitmapHeapPath. 
This function estimates + the total work_mem needed by all BitmapOrPaths and IndexPaths inside + bitmapqual. + */ +static double +compute_bitmap_workmem(RelOptInfo *baserel, Path *bitmapqual, + Cardinality max_ancestor_rows) +{ + double workmem = 0.0; + Cost cost; /* not used */ + Selectivity selec; + Cardinality plan_rows; + + /* How many rows will this node output? */ + cost_bitmap_tree_node(bitmapqual, &cost, &selec); + plan_rows = clamp_row_est(selec * baserel->tuples); + + /* + * At runtime, we'll reuse the left-most child's TID bitmap. Let that + * child know to request enough working memory to hold all its + * ancestors' results. + */ + max_ancestor_rows = Max(max_ancestor_rows, plan_rows); + + if (IsA(bitmapqual, BitmapAndPath)) + { + BitmapAndPath *apath = (BitmapAndPath *) bitmapqual; + ListCell *l; + + foreach(l, apath->bitmapquals) + { + workmem += + compute_bitmap_workmem(baserel, (Path *) lfirst(l), + foreach_current_index(l) == 0 ? + max_ancestor_rows : 0.0); + } + } + else if (IsA(bitmapqual, BitmapOrPath)) + { + BitmapOrPath *opath = (BitmapOrPath *) bitmapqual; + ListCell *l; + + foreach(l, opath->bitmapquals) + { + workmem += + compute_bitmap_workmem(baserel, (Path *) lfirst(l), + foreach_current_index(l) == 0 ? + max_ancestor_rows : 0.0); + } + } + else if (IsA(bitmapqual, IndexPath)) + { + /* Working memory needed for 1 TID bitmap. */ + workmem += + normalize_workmem(tbm_calculate_bytes(max_ancestor_rows)); + } + + return workmem; +} + +/* + * normalize_workmem + * Convert a double, "bytes" working-memory estimate to an int, "KB" value + * + * Normalizes to a minimum of 64 (KB), rounding up to the nearest whole KB. + */ +int +normalize_workmem(double nbytes) +{ + double workmem; + + /* + * We'll assign working-memory to SQL operators in 1 KB increments, so + * round up to the next whole KB.
+ */ + workmem = ceil(nbytes / 1024.0); + + /* + * Although some components can probably work with < 64 KB of working + * memory, PostgreSQL has imposed a hard minimum of 64 KB on the + * "work_mem" GUC, for a long time; so, by now, some components probably + * rely on this minimum, implicitly, and would fail if we tried to assign + * them < 64 KB. + * + * Perhaps this minimum can be relaxed, in the future; but memory sizes + * keep increasing, and right now the minimum of 64 KB = 1.6 percent of + * the default "work_mem" of 4 MB. + * + * So, even with this (overly?) cautious normalization, with the default + * GUC settings, we can still achieve a working-memory reduction of + * 64-to-1. + */ + workmem = Max((double) 64, workmem); + + /* And clamp to MAX_KILOBYTES. */ + workmem = Min(workmem, (double) MAX_KILOBYTES); + + return (int) workmem; +} diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 816a2b2a576..973b86371ef 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -130,6 +130,7 @@ static BitmapHeapScan *create_bitmap_scan_plan(PlannerInfo *root, BitmapHeapPath *best_path, List *tlist, List *scan_clauses); static Plan *create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, + Cardinality max_ancestor_rows, List **qual, List **indexqual, List **indexECs); static void bitmap_subplan_mark_shared(Plan *plan); static TidScan *create_tidscan_plan(PlannerInfo *root, TidPath *best_path, @@ -1853,6 +1854,7 @@ create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags) groupCollations, NIL, NIL, + 0, /* numSorts */ best_path->path.rows, 0, subplan); @@ -1911,6 +1913,15 @@ create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags) /* Copy cost data from Path to Plan */ copy_generic_path_info(plan, &best_path->path); + if (IsA(plan, Unique)) + { + /* + * We assigned "workmem" to the Sort subplan. 
Clear it from the top- + * level Unique node, to avoid double-counting. + */ + plan->workmem = 0; + } + + return plan; } @@ -2228,6 +2239,13 @@ create_incrementalsort_plan(PlannerInfo *root, IncrementalSortPath *best_path, copy_generic_path_info(&plan->sort.plan, (Path *) best_path); + /* + * IncrementalSort creates two sort buffers, which the Path's "workmem" + * estimate combined into a single value. Split it into two now. + */ + plan->sort.plan.workmem = + normalize_workmem(best_path->spath.path.workmem / 2); + return plan; } @@ -2333,12 +2351,29 @@ create_agg_plan(PlannerInfo *root, AggPath *best_path) subplan->targetlist), NIL, NIL, + best_path->numSorts, best_path->numGroups, best_path->transitionSpace, subplan); copy_generic_path_info(&plan->plan, (Path *) best_path); + /* + * Replace the overall workmem estimate that we copied from the Path + * with finer-grained estimates. + */ + plan->plan.workmem = + compute_agg_output_workmem(root, plan->aggstrategy, plan->numGroups, + plan->transitionSpace, subplan->plan_rows, + subplan->plan_width, false /* cost_sort */ ); + + /* Also include estimated memory needed to sort the input: */ + if (plan->numSorts > 0) + { + plan->sortWorkMem = compute_agg_input_workmem(subplan->plan_rows, + subplan->plan_width); + } + return plan; } @@ -2457,8 +2492,9 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path) RollupData *rollup = lfirst(lc); AttrNumber *new_grpColIdx; Plan *sort_plan = NULL; - Plan *agg_plan; + Agg *agg_plan; AggStrategy strat; + bool cost_sort; new_grpColIdx = remap_groupColIdx(root, rollup->groupClause); @@ -2480,19 +2516,20 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path) else strat = AGG_SORTED; - agg_plan = (Plan *) make_agg(NIL, - NIL, - strat, - AGGSPLIT_SIMPLE, - list_length((List *) linitial(rollup->gsets)), - new_grpColIdx, - extract_grouping_ops(rollup->groupClause), - extract_grouping_collations(rollup->groupClause, subplan->targetlist), - 
rollup->gsets, - NIL, - rollup->numGroups, - best_path->transitionSpace, - sort_plan); + agg_plan = make_agg(NIL, + NIL, + strat, + AGGSPLIT_SIMPLE, + list_length((List *) linitial(rollup->gsets)), + new_grpColIdx, + extract_grouping_ops(rollup->groupClause), + extract_grouping_collations(rollup->groupClause, subplan->targetlist), + rollup->gsets, + NIL, + best_path->numSorts, + rollup->numGroups, + best_path->transitionSpace, + sort_plan); /* * Remove stuff we don't need to avoid bloating debug output. @@ -2503,7 +2540,36 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path) sort_plan->lefttree = NULL; } - chain = lappend(chain, agg_plan); + /* + * If we're an AGG_SORTED, but not the last, we need to cost + * working memory needed to produce our "sort_out" buffer. + */ + cost_sort = foreach_current_index(lc) < list_length(rollups) - 1; + + /* + * Although this side node doesn't need accurate cost estimates, + * it does need an accurate *memory* estimate, since we'll use + * that estimate to distribute working memory to this side node, + * at runtime. 
+ */ + + /* Estimated memory needed to hold the output: */ + agg_plan->plan.workmem = + compute_agg_output_workmem(root, agg_plan->aggstrategy, + agg_plan->numGroups, + agg_plan->transitionSpace, + subplan->plan_rows, + subplan->plan_width, cost_sort); + + /* Also include estimated memory needed to sort the input: */ + if (agg_plan->numSorts > 0) + { + agg_plan->sortWorkMem = + compute_agg_input_workmem(subplan->plan_rows, + subplan->plan_width); + } + + chain = lappend(chain, (Plan *) agg_plan); } } @@ -2514,6 +2580,7 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path) RollupData *rollup = linitial(rollups); AttrNumber *top_grpColIdx; int numGroupCols; + bool cost_sort; top_grpColIdx = remap_groupColIdx(root, rollup->groupClause); @@ -2529,12 +2596,37 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path) extract_grouping_collations(rollup->groupClause, subplan->targetlist), rollup->gsets, chain, + best_path->numSorts, rollup->numGroups, best_path->transitionSpace, subplan); /* Copy cost data from Path to Plan */ copy_generic_path_info(&plan->plan, &best_path->path); + + /* + * If we're an AGG_SORTED, but not the last, we need to cost working + * memory needed to produce our "sort_out" buffer. + */ + cost_sort = list_length(rollups) > 1; + + /* + * Replace the overall workmem estimate that we copied from the Path + * with finer-grained estimates. 
+ */ + plan->plan.workmem = + compute_agg_output_workmem(root, plan->aggstrategy, plan->numGroups, + plan->transitionSpace, + subplan->plan_rows, subplan->plan_width, + cost_sort); + + /* Also include estimated memory needed to sort the input: */ + if (plan->numSorts > 0) + { + plan->sortWorkMem = + compute_agg_input_workmem(subplan->plan_rows, + subplan->plan_width); + } } return (Plan *) plan; @@ -2783,6 +2875,38 @@ create_recursiveunion_plan(PlannerInfo *root, RecursiveUnionPath *best_path) copy_generic_path_info(&plan->plan, (Path *) best_path); + /* + * Replace our overall "workmem" estimate with estimates at finer + * granularity. + */ + + /* + * Include memory for working and intermediate tables. Since we'll + * repeatedly swap the two tables, use the larger of the two as our + * working- memory estimate. + * + * NOTE: The Path's "workmem" estimate is for the whole Path, but the + * Plan's "workmem" estimates are *per data structure*. So, this value is + * half of the corresponding Path's value. + */ + plan->plan.workmem = + normalize_workmem( + Max(relation_byte_size(leftplan->plan_rows, + leftplan->plan_width), + relation_byte_size(rightplan->plan_rows, + rightplan->plan_width))); + + if (plan->numCols > 0) + { + /* Also include memory for hash table. */ + Size entrysize; + + entrysize = sizeof(TupleHashEntryData) + plan->plan.plan_width; + + plan->hashWorkMem = + normalize_workmem(plan->numGroups * entrysize); + } + return plan; } @@ -3223,6 +3347,7 @@ create_bitmap_scan_plan(PlannerInfo *root, /* Process the bitmapqual tree into a Plan tree and qual lists */ bitmapqualplan = create_bitmap_subplan(root, best_path->bitmapqual, + 0.0 /* max_ancestor_rows */ , &bitmapqualorig, &indexquals, &indexECs); @@ -3309,6 +3434,12 @@ create_bitmap_scan_plan(PlannerInfo *root, copy_generic_path_info(&scan_plan->scan.plan, &best_path->path); + /* + * We assigned "workmem" to the "bitmapqualplan" subplan. 
Clear it from + * the top-level BitmapHeapScan node, to avoid double-counting. + */ + scan_plan->scan.plan.workmem = 0; + return scan_plan; } @@ -3334,9 +3465,24 @@ create_bitmap_scan_plan(PlannerInfo *root, */ static Plan * create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, + Cardinality max_ancestor_rows, List **qual, List **indexqual, List **indexECs) { Plan *plan; + Cost cost; /* not used */ + Selectivity selec; + Cardinality plan_rows; + + /* How many rows will this node output? */ + cost_bitmap_tree_node(bitmapqual, &cost, &selec); + plan_rows = clamp_row_est(selec * bitmapqual->parent->tuples); + + /* + * At runtime, we'll reuse the left-most child's TID bitmap. Let that + * child know to request enough working memory to hold all its + * ancestors' results. + */ + max_ancestor_rows = Max(max_ancestor_rows, plan_rows); if (IsA(bitmapqual, BitmapAndPath)) { @@ -3362,6 +3508,8 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, List *subindexEC; subplan = create_bitmap_subplan(root, (Path *) lfirst(l), + foreach_current_index(l) == 0 ? + max_ancestor_rows : 0.0, &subqual, &subindexqual, &subindexEC); subplans = lappend(subplans, subplan); @@ -3373,8 +3521,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, plan = (Plan *) make_bitmap_and(subplans); plan->startup_cost = apath->path.startup_cost; plan->total_cost = apath->path.total_cost; - plan->plan_rows = - clamp_row_est(apath->bitmapselectivity * apath->path.parent->tuples); + plan->plan_rows = plan_rows; plan->plan_width = 0; /* meaningless */ plan->parallel_aware = false; plan->parallel_safe = apath->path.parallel_safe; @@ -3409,6 +3556,8 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, List *subindexEC; subplan = create_bitmap_subplan(root, (Path *) lfirst(l), + foreach_current_index(l) == 0 ? 
+ max_ancestor_rows : 0.0, &subqual, &subindexqual, &subindexEC); subplans = lappend(subplans, subplan); @@ -3437,8 +3586,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, plan = (Plan *) make_bitmap_or(subplans); plan->startup_cost = opath->path.startup_cost; plan->total_cost = opath->path.total_cost; - plan->plan_rows = - clamp_row_est(opath->bitmapselectivity * opath->path.parent->tuples); + plan->plan_rows = plan_rows; plan->plan_width = 0; /* meaningless */ plan->parallel_aware = false; plan->parallel_safe = opath->path.parallel_safe; @@ -3484,8 +3632,9 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, /* and set its cost/width fields appropriately */ plan->startup_cost = 0.0; plan->total_cost = ipath->indextotalcost; - plan->plan_rows = - clamp_row_est(ipath->indexselectivity * ipath->path.parent->tuples); + plan->workmem = + normalize_workmem(tbm_calculate_bytes(max_ancestor_rows)); + plan->plan_rows = plan_rows; plan->plan_width = 0; /* meaningless */ plan->parallel_aware = false; plan->parallel_safe = ipath->path.parallel_safe; @@ -3796,6 +3945,14 @@ create_functionscan_plan(PlannerInfo *root, Path *best_path, copy_generic_path_info(&scan_plan->scan.plan, best_path); + /* + * Replace the path's total working-memory estimate with a per-function + * estimate. + */ + scan_plan->scan.plan.workmem = + normalize_workmem(relation_byte_size(scan_plan->scan.plan.plan_rows, + scan_plan->scan.plan.plan_width)); + return scan_plan; } @@ -4615,6 +4772,9 @@ create_mergejoin_plan(PlannerInfo *root, */ copy_plan_costsize(matplan, inner_plan); matplan->total_cost += cpu_operator_cost * matplan->plan_rows; + matplan->workmem = + normalize_workmem(relation_byte_size(matplan->plan_rows, + matplan->plan_width)); inner_plan = matplan; } @@ -4961,6 +5121,10 @@ create_hashjoin_plan(PlannerInfo *root, copy_generic_path_info(&join_plan->join.plan, &best_path->jpath.path); + /* Display "workmem" on the Hash subnode, not its parent HashJoin node. 
*/ + hash_plan->plan.workmem = join_plan->join.plan.workmem; + join_plan->join.plan.workmem = 0; + return join_plan; } @@ -5458,6 +5622,7 @@ copy_generic_path_info(Plan *dest, Path *src) dest->disabled_nodes = src->disabled_nodes; dest->startup_cost = src->startup_cost; dest->total_cost = src->total_cost; + dest->workmem = (int) Min(src->workmem, (double) MAX_KILOBYTES); dest->plan_rows = src->rows; dest->plan_width = src->pathtarget->width; dest->parallel_aware = src->parallel_aware; @@ -5474,6 +5639,7 @@ copy_plan_costsize(Plan *dest, Plan *src) dest->disabled_nodes = src->disabled_nodes; dest->startup_cost = src->startup_cost; dest->total_cost = src->total_cost; + dest->workmem = src->workmem; dest->plan_rows = src->plan_rows; dest->plan_width = src->plan_width; /* Assume the inserted node is not parallel-aware. */ @@ -5509,6 +5675,7 @@ label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples) limit_tuples); plan->plan.startup_cost = sort_path.startup_cost; plan->plan.total_cost = sort_path.total_cost; + plan->plan.workmem = (int) Min(sort_path.workmem, (double) MAX_KILOBYTES); plan->plan.plan_rows = lefttree->plan_rows; plan->plan.plan_width = lefttree->plan_width; plan->plan.parallel_aware = false; @@ -5540,6 +5707,8 @@ label_incrementalsort_with_costsize(PlannerInfo *root, IncrementalSort *plan, limit_tuples); plan->sort.plan.startup_cost = sort_path.startup_cost; plan->sort.plan.total_cost = sort_path.total_cost; + plan->sort.plan.workmem = (int) Min(sort_path.workmem, + (double) MAX_KILOBYTES); plan->sort.plan.plan_rows = lefttree->plan_rows; plan->sort.plan.plan_width = lefttree->plan_width; plan->sort.plan.parallel_aware = false; @@ -6673,7 +6842,7 @@ Agg * make_agg(List *tlist, List *qual, AggStrategy aggstrategy, AggSplit aggsplit, int numGroupCols, AttrNumber *grpColIdx, Oid *grpOperators, Oid *grpCollations, - List *groupingSets, List *chain, double dNumGroups, + List *groupingSets, List *chain, int numSorts, double dNumGroups, Size 
transitionSpace, Plan *lefttree) { Agg *node = makeNode(Agg); @@ -6689,6 +6858,8 @@ make_agg(List *tlist, List *qual, node->grpColIdx = grpColIdx; node->grpOperators = grpOperators; node->grpCollations = grpCollations; + node->numSorts = numSorts; + node->sortWorkMem = 0; /* caller will fill this */ node->numGroups = numGroups; node->transitionSpace = transitionSpace; node->aggParams = NULL; /* SS_finalize_plan() will fill this */ diff --git a/src/backend/optimizer/prep/prepagg.c b/src/backend/optimizer/prep/prepagg.c index c0a2f04a8c3..3eba364484d 100644 --- a/src/backend/optimizer/prep/prepagg.c +++ b/src/backend/optimizer/prep/prepagg.c @@ -691,5 +691,17 @@ get_agg_clause_costs(PlannerInfo *root, AggSplit aggsplit, AggClauseCosts *costs costs->finalCost.startup += argcosts.startup; costs->finalCost.per_tuple += argcosts.per_tuple; } + + /* + * How many aggrefs need to sort their input? (Each such aggref gets + * its own sort buffer. The logic here MUST match the corresponding + * logic in function build_pertrans_for_aggref().) + */ + if (!AGGKIND_IS_ORDERED_SET(aggref->aggkind) && + !aggref->aggpresorted && + (aggref->aggdistinct || aggref->aggorder)) + { + ++costs->numSorts; + } } } diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 93e73cb44db..c533bfb9a58 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1709,6 +1709,13 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->path.total_cost = subpath->total_cost + cpu_tuple_cost; pathnode->path.rows = subpath->rows; + /* + * For now, set workmem at hash memory limit. Function + * cost_memoize_rescan() will adjust this field, same as it does for field + * "est_entries". 
+ */ + pathnode->path.workmem = normalize_workmem(get_hash_memory_limit()); + return pathnode; } @@ -1937,12 +1944,14 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->path.disabled_nodes = agg_path.disabled_nodes; pathnode->path.startup_cost = agg_path.startup_cost; pathnode->path.total_cost = agg_path.total_cost; + pathnode->path.workmem = agg_path.workmem; } else { pathnode->path.disabled_nodes = sort_path.disabled_nodes; pathnode->path.startup_cost = sort_path.startup_cost; pathnode->path.total_cost = sort_path.total_cost; + pathnode->path.workmem = sort_path.workmem; } rel->cheapest_unique_path = (Path *) pathnode; @@ -2289,6 +2298,13 @@ create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel, /* Cost is the same as for a regular CTE scan */ cost_ctescan(pathnode, root, rel, pathnode->param_info); + /* + * But working memory used is 0, since the worktable scan doesn't create a + * tuplestore -- it just reuses a tuplestore already created by a + * recursive union. + */ + pathnode->workmem = 0; + return pathnode; } @@ -3283,6 +3299,7 @@ create_agg_path(PlannerInfo *root, pathnode->aggstrategy = aggstrategy; pathnode->aggsplit = aggsplit; + pathnode->numSorts = aggcosts ? aggcosts->numSorts : 0; pathnode->numGroups = numGroups; pathnode->transitionSpace = aggcosts ? aggcosts->transitionSpace : 0; pathnode->groupClause = groupClause; @@ -3333,6 +3350,8 @@ create_groupingsets_path(PlannerInfo *root, ListCell *lc; bool is_first = true; bool is_first_sort = true; + int num_sort_nodes = 0; + double max_sort_workmem = 0.0; /* The topmost generated Plan node will be an Agg */ pathnode->path.pathtype = T_Agg; @@ -3369,6 +3388,7 @@ create_groupingsets_path(PlannerInfo *root, pathnode->path.pathkeys = NIL; pathnode->aggstrategy = aggstrategy; + pathnode->numSorts = agg_costs ? agg_costs->numSorts : 0; pathnode->rollups = rollups; pathnode->qual = having_qual; pathnode->transitionSpace = agg_costs ? 
agg_costs->transitionSpace : 0; @@ -3432,6 +3452,8 @@ create_groupingsets_path(PlannerInfo *root, subpath->pathtarget->width); if (!rollup->is_hashed) is_first_sort = false; + + pathnode->path.workmem += agg_path.workmem; } else { @@ -3444,6 +3466,12 @@ create_groupingsets_path(PlannerInfo *root, work_mem, -1.0); + /* + * We costed sorting the previous "sort" rollup's "sort_out" + * buffer. How much memory did it need? + */ + max_sort_workmem = Max(max_sort_workmem, sort_path.workmem); + /* Account for cost of aggregation */ cost_agg(&agg_path, root, @@ -3457,12 +3485,17 @@ create_groupingsets_path(PlannerInfo *root, sort_path.total_cost, sort_path.rows, subpath->pathtarget->width); + + pathnode->path.workmem += agg_path.workmem; } pathnode->path.disabled_nodes += agg_path.disabled_nodes; pathnode->path.total_cost += agg_path.total_cost; pathnode->path.rows += agg_path.rows; } + + if (!rollup->is_hashed) + ++num_sort_nodes; } /* add tlist eval cost for each output row */ @@ -3470,6 +3503,17 @@ create_groupingsets_path(PlannerInfo *root, pathnode->path.total_cost += target->cost.startup + target->cost.per_tuple * pathnode->path.rows; + /* + * Include working memory needed to sort agg output. If there's only 1 + * sort rollup, then we don't need any memory. If there are 2 sort + * rollups, we need enough memory for 1 sort buffer. If there are >= 3 + * sort rollups, we need only 2 sort buffers, since we're + * double-buffering. + */ + pathnode->path.workmem += num_sort_nodes > 2 ? 
+ max_sort_workmem * 2.0 : + max_sort_workmem; + return pathnode; } @@ -3619,7 +3663,8 @@ create_windowagg_path(PlannerInfo *root, subpath->disabled_nodes, subpath->startup_cost, subpath->total_cost, - subpath->rows); + subpath->rows, + subpath->pathtarget->width); /* add tlist eval cost for each output row */ pathnode->path.startup_cost += target->cost.startup; @@ -3744,7 +3789,11 @@ create_setop_path(PlannerInfo *root, MAXALIGN(SizeofMinimalTupleHeader); if (hashentrysize * numGroups > get_hash_memory_limit()) pathnode->path.disabled_nodes++; + + pathnode->path.workmem = + normalize_workmem(numGroups * hashentrysize); } + pathnode->path.rows = outputRows; return pathnode; @@ -3795,7 +3844,7 @@ create_recursiveunion_path(PlannerInfo *root, pathnode->wtParam = wtParam; pathnode->numGroups = numGroups; - cost_recursive_union(&pathnode->path, leftpath, rightpath); + cost_recursive_union(pathnode, leftpath, rightpath); return pathnode; } diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 570e7cad1fa..50454952eb2 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -53,6 +53,7 @@ typedef struct ExplainState bool timing; /* print detailed node timing */ bool summary; /* print total planning and execution timing */ bool memory; /* print planner's memory usage information */ + bool work_mem; /* print work_mem estimates per node */ bool settings; /* print modified settings */ bool generic; /* generate a generic plan */ ExplainSerializeOption serialize; /* serialize the query's output? 
*/ @@ -69,6 +70,8 @@ typedef struct ExplainState bool hide_workers; /* set if we find an invisible Gather */ int rtable_size; /* length of rtable excluding the RTE_GROUP * entry */ + int num_workers; /* # of worker processes planned to use */ + double total_workmem; /* total working memory estimate (in bytes) */ /* state related to the current plan node */ ExplainWorkersState *workers_state; /* needed if parallel plan */ } ExplainState; diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h index 3c1a09415aa..fc5b20994dd 100644 --- a/src/include/executor/nodeHash.h +++ b/src/include/executor/nodeHash.h @@ -62,7 +62,8 @@ extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, size_t *space_allowed, int *numbuckets, int *numbatches, - int *num_skew_mcvs); + int *num_skew_mcvs, + int *workmem); extern int ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue); extern void ExecHashEstimate(HashState *node, ParallelContext *pcxt); extern void ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt); diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index fbf05322c75..17eb6b52579 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -60,6 +60,7 @@ typedef struct AggClauseCosts QualCost transCost; /* total per-input-row execution costs */ QualCost finalCost; /* total per-aggregated-row costs */ Size transitionSpace; /* space for pass-by-ref transition data */ + int numSorts; /* # of required input-sort buffers */ } AggClauseCosts; /* @@ -1697,6 +1698,13 @@ typedef struct Path Cost startup_cost; /* cost expended before fetching any tuples */ Cost total_cost; /* total cost (assuming all tuples fetched) */ + /* + * NOTE: The Path's workmem is a double, rather than an int, because it + * sometimes combines multiple working-memory estimates (e.g., for + * GroupingSetsPath). 
+ */ + Cost workmem; /* estimated work_mem (in KB) */ + /* sort ordering of path's output; a List of PathKey nodes; see above */ List *pathkeys; } Path; @@ -2290,6 +2298,7 @@ typedef struct AggPath Path *subpath; /* path representing input source */ AggStrategy aggstrategy; /* basic strategy, see nodes.h */ AggSplit aggsplit; /* agg-splitting mode, see nodes.h */ + int numSorts; /* number of inputs that require sorting */ Cardinality numGroups; /* estimated number of groups in input */ uint64 transitionSpace; /* for pass-by-ref transition data */ List *groupClause; /* a list of SortGroupClause's */ @@ -2331,6 +2340,7 @@ typedef struct GroupingSetsPath Path path; Path *subpath; /* path representing input source */ AggStrategy aggstrategy; /* basic strategy */ + int numSorts; /* number of inputs that require sorting */ List *rollups; /* list of RollupData */ List *qual; /* quals (HAVING quals), if any */ uint64 transitionSpace; /* for pass-by-ref transition data */ @@ -3374,6 +3384,7 @@ typedef struct JoinCostWorkspace /* Fields below here should be treated as private to costsize.c */ Cost run_cost; /* non-startup cost components */ + Cost workmem; /* estimated work_mem (in KB) */ /* private for cost_nestloop code */ Cost inner_run_cost; /* also used by cost_mergejoin code */ diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index bf1f25c0dba..67da7f091b5 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -168,6 +168,8 @@ typedef struct Plan /* total cost (assuming all tuples fetched) */ Cost total_cost; + int workmem; /* estimated work_mem (in KB) */ + /* * planner's estimate of result size of this plan step */ @@ -426,6 +428,9 @@ typedef struct RecursiveUnion /* estimated number of groups in input */ long numGroups; + + /* estimated work_mem for hash table (in KB) */ + int hashWorkMem; } RecursiveUnion; /* ---------------- @@ -1145,6 +1150,12 @@ typedef struct Agg Oid *grpOperators 
pg_node_attr(array_size(numCols)); Oid *grpCollations pg_node_attr(array_size(numCols)); + /* number of inputs that require sorting */ + int numSorts; + + /* estimated work_mem needed to sort each input (in KB) */ + int sortWorkMem; + /* estimated number of groups in input */ long numGroups; diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h index 839e71d52f4..b7d6b0fe7dc 100644 --- a/src/include/nodes/primnodes.h +++ b/src/include/nodes/primnodes.h @@ -1109,6 +1109,8 @@ typedef struct SubPlan /* Estimated execution costs: */ Cost startup_cost; /* one-time setup cost */ Cost per_call_cost; /* cost for each subplan evaluation */ + int hashtab_workmem; /* estimated hashtable work_mem (in KB) */ + int hashnul_workmem; /* estimated hashnulls work_mem (in KB) */ } SubPlan; /* diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index a6ffeac90be..df8e7de9dc2 100644 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -85,6 +85,7 @@ extern void tbm_end_shared_iterate(TBMSharedIterator *iterator); extern TBMSharedIterator *tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp); extern int tbm_calculate_entries(Size maxbytes); +extern double tbm_calculate_bytes(double maxentries); extern TBMIterator tbm_begin_iterate(TIDBitmap *tbm, dsa_area *dsa, dsa_pointer dsp); diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 3aa3c16e442..737c553a409 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -106,7 +106,7 @@ extern void cost_namedtuplestorescan(Path *path, PlannerInfo *root, RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_resultscan(Path *path, PlannerInfo *root, RelOptInfo *baserel, ParamPathInfo *param_info); -extern void cost_recursive_union(Path *runion, Path *nrterm, Path *rterm); +extern void cost_recursive_union(RecursiveUnionPath *runion, Path *nrterm, Path *rterm); extern void cost_sort(Path *path, PlannerInfo *root, List 
*pathkeys, int disabled_nodes, Cost input_cost, double tuples, int width, @@ -139,7 +139,7 @@ extern void cost_windowagg(Path *path, PlannerInfo *root, List *windowFuncs, WindowClause *winclause, int input_disabled_nodes, Cost input_startup_cost, Cost input_total_cost, - double input_tuples); + double input_tuples, int width); extern void cost_group(Path *path, PlannerInfo *root, int numGroupCols, double numGroups, List *quals, @@ -217,9 +217,17 @@ extern void set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *re extern void set_result_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern PathTarget *set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target); +extern double relation_byte_size(double tuples, int width); extern double compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual, double loop_count, Cost *cost_p, double *tuples_p); extern double compute_gather_rows(Path *path); +extern int compute_agg_input_workmem(double input_tuples, double input_width); +extern int compute_agg_output_workmem(PlannerInfo *root, + AggStrategy aggstrategy, + double numGroups, uint64 transitionSpace, + double input_tuples, double input_width, + bool cost_sort); +extern int normalize_workmem(double nbytes); #endif /* COST_H */ diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 5a930199611..cf3694a744f 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -55,7 +55,7 @@ extern Sort *make_sort_from_sortclauses(List *sortcls, Plan *lefttree); extern Agg *make_agg(List *tlist, List *qual, AggStrategy aggstrategy, AggSplit aggsplit, int numGroupCols, AttrNumber *grpColIdx, Oid *grpOperators, Oid *grpCollations, - List *groupingSets, List *chain, double dNumGroups, + List *groupingSets, List *chain, int numSorts, double dNumGroups, Size transitionSpace, Plan *lefttree); extern Limit 
*make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, LimitOption limitOption, int uniqNumCols, diff --git a/src/test/regress/expected/workmem.out b/src/test/regress/expected/workmem.out new file mode 100644 index 00000000000..215180808f4 --- /dev/null +++ b/src/test/regress/expected/workmem.out @@ -0,0 +1,631 @@ +---- +-- Tests that show "work_mem" output to EXPLAIN plans. +---- +-- Note: Function derived from file explain.sql. We can't use that other +-- function, since we're run in parallel with explain.sql. +create or replace function workmem_filter(text) returns setof text +language plpgsql as +$$ +declare + ln text; +begin + for ln in execute $1 + loop + -- Mask out work_mem estimate, since it might be brittle + ln := regexp_replace(ln, '\mwork_mem=\d+\M', 'work_mem=N', 'g'); + ln := regexp_replace(ln, '\mMemory: \d+\M', 'Memory: N', 'g'); + return next ln; + end loop; +end; +$$; +-- Unique -> hash agg +set enable_hashagg = on; +select workmem_filter(' +explain (costs off, work_mem on) +select * +from onek +where (unique1,ten) in (values (1,1), (20,0), (99,9), (17,99)) +order by unique1; +'); + workmem_filter +----------------------------------------------------------------- + Sort (work_mem=N kB) + Sort Key: onek.unique1 + -> Nested Loop + -> HashAggregate (work_mem=N kB) + Group Key: "*VALUES*".column1, "*VALUES*".column2 + -> Values Scan on "*VALUES*" + -> Index Scan using onek_unique1 on onek + Index Cond: (unique1 = "*VALUES*".column1) + Filter: ("*VALUES*".column2 = ten) + Total Working Memory: N kB +(10 rows) + +select * +from onek +where (unique1,ten) in (values (1,1), (20,0), (99,9), (17,99)) +order by unique1; + unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 +---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- + 1 | 214 | 1 | 1 | 1 | 1 | 1 | 
1 | 1 | 1 | 1 | 2 | 3 | BAAAAA | GIAAAA | OOOOxx + 20 | 306 | 0 | 0 | 0 | 0 | 0 | 20 | 20 | 20 | 20 | 0 | 1 | UAAAAA | ULAAAA | OOOOxx + 99 | 101 | 1 | 3 | 9 | 19 | 9 | 99 | 99 | 99 | 99 | 18 | 19 | VDAAAA | XDAAAA | HHHHxx +(3 rows) + +reset enable_hashagg; +-- Unique -> sort +set enable_hashagg = off; +select workmem_filter(' +explain (costs off, work_mem on) +select * +from onek +where (unique1,ten) in (values (1,1), (20,0), (99,9), (17,99)) +order by unique1; +'); + workmem_filter +---------------------------------------------------------------------- + Sort (work_mem=N kB) + Sort Key: onek.unique1 + -> Nested Loop + -> Unique + -> Sort (work_mem=N kB) + Sort Key: "*VALUES*".column1, "*VALUES*".column2 + -> Values Scan on "*VALUES*" + -> Index Scan using onek_unique1 on onek + Index Cond: (unique1 = "*VALUES*".column1) + Filter: ("*VALUES*".column2 = ten) + Total Working Memory: N kB +(11 rows) + +select * +from onek +where (unique1,ten) in (values (1,1), (20,0), (99,9), (17,99)) +order by unique1; + unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 +---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- + 1 | 214 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | BAAAAA | GIAAAA | OOOOxx + 20 | 306 | 0 | 0 | 0 | 0 | 0 | 20 | 20 | 20 | 20 | 0 | 1 | UAAAAA | ULAAAA | OOOOxx + 99 | 101 | 1 | 3 | 9 | 19 | 9 | 99 | 99 | 99 | 99 | 18 | 19 | VDAAAA | XDAAAA | HHHHxx +(3 rows) + +reset enable_hashagg; +-- Incremental Sort +select workmem_filter(' +explain (costs off, work_mem on) +select * from (select * from tenk1 order by four) t order by four, ten +limit 1; +'); + workmem_filter +----------------------------------------- + Limit + -> Incremental Sort (work_mem=N kB) + Sort Key: tenk1.four, tenk1.ten + Presorted Key: tenk1.four + -> Sort (work_mem=N kB) + Sort Key: tenk1.four + 
-> Seq Scan on tenk1 + Total Working Memory: N kB +(8 rows) + +select * from (select * from tenk1 order by four) t order by four, ten +limit 1; + unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 +---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- + 4220 | 5017 | 0 | 0 | 0 | 0 | 20 | 220 | 220 | 4220 | 4220 | 40 | 41 | IGAAAA | ZKHAAA | HHHHxx +(1 row) + +-- Hash Join +select workmem_filter(' +explain (costs off, work_mem on) +select count(*) from ( +select t1.unique1, t2.hundred +from onek t1, tenk1 t2 +where exists (select 1 from tenk1 t3 + where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) + and t1.unique1 < 1 +) t; +'); + workmem_filter +-------------------------------------------------------------------------------- + Aggregate + -> Nested Loop + -> Hash Join + Hash Cond: (t3.thousand = t1.unique1) + -> HashAggregate (work_mem=N kB) + Group Key: t3.thousand, t3.tenthous + -> Index Only Scan using tenk1_thous_tenthous on tenk1 t3 + -> Hash (work_mem=N kB) + -> Index Only Scan using onek_unique1 on onek t1 + Index Cond: (unique1 < 1) + -> Index Only Scan using tenk1_hundred on tenk1 t2 + Index Cond: (hundred = t3.tenthous) + Total Working Memory: N kB +(13 rows) + +select count(*) from ( +select t1.unique1, t2.hundred +from onek t1, tenk1 t2 +where exists (select 1 from tenk1 t3 + where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) + and t1.unique1 < 1 +) t; + count +------- + 100 +(1 row) + +-- Materialize +select workmem_filter(' +explain (costs off, work_mem on) +select count(*) from ( +select t1.f1 +from int4_tbl t1, int4_tbl t2 + left join int4_tbl t3 on t3.f1 > 0 + left join int4_tbl t4 on t3.f1 > 1 +where t4.f1 is null +) t; +'); + workmem_filter +------------------------------------------------------------- + Aggregate + -> Nested Loop + -> 
Nested Loop Left Join + Filter: (t4.f1 IS NULL) + -> Seq Scan on int4_tbl t2 + -> Materialize (work_mem=N kB) + -> Nested Loop Left Join + Join Filter: (t3.f1 > 1) + -> Seq Scan on int4_tbl t3 + Filter: (f1 > 0) + -> Materialize (work_mem=N kB) + -> Seq Scan on int4_tbl t4 + -> Seq Scan on int4_tbl t1 + Total Working Memory: N kB +(14 rows) + +select count(*) from ( +select t1.f1 +from int4_tbl t1, int4_tbl t2 + left join int4_tbl t3 on t3.f1 > 0 + left join int4_tbl t4 on t3.f1 > 1 +where t4.f1 is null +) t; + count +------- + 0 +(1 row) + +-- Grouping Sets (Hash) +select workmem_filter(' +explain (costs off, work_mem on) +select a, b, row_number() over (order by a, b nulls first) +from (values (1, 1), (2, 2)) as t (a, b) where a = b +group by grouping sets((a, b), (a)); +'); + workmem_filter +---------------------------------------------------------------------- + WindowAgg (work_mem=N kB) + -> Sort (work_mem=N kB) + Sort Key: "*VALUES*".column1, "*VALUES*".column2 NULLS FIRST + -> HashAggregate (work_mem=N kB) + Hash Key: "*VALUES*".column1, "*VALUES*".column2 + Hash Key: "*VALUES*".column1 + -> Values Scan on "*VALUES*" + Filter: (column1 = column2) + Total Working Memory: N kB +(9 rows) + +select a, b, row_number() over (order by a, b nulls first) +from (values (1, 1), (2, 2)) as t (a, b) where a = b +group by grouping sets((a, b), (a)); + a | b | row_number +---+---+------------ + 1 | | 1 + 1 | 1 | 2 + 2 | | 3 + 2 | 2 | 4 +(4 rows) + +-- Grouping Sets (Sort) +set enable_hashagg = off; +select workmem_filter(' +explain (costs off, work_mem on) +select a, b, row_number() over (order by a, b nulls first) +from (values (1, 1, 1, 1), (2, 2, 2, 2)) as t (a, b, c, d) where a = b +group by grouping sets((a, b), (a), (b), (c), (d)); +'); + workmem_filter +---------------------------------------------------------------------- + WindowAgg (work_mem=N kB) + -> Sort (work_mem=N kB) + Sort Key: "*VALUES*".column1, "*VALUES*".column2 NULLS FIRST + -> GroupAggregate 
(work_mem=N kB) + Group Key: "*VALUES*".column1, "*VALUES*".column2 + Group Key: "*VALUES*".column1 + Sort Key: "*VALUES*".column2 + Group Key: "*VALUES*".column2 + Sort Key: "*VALUES*".column3 + Group Key: "*VALUES*".column3 + Sort Key: "*VALUES*".column4 + Group Key: "*VALUES*".column4 + -> Sort (work_mem=N kB) + Sort Key: "*VALUES*".column1 + -> Values Scan on "*VALUES*" + Filter: (column1 = column2) + Total Working Memory: N kB +(17 rows) + +select a, b, row_number() over (order by a, b nulls first) +from (values (1, 1, 1, 1), (2, 2, 2, 2)) as t (a, b, c, d) where a = b +group by grouping sets((a, b), (a), (b), (c), (d)); + a | b | row_number +---+---+------------ + 1 | | 1 + 1 | 1 | 2 + 2 | | 3 + 2 | 2 | 4 + | | 5 + | | 6 + | | 7 + | | 8 + | 1 | 9 + | 2 | 10 +(10 rows) + +reset enable_hashagg; +-- Agg (hash, parallel) +set parallel_setup_cost=0; +set parallel_tuple_cost=0; +set min_parallel_table_scan_size=0; +set max_parallel_workers_per_gather=4; +select workmem_filter(' +explain (costs off, work_mem on) +select length(stringu1) from tenk1 group by length(stringu1); +'); + workmem_filter +---------------------------------------------------- + Finalize HashAggregate (work_mem=N kB) + Group Key: (length((stringu1)::text)) + -> Gather + Workers Planned: 4 + -> Partial HashAggregate (work_mem=N kB) + Group Key: length((stringu1)::text) + -> Parallel Seq Scan on tenk1 + Total Working Memory: N kB +(8 rows) + +select length(stringu1) from tenk1 group by length(stringu1); + length +-------- + 6 +(1 row) + +reset parallel_setup_cost; +reset parallel_tuple_cost; +reset min_parallel_table_scan_size; +reset max_parallel_workers_per_gather; +-- Agg (simple) [no work_mem] +explain (costs off, work_mem on) +select MAX(length(stringu1)) from tenk1; + QUERY PLAN +---------------------------- + Aggregate + -> Seq Scan on tenk1 + Total Working Memory: 0 kB +(3 rows) + +select MAX(length(stringu1)) from tenk1; + max +----- + 6 +(1 row) + +-- Function Scan +select 
workmem_filter(' +explain (work_mem on, costs off) +select count(*) from ( +select sum(n) over(partition by m) +from (SELECT n < 3 as m, n from generate_series(1,2000) a(n)) +) t; +'); + workmem_filter +----------------------------------------------------------- + Aggregate + -> Function Scan on generate_series a (work_mem=N kB) + Total Working Memory: N kB +(3 rows) + +select count(*) from ( +select sum(n) over(partition by m) +from (SELECT n < 3 as m, n from generate_series(1,2000) a(n)) +) t; + count +------- + 2000 +(1 row) + +-- Three Function Scans +select workmem_filter(' +explain (work_mem on, costs off) +select count(*) +from rows from(generate_series(1, 5), + generate_series(2, 10), + generate_series(4, 15)); +'); + workmem_filter +--------------------------------------------------------- + Aggregate + -> Function Scan on generate_series (work_mem=N kB) + Total Working Memory: N kB +(3 rows) + +select count(*) +from rows from(generate_series(1, 5), + generate_series(2, 10), + generate_series(4, 15)); + count +------- + 12 +(1 row) + +-- Table Function Scan +select workmem_filter(' +EXPLAIN (COSTS OFF, work_mem on) +SELECT xmltable.* + FROM (SELECT data FROM xmldata) x, + LATERAL XMLTABLE(''/ROWS/ROW'' + PASSING data + COLUMNS id int PATH ''@id'', + _id FOR ORDINALITY, + country_name text PATH ''COUNTRY_NAME'' NOT NULL, + country_id text PATH ''COUNTRY_ID'', + region_id int PATH ''REGION_ID'', + size float PATH ''SIZE'', + unit text PATH ''SIZE/@unit'', + premier_name text PATH ''PREMIER_NAME'' DEFAULT ''not specified''); +'); + workmem_filter +---------------------------------------------------------- + Nested Loop + -> Seq Scan on xmldata + -> Table Function Scan on "xmltable" (work_mem=N kB) + Total Working Memory: N kB +(4 rows) + +SELECT xmltable.* + FROM (SELECT data FROM xmldata) x, + LATERAL XMLTABLE('/ROWS/ROW' + PASSING data + COLUMNS id int PATH '@id', + _id FOR ORDINALITY, + country_name text PATH 'COUNTRY_NAME' NOT NULL, + country_id text PATH 
'COUNTRY_ID', + region_id int PATH 'REGION_ID', + size float PATH 'SIZE', + unit text PATH 'SIZE/@unit', + premier_name text PATH 'PREMIER_NAME' DEFAULT 'not specified'); + id | _id | country_name | country_id | region_id | size | unit | premier_name +----+-----+--------------+------------+-----------+------+------+-------------- +(0 rows) + +-- SetOp [no work_mem] +explain (costs off, work_mem on) +select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10; + QUERY PLAN +------------------------------------------------------------ + SetOp Except + -> Index Only Scan using tenk1_unique1 on tenk1 + -> Index Only Scan using tenk1_unique2 on tenk1 tenk1_1 + Filter: (unique2 <> 10) + Total Working Memory: 0 kB +(5 rows) + +select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10; + unique1 +--------- + 10 +(1 row) + +-- HashSetOp +select workmem_filter(' +explain (costs off, work_mem on) +select count(*) from + ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss; +'); + workmem_filter +------------------------------------------------------------------ + Aggregate + -> HashSetOp Intersect (work_mem=N kB) + -> Seq Scan on tenk1 + -> Index Only Scan using tenk1_unique1 on tenk1 tenk1_1 + Total Working Memory: N kB +(5 rows) + +select count(*) from + ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss; + count +------- + 5000 +(1 row) + +-- RecursiveUnion and Memoize (also WorkTable Scan [no work_mem]) +select workmem_filter(' +explain (costs off, work_mem on) +select sum(o.four), sum(ss.a) from onek o +cross join lateral (with recursive x(a) as ( + select o.four as a union select a + 1 from x where a < 10) + select * from x) ss where o.ten = 1; +'); + workmem_filter +------------------------------------------------------------ + Aggregate + -> Nested Loop + -> Seq Scan on onek o + Filter: (ten = 1) + -> Memoize (work_mem=N kB) + Cache Key: o.four + Cache Mode: binary + -> CTE Scan on x 
(work_mem=N kB) + CTE x + -> Recursive Union (work_mem=N kB) + -> Result + -> WorkTable Scan on x x_1 + Filter: (a < 10) + Total Working Memory: N kB +(14 rows) + +select sum(o.four), sum(ss.a) from onek o +cross join lateral (with recursive x(a) as ( + select o.four as a union select a + 1 from x where a < 10) + select * from x) ss where o.ten = 1; + sum | sum +------+------ + 1700 | 5350 +(1 row) + +-- CTE Scan +select workmem_filter(' +explain (costs off, work_mem on) +WITH q1(x,y) AS ( + SELECT hundred, sum(ten) FROM tenk1 GROUP BY hundred + ) +SELECT count(*) FROM q1 WHERE y > (SELECT sum(y)/100 FROM q1 qsub); +'); + workmem_filter +---------------------------------------------------- + Aggregate + CTE q1 + -> HashAggregate (work_mem=N kB) + Group Key: tenk1.hundred + -> Seq Scan on tenk1 + InitPlan 2 + -> Aggregate + -> CTE Scan on q1 qsub (work_mem=N kB) + -> CTE Scan on q1 (work_mem=N kB) + Filter: ((y)::numeric > (InitPlan 2).col1) + Total Working Memory: N kB +(11 rows) + +WITH q1(x,y) AS ( + SELECT hundred, sum(ten) FROM tenk1 GROUP BY hundred + ) +SELECT count(*) FROM q1 WHERE y > (SELECT sum(y)/100 FROM q1 qsub); + count +------- + 50 +(1 row) + +-- WindowAgg +select workmem_filter(' +explain (costs off, work_mem on) +select sum(n) over(partition by m) +from (SELECT n < 3 as m, n from generate_series(1,2000) a(n)) +limit 5; +'); + workmem_filter +----------------------------------------------------------------------- + Limit + -> WindowAgg (work_mem=N kB) + -> Sort (work_mem=N kB) + Sort Key: ((a.n < 3)) + -> Function Scan on generate_series a (work_mem=N kB) + Total Working Memory: N kB +(6 rows) + +select sum(n) over(partition by m) +from (SELECT n < 3 as m, n from generate_series(1,2000) a(n)) +limit 5; + sum +--------- + 2000997 + 2000997 + 2000997 + 2000997 + 2000997 +(5 rows) + +-- Bitmap Heap Scan +select workmem_filter(' +explain (costs off, work_mem on) +select count(*) from ( +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and 
b.unique1 = 2) or (a.unique2 = 3 and b.hundred = 4) +); +'); + workmem_filter +------------------------------------------------------------------------------------------------------- + Aggregate + -> Nested Loop + Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR ((a.unique2 = 3) AND (b.hundred = 4))) + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: ((hundred = 4) OR (unique1 = 2)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_hundred (work_mem=N kB) + Index Cond: (hundred = 4) + -> Bitmap Index Scan on tenk1_unique1 (work_mem=N kB) + Index Cond: (unique1 = 2) + -> Materialize (work_mem=N kB) + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique2 = 3) OR (unique1 = 1)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique2 (work_mem=N kB) + Index Cond: (unique2 = 3) + -> Bitmap Index Scan on tenk1_unique1 (work_mem=N kB) + Index Cond: (unique1 = 1) + Total Working Memory: N kB +(19 rows) + +select count(*) from ( +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or (a.unique2 = 3 and b.hundred = 4) +); + count +------- + 101 +(1 row) + +-- InitPlan with hash table ("IN SELECT") +select workmem_filter(' +explain (costs off, work_mem on) +select ''foo''::text in (select ''bar''::name union all select ''bar''::name); +'); + workmem_filter +---------------------------- + Result (work_mem=N kB) + SubPlan 1 + -> Append + -> Result + -> Result + Total Working Memory: N kB +(6 rows) + +select 'foo'::text in (select 'bar'::name union all select 'bar'::name); + ?column? 
+---------- + f +(1 row) + +-- SubPlan with hash table +select workmem_filter(' +explain (costs off, work_mem on) +select 1 = any (select (select 1) where 1 = any (select 1)); +'); + workmem_filter +---------------------------------------------------------------- + Result (work_mem=N kB) + SubPlan 3 + -> Result (work_mem=N kB) + One-Time Filter: (ANY (1 = (hashed SubPlan 2).col1)) + InitPlan 1 + -> Result + SubPlan 2 + -> Result + Total Working Memory: N kB +(9 rows) + +select 1 = any (select (select 1) where 1 = any (select 1)); + ?column? +---------- + t +(1 row) + diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 37b6d21e1f9..1089e3bdf96 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -119,7 +119,7 @@ test: plancache limit plpgsql copy2 temp domain rangefuncs prepare conversion tr # The stats test resets stats, so nothing else needing stats access can be in # this group. # ---------- -test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate +test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate workmem # event_trigger depends on create_am and cannot run concurrently with # any test that runs DDL diff --git a/src/test/regress/sql/workmem.sql b/src/test/regress/sql/workmem.sql new file mode 100644 index 00000000000..5878f2aa4c4 --- /dev/null +++ b/src/test/regress/sql/workmem.sql @@ -0,0 +1,303 @@ +---- +-- Tests that show "work_mem" output to EXPLAIN plans. +---- + +-- Note: Function derived from file explain.sql. We can't use that other +-- function, since we're run in parallel with explain.sql. 
+create or replace function workmem_filter(text) returns setof text +language plpgsql as +$$ +declare + ln text; +begin + for ln in execute $1 + loop + -- Mask out work_mem estimate, since it might be brittle + ln := regexp_replace(ln, '\mwork_mem=\d+\M', 'work_mem=N', 'g'); + ln := regexp_replace(ln, '\mMemory: \d+\M', 'Memory: N', 'g'); + return next ln; + end loop; +end; +$$; + +-- Unique -> hash agg +set enable_hashagg = on; + +select workmem_filter(' +explain (costs off, work_mem on) +select * +from onek +where (unique1,ten) in (values (1,1), (20,0), (99,9), (17,99)) +order by unique1; +'); + +select * +from onek +where (unique1,ten) in (values (1,1), (20,0), (99,9), (17,99)) +order by unique1; + +reset enable_hashagg; + +-- Unique -> sort +set enable_hashagg = off; + +select workmem_filter(' +explain (costs off, work_mem on) +select * +from onek +where (unique1,ten) in (values (1,1), (20,0), (99,9), (17,99)) +order by unique1; +'); + +select * +from onek +where (unique1,ten) in (values (1,1), (20,0), (99,9), (17,99)) +order by unique1; + +reset enable_hashagg; + +-- Incremental Sort +select workmem_filter(' +explain (costs off, work_mem on) +select * from (select * from tenk1 order by four) t order by four, ten +limit 1; +'); + +select * from (select * from tenk1 order by four) t order by four, ten +limit 1; + +-- Hash Join +select workmem_filter(' +explain (costs off, work_mem on) +select count(*) from ( +select t1.unique1, t2.hundred +from onek t1, tenk1 t2 +where exists (select 1 from tenk1 t3 + where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) + and t1.unique1 < 1 +) t; +'); + +select count(*) from ( +select t1.unique1, t2.hundred +from onek t1, tenk1 t2 +where exists (select 1 from tenk1 t3 + where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) + and t1.unique1 < 1 +) t; + +-- Materialize +select workmem_filter(' +explain (costs off, work_mem on) +select count(*) from ( +select t1.f1 +from int4_tbl t1, int4_tbl t2 + left join int4_tbl 
t3 on t3.f1 > 0 + left join int4_tbl t4 on t3.f1 > 1 +where t4.f1 is null +) t; +'); + +select count(*) from ( +select t1.f1 +from int4_tbl t1, int4_tbl t2 + left join int4_tbl t3 on t3.f1 > 0 + left join int4_tbl t4 on t3.f1 > 1 +where t4.f1 is null +) t; + +-- Grouping Sets (Hash) +select workmem_filter(' +explain (costs off, work_mem on) +select a, b, row_number() over (order by a, b nulls first) +from (values (1, 1), (2, 2)) as t (a, b) where a = b +group by grouping sets((a, b), (a)); +'); + +select a, b, row_number() over (order by a, b nulls first) +from (values (1, 1), (2, 2)) as t (a, b) where a = b +group by grouping sets((a, b), (a)); + +-- Grouping Sets (Sort) +set enable_hashagg = off; + +select workmem_filter(' +explain (costs off, work_mem on) +select a, b, row_number() over (order by a, b nulls first) +from (values (1, 1, 1, 1), (2, 2, 2, 2)) as t (a, b, c, d) where a = b +group by grouping sets((a, b), (a), (b), (c), (d)); +'); + +select a, b, row_number() over (order by a, b nulls first) +from (values (1, 1, 1, 1), (2, 2, 2, 2)) as t (a, b, c, d) where a = b +group by grouping sets((a, b), (a), (b), (c), (d)); + +reset enable_hashagg; + +-- Agg (hash, parallel) +set parallel_setup_cost=0; +set parallel_tuple_cost=0; +set min_parallel_table_scan_size=0; +set max_parallel_workers_per_gather=4; + +select workmem_filter(' +explain (costs off, work_mem on) +select length(stringu1) from tenk1 group by length(stringu1); +'); + +select length(stringu1) from tenk1 group by length(stringu1); + +reset parallel_setup_cost; +reset parallel_tuple_cost; +reset min_parallel_table_scan_size; +reset max_parallel_workers_per_gather; + +-- Agg (simple) [no work_mem] +explain (costs off, work_mem on) +select MAX(length(stringu1)) from tenk1; + +select MAX(length(stringu1)) from tenk1; + +-- Function Scan +select workmem_filter(' +explain (work_mem on, costs off) +select count(*) from ( +select sum(n) over(partition by m) +from (SELECT n < 3 as m, n from 
generate_series(1,2000) a(n)) +) t; +'); + +select count(*) from ( +select sum(n) over(partition by m) +from (SELECT n < 3 as m, n from generate_series(1,2000) a(n)) +) t; + +-- Three Function Scans +select workmem_filter(' +explain (work_mem on, costs off) +select count(*) +from rows from(generate_series(1, 5), + generate_series(2, 10), + generate_series(4, 15)); +'); + +select count(*) +from rows from(generate_series(1, 5), + generate_series(2, 10), + generate_series(4, 15)); + +-- Table Function Scan +select workmem_filter(' +EXPLAIN (COSTS OFF, work_mem on) +SELECT xmltable.* + FROM (SELECT data FROM xmldata) x, + LATERAL XMLTABLE(''/ROWS/ROW'' + PASSING data + COLUMNS id int PATH ''@id'', + _id FOR ORDINALITY, + country_name text PATH ''COUNTRY_NAME'' NOT NULL, + country_id text PATH ''COUNTRY_ID'', + region_id int PATH ''REGION_ID'', + size float PATH ''SIZE'', + unit text PATH ''SIZE/@unit'', + premier_name text PATH ''PREMIER_NAME'' DEFAULT ''not specified''); +'); + +SELECT xmltable.* + FROM (SELECT data FROM xmldata) x, + LATERAL XMLTABLE('/ROWS/ROW' + PASSING data + COLUMNS id int PATH '@id', + _id FOR ORDINALITY, + country_name text PATH 'COUNTRY_NAME' NOT NULL, + country_id text PATH 'COUNTRY_ID', + region_id int PATH 'REGION_ID', + size float PATH 'SIZE', + unit text PATH 'SIZE/@unit', + premier_name text PATH 'PREMIER_NAME' DEFAULT 'not specified'); + +-- SetOp [no work_mem] +explain (costs off, work_mem on) +select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10; + +select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10; + +-- HashSetOp +select workmem_filter(' +explain (costs off, work_mem on) +select count(*) from + ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss; +'); + +select count(*) from + ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss; + +-- RecursiveUnion and Memoize (also WorkTable Scan [no work_mem]) +select workmem_filter(' +explain (costs off, 
work_mem on) +select sum(o.four), sum(ss.a) from onek o +cross join lateral (with recursive x(a) as ( + select o.four as a union select a + 1 from x where a < 10) + select * from x) ss where o.ten = 1; +'); + +select sum(o.four), sum(ss.a) from onek o +cross join lateral (with recursive x(a) as ( + select o.four as a union select a + 1 from x where a < 10) + select * from x) ss where o.ten = 1; + +-- CTE Scan +select workmem_filter(' +explain (costs off, work_mem on) +WITH q1(x,y) AS ( + SELECT hundred, sum(ten) FROM tenk1 GROUP BY hundred + ) +SELECT count(*) FROM q1 WHERE y > (SELECT sum(y)/100 FROM q1 qsub); +'); + +WITH q1(x,y) AS ( + SELECT hundred, sum(ten) FROM tenk1 GROUP BY hundred + ) +SELECT count(*) FROM q1 WHERE y > (SELECT sum(y)/100 FROM q1 qsub); + +-- WindowAgg +select workmem_filter(' +explain (costs off, work_mem on) +select sum(n) over(partition by m) +from (SELECT n < 3 as m, n from generate_series(1,2000) a(n)) +limit 5; +'); + +select sum(n) over(partition by m) +from (SELECT n < 3 as m, n from generate_series(1,2000) a(n)) +limit 5; + +-- Bitmap Heap Scan +select workmem_filter(' +explain (costs off, work_mem on) +select count(*) from ( +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or (a.unique2 = 3 and b.hundred = 4) +); +'); + +select count(*) from ( +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or (a.unique2 = 3 and b.hundred = 4) +); + +-- InitPlan with hash table ("IN SELECT") +select workmem_filter(' +explain (costs off, work_mem on) +select ''foo''::text in (select ''bar''::name union all select ''bar''::name); +'); + +select 'foo'::text in (select 'bar'::name union all select 'bar'::name); + +-- SubPlan with hash table +select workmem_filter(' +explain (costs off, work_mem on) +select 1 = any (select (select 1) where 1 = any (select 1)); +'); + +select 1 = any (select (select 1) where 1 = any (select 1)); -- 2.47.1