From bece0df3261a889a452f7b0eb1e85b58b19df9ab Mon Sep 17 00:00:00 2001 From: Sergey Soloviev Date: Wed, 3 Dec 2025 17:34:18 +0300 Subject: [PATCH v2 3/4] make use of IndexAggregate in planner and explain This commit adds usage of IndexAggregate in planner and explain (analyze). We calculate cost of IndexAggregate and add AGG_INDEX node to the pathlist. Cost of this node is cost of building B+tree (in memory), disk spill and final external merge. For EXPLAIN there is only little change - show sort information in "Group Key". --- src/backend/commands/explain.c | 101 ++++++++++++--- src/backend/optimizer/path/costsize.c | 121 ++++++++++++------ src/backend/optimizer/plan/createplan.c | 15 ++- src/backend/optimizer/plan/planner.c | 35 +++++ src/backend/optimizer/util/pathnode.c | 9 ++ src/backend/utils/misc/guc_parameters.dat | 7 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/nodes/pathnodes.h | 3 +- src/include/optimizer/cost.h | 1 + 9 files changed, 237 insertions(+), 56 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 5a6390631eb..9e16c547b06 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -134,7 +134,7 @@ static void show_recursive_union_info(RecursiveUnionState *rstate, ExplainState *es); static void show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es); -static void show_hashagg_info(AggState *aggstate, ExplainState *es); +static void show_agg_spill_info(AggState *aggstate, ExplainState *es); static void show_indexsearches_info(PlanState *planstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es); @@ -1556,6 +1556,10 @@ ExplainNode(PlanState *planstate, List *ancestors, pname = "MixedAggregate"; strategy = "Mixed"; break; + case AGG_INDEX: + pname = "IndexAggregate"; + strategy = "Indexed"; + break; default: pname = "Aggregate ???"; strategy = "???"; @@ -2200,7 +2204,7 @@ 
ExplainNode(PlanState *planstate, List *ancestors, case T_Agg: show_agg_keys(castNode(AggState, planstate), ancestors, es); show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); - show_hashagg_info((AggState *) planstate, es); + show_agg_spill_info((AggState *) planstate, es); if (plan->qual) show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); @@ -2631,6 +2635,24 @@ show_agg_keys(AggState *astate, List *ancestors, if (plan->groupingSets) show_grouping_sets(outerPlanState(astate), plan, ancestors, es); + else if (plan->aggstrategy == AGG_INDEX) + { + Sort *sort = astate->index_sort; + + /* + * Index Agg reorders GROUP BY keys to match ORDER BY + * so they must be the same, but we should show other + * useful information about used ordering, such as direction. + */ + Assert(sort != NULL); + show_sort_group_keys(outerPlanState(astate), "Group Key", + plan->numCols, 0, + sort->sortColIdx, + sort->sortOperators, + sort->collations, + sort->nullsFirst, + ancestors, es); + } else show_sort_group_keys(outerPlanState(astate), "Group Key", plan->numCols, 0, plan->grpColIdx, @@ -3735,47 +3757,67 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) } /* - * Show information on hash aggregate memory usage and batches. + * Show information on hash or index aggregate memory usage and batches. 
*/ static void -show_hashagg_info(AggState *aggstate, ExplainState *es) +show_agg_spill_info(AggState *aggstate, ExplainState *es) { Agg *agg = (Agg *) aggstate->ss.ps.plan; - int64 memPeakKb = BYTES_TO_KILOBYTES(aggstate->hash_mem_peak); + int64 memPeakKb = BYTES_TO_KILOBYTES(aggstate->spill_mem_peak); if (agg->aggstrategy != AGG_HASHED && - agg->aggstrategy != AGG_MIXED) + agg->aggstrategy != AGG_MIXED && + agg->aggstrategy != AGG_INDEX) return; if (es->format != EXPLAIN_FORMAT_TEXT) { if (es->costs) ExplainPropertyInteger("Planned Partitions", NULL, - aggstate->hash_planned_partitions, es); + aggstate->spill_planned_partitions, es); /* * During parallel query the leader may have not helped out. We * detect this by checking how much memory it used. If we find it * didn't do any work then we don't show its properties. */ - if (es->analyze && aggstate->hash_mem_peak > 0) + if (es->analyze && aggstate->spill_mem_peak > 0) { ExplainPropertyInteger("HashAgg Batches", NULL, - aggstate->hash_batches_used, es); + aggstate->spill_batches_used, es); ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); ExplainPropertyInteger("Disk Usage", "kB", - aggstate->hash_disk_used, es); + aggstate->spill_disk_used, es); + } + + if ( es->analyze + && aggstate->aggstrategy == AGG_INDEX + && aggstate->mergestate != NULL) + { + TuplesortInstrumentation stats; + const char *mergeMethod; + const char *spaceType; + int64 spaceUsed; + + tuplesort_get_stats(aggstate->mergestate, &stats); + mergeMethod = tuplesort_method_name(stats.sortMethod); + spaceType = tuplesort_space_type_name(stats.spaceType); + spaceUsed = stats.spaceUsed; + + ExplainPropertyText("Merge Method", mergeMethod, es); + ExplainPropertyInteger("Merge Space Used", "kB", spaceUsed, es); + ExplainPropertyText("Merge Space Type", spaceType, es); } } else { bool gotone = false; - if (es->costs && aggstate->hash_planned_partitions > 0) + if (es->costs && aggstate->spill_planned_partitions > 0) { 
ExplainIndentText(es); appendStringInfo(es->str, "Planned Partitions: %d", - aggstate->hash_planned_partitions); + aggstate->spill_planned_partitions); gotone = true; } @@ -3784,7 +3826,7 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) * detect this by checking how much memory it used. If we find it * didn't do any work then we don't show its properties. */ - if (es->analyze && aggstate->hash_mem_peak > 0) + if (es->analyze && aggstate->spill_mem_peak > 0) { if (!gotone) ExplainIndentText(es); @@ -3792,17 +3834,44 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) appendStringInfoSpaces(es->str, 2); appendStringInfo(es->str, "Batches: %d Memory Usage: " INT64_FORMAT "kB", - aggstate->hash_batches_used, memPeakKb); + aggstate->spill_batches_used, memPeakKb); gotone = true; /* Only display disk usage if we spilled to disk */ - if (aggstate->hash_batches_used > 1) + if (aggstate->spill_batches_used > 1) { appendStringInfo(es->str, " Disk Usage: " UINT64_FORMAT "kB", - aggstate->hash_disk_used); + aggstate->spill_disk_used); } } + /* For index aggregate show stats for final merging */ + if ( es->analyze + && aggstate->aggstrategy == AGG_INDEX + && aggstate->mergestate != NULL) + { + TuplesortInstrumentation stats; + const char *mergeMethod; + const char *spaceType; + int64 spaceUsed; + + tuplesort_get_stats(aggstate->mergestate, &stats); + mergeMethod = tuplesort_method_name(stats.sortMethod); + spaceType = tuplesort_space_type_name(stats.spaceType); + spaceUsed = stats.spaceUsed; + + /* + * If we are here that means that previous check (for mem peak) was + * successful (cannot directly go to merge without any in-memory + * operations). Do not check other state and just start a new line.
+ */ + appendStringInfoChar(es->str, '\n'); + ExplainIndentText(es); + appendStringInfo(es->str, "Merge Method: %s %s: " INT64_FORMAT "kB", + mergeMethod, spaceType, spaceUsed); + gotone = true; + } + if (gotone) appendStringInfoChar(es->str, '\n'); } diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index a39cc793b4d..a0af4e76f42 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -150,6 +150,7 @@ bool enable_tidscan = true; bool enable_sort = true; bool enable_incremental_sort = true; bool enable_hashagg = true; +bool enable_indexagg = true; bool enable_nestloop = true; bool enable_material = true; bool enable_memoize = true; @@ -1848,6 +1849,32 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) rterm->pathtarget->width); } +/* + * cost_tuplemerge + * Determines and returns the cost of external merge used in tuplesort. + */ +static void +cost_tuplemerge(double availMem, double input_bytes, double ntuples, + Cost comparison_cost, Cost *cost) +{ + double npages = ceil(input_bytes / BLCKSZ); + double nruns = input_bytes / availMem; + double mergeorder = tuplesort_merge_order(availMem); + double log_runs; + double npageaccesses; + + /* Compute logM(r) as log(r) / log(M) */ + if (nruns > mergeorder) + log_runs = ceil(log(nruns) / log(mergeorder)); + else + log_runs = 1.0; + + npageaccesses = 2.0 * npages * log_runs; + + /* Assume 3/4ths of accesses are sequential, 1/4th are not */ + *cost += npageaccesses * (seq_page_cost * 0.75 + random_page_cost * 0.25); +} + /* * cost_tuplesort * Determines and returns the cost of sorting a relation using tuplesort, @@ -1922,11 +1949,6 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost, /* * We'll have to use a disk-based sort of all the tuples */ - double npages = ceil(input_bytes / BLCKSZ); - double nruns = input_bytes / sort_mem_bytes; - double mergeorder = tuplesort_merge_order(sort_mem_bytes); - double log_runs; - double 
npageaccesses; /* * CPU costs @@ -1936,16 +1958,8 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost, *startup_cost = comparison_cost * tuples * LOG2(tuples); /* Disk costs */ - - /* Compute logM(r) as log(r) / log(M) */ - if (nruns > mergeorder) - log_runs = ceil(log(nruns) / log(mergeorder)); - else - log_runs = 1.0; - npageaccesses = 2.0 * npages * log_runs; - /* Assume 3/4ths of accesses are sequential, 1/4th are not */ - *startup_cost += npageaccesses * - (seq_page_cost * 0.75 + random_page_cost * 0.25); + cost_tuplemerge(sort_mem_bytes, input_bytes, tuples, comparison_cost, + startup_cost); } else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes) { @@ -2770,7 +2784,7 @@ cost_agg(Path *path, PlannerInfo *root, total_cost += cpu_tuple_cost * numGroups; output_tuples = numGroups; } - else + else if (aggstrategy == AGG_HASHED) { /* must be AGG_HASHED */ startup_cost = input_total_cost; @@ -2788,6 +2802,46 @@ cost_agg(Path *path, PlannerInfo *root, total_cost += cpu_tuple_cost * numGroups; output_tuples = numGroups; } + else + { + /* must be AGG_INDEX */ + startup_cost = input_total_cost; + if (!enable_indexagg) + ++disabled_nodes; + + startup_cost += aggcosts->transCost.startup; + startup_cost += aggcosts->transCost.per_tuple * input_tuples; + /* cost of btree comparison */ + startup_cost += input_tuples * (2.0 * cpu_operator_cost * numGroupCols); + startup_cost += aggcosts->finalCost.startup; + + total_cost = startup_cost; + total_cost += aggcosts->finalCost.per_tuple * numGroups; + /* cost of retrieving from index */ + total_cost += cpu_tuple_cost * numGroups; + output_tuples = numGroups; + } + + /* + * If there are quals (HAVING quals), account for their cost and + * selectivity. Process it before disk spill logic, because output + * cardinality is required for AGG_INDEX. 
+ */ + if (quals) + { + QualCost qual_cost; + + cost_qual_eval(&qual_cost, quals, root); + startup_cost += qual_cost.startup; + total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple; + + output_tuples = clamp_row_est(output_tuples * + clauselist_selectivity(root, + quals, + 0, + JOIN_INNER, + NULL)); + } /* * Add the disk costs of hash aggregation that spills to disk. @@ -2802,7 +2856,7 @@ cost_agg(Path *path, PlannerInfo *root, * Accrue writes (spilled tuples) to startup_cost and to total_cost; * accrue reads only to total_cost. */ - if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED) + if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED || aggstrategy == AGG_INDEX) { double pages; double pages_written = 0.0; @@ -2823,8 +2877,8 @@ cost_agg(Path *path, PlannerInfo *root, hashentrysize = hash_agg_entry_size(list_length(root->aggtransinfos), input_width, aggcosts->transitionSpace); - hash_agg_set_limits(hashentrysize, numGroups, 0, &mem_limit, - &ngroups_limit, &num_partitions); + agg_set_limits(hashentrysize, numGroups, 0, &mem_limit, + &ngroups_limit, &num_partitions); nbatches = Max((numGroups * hashentrysize) / mem_limit, numGroups / ngroups_limit); @@ -2861,26 +2915,21 @@ cost_agg(Path *path, PlannerInfo *root, spill_cost = depth * input_tuples * 2.0 * cpu_tuple_cost; startup_cost += spill_cost; total_cost += spill_cost; - } - /* - * If there are quals (HAVING quals), account for their cost and - * selectivity. 
- */ - if (quals) - { - QualCost qual_cost; + /* IndexAgg requires final external merge stage */ + if (aggstrategy == AGG_INDEX) + { + double output_bytes; + Cost comparison_cost; - cost_qual_eval(&qual_cost, quals, root); - startup_cost += qual_cost.startup; - total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple; + /* size of all projected tuples */ + output_bytes = path->pathtarget->width * output_tuples; + /* default comparison cost */ + comparison_cost = 2.0 * cpu_operator_cost; - output_tuples = clamp_row_est(output_tuples * - clauselist_selectivity(root, - quals, - 0, - JOIN_INNER, - NULL)); + cost_tuplemerge(work_mem, output_bytes, output_tuples, + comparison_cost, &startup_cost); + } } path->rows = output_tuples; diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index bc417f93840..de9bb1ef30b 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -2158,6 +2158,8 @@ create_agg_plan(PlannerInfo *root, AggPath *best_path) Plan *subplan; List *tlist; List *quals; + List *chain; + AttrNumber *grpColIdx; /* * Agg can project, so no need to be terribly picky about child tlist, but @@ -2169,17 +2171,24 @@ create_agg_plan(PlannerInfo *root, AggPath *best_path) quals = order_qual_clauses(root, best_path->qual); + grpColIdx = extract_grouping_cols(best_path->groupClause, subplan->targetlist); + + /* For index aggregation we should consider the desired sorting order. 
*/ + if (best_path->aggstrategy == AGG_INDEX) + chain = list_make1(make_sort_from_groupcols(best_path->groupClause, grpColIdx, subplan)); + else + chain = NIL; + plan = make_agg(tlist, quals, best_path->aggstrategy, best_path->aggsplit, list_length(best_path->groupClause), - extract_grouping_cols(best_path->groupClause, - subplan->targetlist), + grpColIdx, extract_grouping_ops(best_path->groupClause), extract_grouping_collations(best_path->groupClause, subplan->targetlist), NIL, - NIL, + chain, best_path->numGroups, best_path->transitionSpace, subplan); diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 8b22c30559b..cfd2f3ff3a9 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -3877,6 +3877,21 @@ create_grouping_paths(PlannerInfo *root, (gd ? gd->any_hashable : grouping_is_hashable(root->processed_groupClause)))) flags |= GROUPING_CAN_USE_HASH; + /* + * Determine whether we should consider index-based implementation of + * grouping. + * + * This is more restrictive since it not only must be sortable (for + * purposes of Btree), but also must be hashable, so we can effectively + * spill tuples and later process each batch. + */ + if ( gd == NULL + && root->numOrderedAggs == 0 + && parse->groupClause != NIL + && grouping_is_sortable(root->processed_groupClause) + && grouping_is_hashable(root->processed_groupClause)) + flags |= GROUPING_CAN_USE_INDEX; + /* * Determine whether partial aggregation is possible. 
*/ @@ -7108,6 +7123,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, ListCell *lc; bool can_hash = (extra->flags & GROUPING_CAN_USE_HASH) != 0; bool can_sort = (extra->flags & GROUPING_CAN_USE_SORT) != 0; + bool can_index = (extra->flags & GROUPING_CAN_USE_INDEX) != 0; List *havingQual = (List *) extra->havingQual; AggClauseCosts *agg_final_costs = &extra->agg_final_costs; double dNumGroups = 0; @@ -7329,6 +7345,25 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, } } + if (can_index) + { + /* + * Generate IndexAgg path. + */ + Assert(!parse->groupingSets); + add_path(grouped_rel, (Path *) + create_agg_path(root, + grouped_rel, + cheapest_path, + grouped_rel->reltarget, + AGG_INDEX, + AGGSPLIT_SIMPLE, + root->processed_groupClause, + havingQual, + agg_costs, + dNumGroups)); + } + /* * When partitionwise aggregate is used, we might have fully aggregated * paths in the partial pathlist, because add_paths_to_append_rel() will diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index b6be4ddbd01..2bac26055a7 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -3030,6 +3030,15 @@ create_agg_path(PlannerInfo *root, else pathnode->path.pathkeys = subpath->pathkeys; /* preserves order */ } + else if (aggstrategy == AGG_INDEX) + { + /* + * When using index aggregation all grouping columns will be used as + * comparator keys, so output is always sorted. 
+ */ + pathnode->path.pathkeys = make_pathkeys_for_sortclauses(root, groupClause, + root->processed_tlist); + } else pathnode->path.pathkeys = NIL; /* output is unordered */ diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 3b9d8349078..776ccd9e2fd 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -868,6 +868,13 @@ boot_val => 'true', }, +{ name => 'enable_indexagg', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of index aggregation plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_indexagg', + boot_val => 'true', +}, + { name => 'enable_indexonlyscan', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', short_desc => 'Enables the planner\'s use of index-only-scan plans.', flags => 'GUC_EXPLAIN', diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index dc9e2255f8a..307b9ee660d 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -410,6 +410,7 @@ #enable_hashagg = on #enable_hashjoin = on #enable_incremental_sort = on +#enable_indexagg = on #enable_indexscan = on #enable_indexonlyscan = on #enable_material = on diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 46a8655621d..f4b2d35b1d9 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -3518,7 +3518,8 @@ typedef struct JoinPathExtraData */ #define GROUPING_CAN_USE_SORT 0x0001 #define GROUPING_CAN_USE_HASH 0x0002 -#define GROUPING_CAN_PARTIAL_AGG 0x0004 +#define GROUPING_CAN_USE_INDEX 0x0004 +#define GROUPING_CAN_PARTIAL_AGG 0x0008 /* * What kind of partitionwise aggregation is in use? 
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index b523bcda8f3..5d03b5971bd 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -57,6 +57,7 @@ extern PGDLLIMPORT bool enable_tidscan; extern PGDLLIMPORT bool enable_sort; extern PGDLLIMPORT bool enable_incremental_sort; extern PGDLLIMPORT bool enable_hashagg; +extern PGDLLIMPORT bool enable_indexagg; extern PGDLLIMPORT bool enable_nestloop; extern PGDLLIMPORT bool enable_material; extern PGDLLIMPORT bool enable_memoize; -- 2.43.0