From 6e9e693b6d2f8899774c1c798f634cdf60e0f286 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 6 Feb 2026 14:51:23 -0500 Subject: [PATCH v55 2/2] Consider startup cost as a figure of merit for partial paths. Previously, the comments stated that there was no purpose to considering startup cost for partial paths, but this is not the case: it's perfectly reasonable to want a fast-start path for a plan that involves a LIMIT (perhaps over an aggregate, so that there is enough data being processed to justify parallel query and yet we don't want all the result rows). Accordingly, rewrite add_partial_path and add_partial_path_precheck to consider startup costs. This also fixes an independent bug in add_partial_path_precheck: commit e22253467942fdb100087787c3e1e3a8620c54b2 failed to update it to do anything with the new disabled_nodes field. That bug fix is formally separate from the rest of this patch and could be committed separately, but I think it makes more sense to fix both issues together, because then we can (as this commit does) just make add_partial_path_precheck do the cost comparisons in the same way as compare_path_costs_fuzzily, which hopefully reduces the chances of ending up with something that's still incorrect. This patch is based on earlier work on this topic by Tomas Vondra, but I have rewritten a great deal of it. 
Co-authored-by: Robert Haas Co-authored-by: Tomas Vondra --- src/backend/optimizer/path/joinpath.c | 3 + src/backend/optimizer/util/pathnode.c | 159 +++++++++++------- src/include/optimizer/pathnode.h | 2 +- .../regress/expected/incremental_sort.out | 18 +- src/test/regress/expected/join_hash.out | 13 +- src/test/regress/sql/join_hash.sql | 10 +- 6 files changed, 122 insertions(+), 83 deletions(-) diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 1e4246b49d5..909584ebc21 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -1048,6 +1048,7 @@ try_partial_nestloop_path(PlannerInfo *root, initial_cost_nestloop(root, &workspace, jointype, nestloop_subtype, outer_path, inner_path, extra); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, pathkeys)) return; @@ -1237,6 +1238,7 @@ try_partial_mergejoin_path(PlannerInfo *root, extra); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, pathkeys)) return; @@ -1369,6 +1371,7 @@ try_partial_hashjoin_path(PlannerInfo *root, initial_cost_hashjoin(root, &workspace, jointype, hashclauses, outer_path, inner_path, extra, parallel_hash); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, NIL)) return; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index aab2570f159..457dacedceb 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -777,10 +777,9 @@ add_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * * Because we don't consider parameterized paths here, we also don't * need to consider the row counts as a measure of quality: every path will - * produce the same number of rows. 
Neither do we need to consider startup - * costs: parallelism is only used for plans that will be run to completion. - * Therefore, this routine is much simpler than add_path: it needs to - * consider only disabled nodes, pathkeys and total cost. + * produce the same number of rows. However, we do need to consider the + * startup costs: this partial path could be used beneath a Limit node, + * so a fast-start plan could be correct. * * As with add_path, we pfree paths that are found to be dominated by * another partial path; this requires that there be no other references to @@ -818,52 +817,36 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path) /* Compare pathkeys. */ keyscmp = compare_pathkeys(new_path->pathkeys, old_path->pathkeys); - /* Unless pathkeys are incompatible, keep just one of the two paths. */ + /* + * Unless pathkeys are incompatible, see if one of the paths dominates + * the other (both in startup and total cost). It may happen that one + * path has lower startup cost, the other has lower total cost. + */ if (keyscmp != PATHKEYS_DIFFERENT) { - if (unlikely(new_path->disabled_nodes != old_path->disabled_nodes)) + PathCostComparison costcmp; + + /* + * Do a fuzzy cost comparison with standard fuzziness limit. + */ + costcmp = compare_path_costs_fuzzily(new_path, old_path, + STD_FUZZ_FACTOR); + if (costcmp == COSTS_BETTER1) { - if (new_path->disabled_nodes > old_path->disabled_nodes) - accept_new = false; - else + if (keyscmp == PATHKEYS_BETTER1) remove_old = true; } - else if (new_path->total_cost > old_path->total_cost - * STD_FUZZ_FACTOR) + else if (costcmp == COSTS_BETTER2) { - /* New path costs more; keep it only if pathkeys are better. */ - if (keyscmp != PATHKEYS_BETTER1) + if (keyscmp == PATHKEYS_BETTER2) accept_new = false; } - else if (old_path->total_cost > new_path->total_cost - * STD_FUZZ_FACTOR) + else if (costcmp == COSTS_EQUAL) { - /* Old path costs more; keep it only if pathkeys are better. 
*/ - if (keyscmp != PATHKEYS_BETTER2) + if (keyscmp == PATHKEYS_BETTER1) remove_old = true; - } - else if (keyscmp == PATHKEYS_BETTER1) - { - /* Costs are about the same, new path has better pathkeys. */ - remove_old = true; - } - else if (keyscmp == PATHKEYS_BETTER2) - { - /* Costs are about the same, old path has better pathkeys. */ - accept_new = false; - } - else if (old_path->total_cost > new_path->total_cost * 1.0000000001) - { - /* Pathkeys are the same, and the old path costs more. */ - remove_old = true; - } - else - { - /* - * Pathkeys are the same, and new path isn't materially - * cheaper. - */ - accept_new = false; + else if (keyscmp == PATHKEYS_BETTER2) + accept_new = false; } } @@ -914,16 +897,16 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path) * add_partial_path_precheck * Check whether a proposed new partial path could possibly get accepted. * - * Unlike add_path_precheck, we can ignore startup cost and parameterization, - * since they don't matter for partial paths (see add_partial_path). But - * we do want to make sure we don't add a partial path if there's already - * a complete path that dominates it, since in that case the proposed path - * is surely a loser. + * Unlike add_path_precheck, we can ignore parameterization, since it doesn't + * matter for partial paths (see add_partial_path). But we do want to make + * sure we don't add a partial path if there's already a complete path that + * dominates it, since in that case the proposed path is surely a loser. */ bool add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, - Cost total_cost, List *pathkeys) + Cost startup_cost, Cost total_cost, List *pathkeys) { + bool consider_startup = parent_rel->consider_startup; ListCell *p1; /* @@ -933,25 +916,80 @@ add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * is clearly superior to some existing partial path -- at least, modulo * final cost computations. If so, we definitely want to consider it. 
* - * Unlike add_path(), we always compare pathkeys here. This is because we - * expect partial_pathlist to be very short, and getting a definitive + * Unlike add_path(), we never try to exit this loop early. This is because + * we expect partial_pathlist to be very short, and getting a definitive * answer at this stage avoids the need to call add_path_precheck. */ foreach(p1, parent_rel->partial_pathlist) { Path *old_path = (Path *) lfirst(p1); + PathCostComparison costcmp; PathKeysComparison keyscmp; - keyscmp = compare_pathkeys(pathkeys, old_path->pathkeys); - if (keyscmp != PATHKEYS_DIFFERENT) + /* + * First, compare costs and disabled nodes. This logic should be + * identical to compare_path_costs_fuzzily, except that one of the + * paths hasn't been created yet, and the fuzz factor is always + * STD_FUZZ_FACTOR. + */ + if (unlikely(old_path->disabled_nodes != disabled_nodes)) { - if (total_cost > old_path->total_cost * STD_FUZZ_FACTOR && - keyscmp != PATHKEYS_BETTER1) - return false; - if (old_path->total_cost > total_cost * STD_FUZZ_FACTOR && - keyscmp != PATHKEYS_BETTER2) - return true; + if (disabled_nodes < old_path->disabled_nodes) + costcmp = COSTS_BETTER1; + else + costcmp = COSTS_BETTER2; + } + else if (total_cost > old_path->total_cost * STD_FUZZ_FACTOR) + { + if (consider_startup && + old_path->startup_cost > startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_DIFFERENT; + else + costcmp = COSTS_BETTER2; + } + else if (old_path->total_cost > total_cost * STD_FUZZ_FACTOR) + { + if (consider_startup && + startup_cost > old_path->startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_DIFFERENT; + else + costcmp = COSTS_BETTER1; } + else if (startup_cost > old_path->startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_BETTER2; + else if (old_path->startup_cost > startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_BETTER1; + else + costcmp = COSTS_EQUAL; + + /* + * If one path wins on startup cost and the other on total cost, we + * can't say for sure which is 
better. + */ + if (costcmp == COSTS_DIFFERENT) + continue; + + /* + * If the two paths have different pathkeys, we can't say for sure + * which is better. + */ + keyscmp = compare_pathkeys(pathkeys, old_path->pathkeys); + if (keyscmp == PATHKEYS_DIFFERENT) + continue; + + /* + * If the existing path is cheaper and the pathkeys are equal or worse, + * the new path is not interesting. + */ + if (costcmp == COSTS_BETTER2 && keyscmp != PATHKEYS_BETTER1) + return false; + + /* + * If the new path is cheaper and the pathkeys are equal or better, + * it is definitely interesting. + */ + if (costcmp == COSTS_BETTER1 && keyscmp != PATHKEYS_BETTER2) + return true; } /* @@ -959,14 +997,9 @@ add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * clearly good enough that it might replace one. Compare it to * non-parallel plans. If it loses even before accounting for the cost of * the Gather node, we should definitely reject it. - * - * Note that we pass the total_cost to add_path_precheck twice. This is - * because it's never advantageous to consider the startup cost of a - * partial path; the resulting plans, if run in parallel, will be run to - * completion. 
*/ - if (!add_path_precheck(parent_rel, disabled_nodes, total_cost, total_cost, - pathkeys, NULL)) + if (!add_path_precheck(parent_rel, disabled_nodes, startup_cost, + total_cost, pathkeys, NULL)) return false; return true; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 224750859c3..7abc919d894 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -41,7 +41,7 @@ extern bool add_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, List *pathkeys, Relids required_outer); extern void add_partial_path(RelOptInfo *parent_rel, Path *new_path); extern bool add_partial_path_precheck(RelOptInfo *parent_rel, - int disabled_nodes, + int disabled_nodes, Cost startup_cost, Cost total_cost, List *pathkeys); extern Path *create_seqscan_path(PlannerInfo *root, RelOptInfo *rel, diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out index fdec5b9ba52..29090dca1ba 100644 --- a/src/test/regress/expected/incremental_sort.out +++ b/src/test/regress/expected/incremental_sort.out @@ -1450,21 +1450,23 @@ explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1 set enable_incremental_sort = on; explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1; - QUERY PLAN ----------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------- Limit -> Incremental Sort Sort Key: a, b, (sum(c)) Presorted Key: a, b - -> GroupAggregate + -> Finalize GroupAggregate Group Key: a, b -> Gather Merge Workers Planned: 2 - -> Incremental Sort - Sort Key: a, b - Presorted Key: a - -> Parallel Index Scan using t_a_idx on t -(12 rows) + -> Partial GroupAggregate + Group Key: a, b + -> Incremental Sort + Sort Key: a, b + Presorted Key: a + -> Parallel Index Scan using t_a_idx on t +(14 rows) -- Incremental sort vs. 
set operations with varno 0 set enable_hashagg to off; diff --git a/src/test/regress/expected/join_hash.out b/src/test/regress/expected/join_hash.out index 4749f6ed70d..bc7cc76467f 100644 --- a/src/test/regress/expected/join_hash.out +++ b/src/test/regress/expected/join_hash.out @@ -76,8 +76,8 @@ insert into extremely_skewed update pg_class set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 where relname = 'extremely_skewed'; --- Make a relation with a couple of enormous tuples. -create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t; +-- Make a relation with several enormous tuples. +create table wide as select generate_series(1, 3) as id, rpad('', 320000, 'x') as t; alter table wide set (parallel_workers = 2); -- The "optimal" case: the hash table fits in memory; we plan for 1 -- batch, we stick to that number, and peak memory usage stays within @@ -922,7 +922,7 @@ set work_mem = '128kB'; set hash_mem_multiplier = 1.0; explain (costs off) select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); QUERY PLAN ---------------------------------------------------------------- Finalize Aggregate @@ -934,10 +934,11 @@ explain (costs off) -> Parallel Seq Scan on wide -> Parallel Hash -> Parallel Seq Scan on wide wide_1 -(9 rows) + Filter: (id < 3) +(10 rows) select length(max(s.t)) -from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); length -------- 320000 @@ -947,7 +948,7 @@ select final > 1 as multibatch from hash_join_batches( $$ select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); $$); 
multibatch ------------ diff --git a/src/test/regress/sql/join_hash.sql b/src/test/regress/sql/join_hash.sql index 49d3fd61856..53db1754bb2 100644 --- a/src/test/regress/sql/join_hash.sql +++ b/src/test/regress/sql/join_hash.sql @@ -83,8 +83,8 @@ update pg_class set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 where relname = 'extremely_skewed'; --- Make a relation with a couple of enormous tuples. -create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t; +-- Make a relation with several enormous tuples. +create table wide as select generate_series(1, 3) as id, rpad('', 320000, 'x') as t; alter table wide set (parallel_workers = 2); -- The "optimal" case: the hash table fits in memory; we plan for 1 @@ -496,14 +496,14 @@ set work_mem = '128kB'; set hash_mem_multiplier = 1.0; explain (costs off) select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); select length(max(s.t)) -from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); select final > 1 as multibatch from hash_join_batches( $$ select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); $$); rollback to settings; -- 2.51.0