From 3f4247b23175388f8c6ee43740fb641d97e39d0b Mon Sep 17 00:00:00 2001 From: "Andrey V. Lepikhov" Date: Tue, 24 May 2022 15:59:02 +0500 Subject: [PATCH] Transform correlated subquery of type N-J [1] into ordinary join query. With many restrictions, check each subquery and pull up expressions, references upper query block. Works for operators '=' and 'IN'. Machinery of converting ANY subquery to JOIN stays the same with minor changes in walker function. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [1] Kim, Won. “On optimizing an SQL-like nested query.” ACM Trans. Database Syst. 7 (1982): 443-469. --- src/backend/optimizer/plan/subselect.c | 307 +++++++++++++++++++- src/backend/optimizer/util/tlist.c | 2 +- src/backend/optimizer/util/var.c | 8 + src/backend/utils/misc/guc.c | 10 + src/include/optimizer/optimizer.h | 1 + src/include/optimizer/tlist.h | 1 + src/test/regress/expected/prepare.out | 18 ++ src/test/regress/expected/subselect.out | 369 ++++++++++++++++++++++++ src/test/regress/sql/prepare.sql | 7 + src/test/regress/sql/subselect.sql | 140 +++++++++ 10 files changed, 855 insertions(+), 8 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 92e3338584..0e7ddc4a4e 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -32,6 +32,7 @@ #include "optimizer/planner.h" #include "optimizer/prep.h" #include "optimizer/subselect.h" +#include "optimizer/tlist.h" #include "parser/parse_relation.h" #include "rewrite/rewriteManip.h" #include "utils/builtins.h" @@ -65,6 +66,8 @@ typedef struct inline_cte_walker_context } inline_cte_walker_context; +bool optimize_correlated_subqueries = true; + static Node *build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot, List *plan_params, SubLinkType subLinkType, int subLinkId, @@ -1229,6 +1232,277 @@ inline_cte_walker(Node *node, inline_cte_walker_context *context) return expression_tree_walker(node, inline_cte_walker, context); } +static bool +contain_placeholders(Node *node, inline_cte_walker_context *context) +{ + if (node == NULL) + return false; + + if (IsA(node, Query)) + return query_tree_walker((Query *) node, contain_placeholders, context, 0); + else if (IsA(node, PlaceHolderVar)) + { + return true; + } + + return expression_tree_walker(node, contain_placeholders, context); +} + +/* + * To be pullable all clauses of flattening correlated subquery should be + * anded and mergejoinable (XXX: really necessary?) + */ +static bool +quals_is_pullable(Node *quals) +{ + if (!contain_vars_of_level(quals, 1)) + return true; + + if (quals && IsA(quals, OpExpr)) + { + OpExpr *expr = (OpExpr *) quals; + Node *leftarg; + + /* Contains only one expression */ + leftarg = linitial(expr->args); + if (!op_mergejoinable(expr->opno, exprType(leftarg))) /* Is it really necessary ? */ + return false; + + if (contain_placeholders(quals, NULL)) + return false; + + return true; + } + else if (is_andclause(quals)) + { + ListCell *l; + + foreach(l, ((BoolExpr *) quals)->args) + { + Node *andarg = (Node *) lfirst(l); + + if (!IsA(andarg, OpExpr)) + return false; + if (!quals_is_pullable(andarg)) + return false; + } + + return true; + } + + return false; +} + +typedef struct +{ + Query *subquery; + int newvarno; + List *pulling_quals; + bool varlevel_up; +} correlated_t; + +static Node * +pull_subquery_clauses_mutator(Node *node, correlated_t *ctx) +{ + if (node == NULL) + return NULL; + + if (IsA(node, OpExpr) && !ctx->varlevel_up) + { + if (!contain_vars_of_level(node, 1)) + return node; + + /* + * The expression contains links to upper relation. It will be pulled to + * uplevel. All links into vars of upper levels must be changed. + */ + + ctx->varlevel_up = true; + ctx->pulling_quals = + lappend(ctx->pulling_quals, + expression_tree_mutator(node, + pull_subquery_clauses_mutator, + (void *) ctx)); + ctx->varlevel_up = false; + + /* Replace position of pulled expression by the 'true' value */ + return makeBoolConst(true, false); + } + if (IsA(node, PlaceHolderVar)) + { + PlaceHolderVar *phv = (PlaceHolderVar *) node; + + if (ctx->varlevel_up && phv->phlevelsup > 0) + phv->phlevelsup--; + /* fall through to recurse into argument */ + } + else if (IsA(node, RangeTblEntry)) + { + RangeTblEntry *rte = (RangeTblEntry *) node; + + if (rte->rtekind == RTE_CTE) + { + if (rte->ctelevelsup > 0 && ctx->varlevel_up) + rte->ctelevelsup--; + } + return node; + } + else if (IsA(node, Aggref)) + { + if (((Aggref *) node)->agglevelsup > 0 && ctx->varlevel_up) + ((Aggref *) node)->agglevelsup--; + return node; + } + else if (IsA(node, GroupingFunc)) + { + if (((GroupingFunc *) node)->agglevelsup > 0 && ctx->varlevel_up) + ((GroupingFunc *) node)->agglevelsup--; + return node; + } + else if (IsA(node, Var)) + { + Var *var = (Var *) node; + + Assert(ctx->varlevel_up); + + /* An upper relation variable */ + if (var->varlevelsup > 0) + { + /* + * Isn't needed to copy node or change varno because it correctly + * refers to Table Entry of a parent and already removed from + * the subquery clauses list. + */ + var->varlevelsup--; + + return (Node *) var; + } + else + { + Var *newvar; + TargetEntry *tle; + + /* + * The var refers to subquery table entry. Include a copy the var + * into the target list, if necessary. Arrange varattno of the + * new var of upper relation with a link to this entry. + */ + + /* Create a var for usage in upper relation */ + newvar = (Var *) copyObject(node); + + /* Does the var already exists in the target list? */ + tle = tlist_member_match_var(var, ctx->subquery->targetList); + + if (tle == NULL) + { + int resno = list_length(ctx->subquery->targetList) + 1; + + /* + * Target list of the subquery doesn't contain this var. Add it + * into the end of the target list and correct the link + * XXX: Maybe choose real colname here? + */ + tle = makeTargetEntry((Expr *) var, resno, "rescol", false); + ctx->subquery->targetList = lappend(ctx->subquery->targetList, + tle); + } + else + { + if (tle->resjunk) + { + /* + * Target entry exists but used as an utility entry + * (for grouping, as an example). So, revert its status to + * a full valued entry. + */ + tle->resjunk = false; + tle->resname = pstrdup("resjunkcol"); + } + } + + /* + * Set the new var to refer newly created RangeTblEntry in the upper + * query and varattno to refer at specific position in the target + * list. + */ + newvar->varno = ctx->newvarno; + newvar->varattno = tle->resno; + + return (Node *) newvar; + } + } + if (IsA(node, Query)) + return (Node *) query_tree_mutator((Query *) node, + pull_subquery_clauses_mutator, + (void *) ctx, 0); + + return expression_tree_mutator(node, pull_subquery_clauses_mutator, + (void *) ctx); +} + +static List * +pull_correlated_clauses(PlannerInfo *root, SubLink *sublink) +{ + Query *parse = root->parse; + Query *subselect = (Query *) sublink->subselect; + FromExpr *f; + correlated_t ctx = {.subquery = subselect, + .newvarno = list_length(parse->rtable) + 1, /* Looks like a hack */ + .pulling_quals = NIL, + .varlevel_up = false}; + + Assert(IsA(subselect, Query)); + + /* Use only for correlated candidates, just for optimal usage */ + Assert(contain_vars_of_level((Node *) subselect, 1)); + + if (!optimize_correlated_subqueries || + subselect->hasAggs || + subselect->hasWindowFuncs || + subselect->hasForUpdate || /* Pulling of clauses can change a number of tuples which subselect returns. */ + subselect->hasRowSecurity /* Just because of paranoid safety */ + ) + /* The feature is switched off. */ + return NULL; + + /* + * We pull up quals and arrange variable levels for expressions in WHERE + * section only. So, cut the optimization off if an upper relation links + * from another parts of the subquery are detected. + */ + if (contain_vars_of_level((Node *) subselect->cteList, 1) || + /* see comments in subselect.sql */ + contain_vars_of_level((Node *) subselect->rtable, 1) || + contain_vars_of_level((Node *) subselect->targetList, 1) || + contain_vars_of_level((Node *) subselect->returningList, 1) || + contain_vars_of_level((Node *) subselect->groupingSets, 1) || + contain_vars_of_level((Node *) subselect->distinctClause, 1) || + contain_vars_of_level((Node *) subselect->sortClause, 1) || + contain_vars_of_level((Node *) subselect->limitOffset, 1) || + contain_vars_of_level((Node *) subselect->limitCount, 1) || + contain_vars_of_level((Node *) subselect->rowMarks, 1) || + contain_vars_of_level((Node *) subselect->havingQual, 1) || + contain_vars_of_level((Node *) subselect->groupClause, 1)) + return NULL; + + f = subselect->jointree; + + if (!f || !f->quals || !quals_is_pullable(f->quals)) + /* Degenerate case */ + return NULL; + + /* + * Now, is proved that it is possible to pull up expressions with variables + * from the upper query. + * Pull up quals, containing correlated expressions. Replace its + * positions with a true boolean expression. + * It would be removed on a next planning stage. + */ + f->quals = pull_subquery_clauses_mutator(f->quals, (void *) &ctx); + + return ctx.pulling_quals; +} /* * convert_ANY_sublink_to_join: try to convert an ANY SubLink to a join @@ -1279,16 +1553,10 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, List *subquery_vars; Node *quals; ParseState *pstate; + List *pclauses = NIL; Assert(sublink->subLinkType == ANY_SUBLINK); - /* - * The sub-select must not refer to any Vars of the parent query. (Vars of - * higher levels should be okay, though.) - */ - if (contain_vars_of_level((Node *) subselect, 1)) - return NULL; - /* * The test expression must contain some Vars of the parent query, else * it's not gonna be a join. (Note that it won't have Vars referring to @@ -1310,6 +1578,17 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, if (contain_volatile_functions(sublink->testexpr)) return NULL; + /* + * The sub-select must not refer to any Vars of the parent query. (Vars of + * higher levels should be okay, though.) + * In the case of correlated subquery, jointree quals structure will be + * modified: expressions with variables from upper query moves to the + * pulled_clauses list, their places in the quals replaces by "true" value. + */ + if (contain_vars_of_level((Node *) subselect, 1) && + (pclauses = pull_correlated_clauses(root, sublink)) == NIL) + return NULL; + /* Create a dummy ParseState for addRangeTableEntryForSubquery */ pstate = make_parsestate(NULL); @@ -1348,6 +1627,20 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, */ quals = convert_testexpr(root, sublink->testexpr, subquery_vars); + /* Nested subquery with references to upper level relation. */ + if (pclauses != NIL) + { + /* Add clauses, pulled from subquery into WHERE section of the parent. */ + if (IsA(quals, BoolExpr)) + { + BoolExpr *b = (BoolExpr *) quals; + b->args = list_concat(b->args, pclauses); + } + else + quals = (Node *) make_andclause( + list_concat(list_make1(quals), pclauses)); + } + /* * And finally, build the JoinExpr node. */ diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c index 784a1af82d..5b7aee121f 100644 --- a/src/backend/optimizer/util/tlist.c +++ b/src/backend/optimizer/util/tlist.c @@ -98,7 +98,7 @@ tlist_member(Expr *node, List *targetlist) * This is needed in some cases where we can't be sure of an exact typmod * match. For safety, though, we insist on vartype match. */ -static TargetEntry * +TargetEntry * tlist_member_match_var(Var *var, List *targetlist) { ListCell *temp; diff --git a/src/backend/optimizer/util/var.c b/src/backend/optimizer/util/var.c index 7db86c39ef..54441e692b 100644 --- a/src/backend/optimizer/util/var.c +++ b/src/backend/optimizer/util/var.c @@ -461,6 +461,14 @@ contain_vars_of_level_walker(Node *node, int *sublevels_up) return true; /* abort the tree traversal and return true */ /* else fall through to check the contained expr */ } + if (IsA(node, RangeTblEntry)) + { + RangeTblEntry *rte = (RangeTblEntry *) node; + + /* Someone can call the routine on a field of Query struct */ + return range_table_entry_walker(rte, contain_vars_of_level_walker, + (void *) sublevels_up, 0); + } if (IsA(node, Query)) { /* Recurse into subselects */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 9fbbfb1be5..a2aac52a4b 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1024,6 +1024,16 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"optimize_correlated_subqueries", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("optimize_correlated_subqueries."), + NULL, + GUC_EXPLAIN | GUC_NOT_IN_SAMPLE + }, + &optimize_correlated_subqueries, + true, + NULL, NULL, NULL + }, { {"enable_indexscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of index-scan plans."), diff --git a/src/include/optimizer/optimizer.h b/src/include/optimizer/optimizer.h index 409005bae9..cdf3fdce1a 100644 --- a/src/include/optimizer/optimizer.h +++ b/src/include/optimizer/optimizer.h @@ -88,6 +88,7 @@ extern PGDLLIMPORT double parallel_tuple_cost; extern PGDLLIMPORT double parallel_setup_cost; extern PGDLLIMPORT double recursive_worktable_factor; extern PGDLLIMPORT int effective_cache_size; +extern PGDLLIMPORT bool optimize_correlated_subqueries; extern double clamp_row_est(double nrows); extern long clamp_cardinality_to_long(Cardinality x); diff --git a/src/include/optimizer/tlist.h b/src/include/optimizer/tlist.h index 04668ba1c0..7627e7f679 100644 --- a/src/include/optimizer/tlist.h +++ b/src/include/optimizer/tlist.h @@ -18,6 +18,7 @@ extern TargetEntry *tlist_member(Expr *node, List *targetlist); +extern TargetEntry *tlist_member_match_var(Var *var, List *targetlist); extern List *add_to_flat_tlist(List *tlist, List *exprs); diff --git a/src/test/regress/expected/prepare.out b/src/test/regress/expected/prepare.out index 5815e17b39..749b3faf64 100644 --- a/src/test/regress/expected/prepare.out +++ b/src/test/regress/expected/prepare.out @@ -184,6 +184,24 @@ SELECT name, statement, parameter_types, result_types FROM pg_prepared_statement | UPDATE tenk1 SET stringu1 = $2 WHERE unique1 = $1; | | (6 rows) +-- The optimization on unnesting of correlated subqueries should work +PREPARE q9(name,int) AS + SELECT * FROM tenk1 upper WHERE unique1 IN ( + SELECT sub.unique2 FROM tenk1 sub + WHERE sub.stringu1 = $1 AND sub.unique1 = upper.unique2 + $2); +EXPLAIN (COSTS OFF) EXECUTE q9('abc',2); + QUERY PLAN +----------------------------------------------------- + Nested Loop + -> HashAggregate + Group Key: sub.unique2, sub.unique1 + -> Seq Scan on tenk1 sub + Filter: (stringu1 = 'abc'::name) + -> Index Scan using tenk1_unique1 on tenk1 upper + Index Cond: (unique1 = sub.unique2) + Filter: (sub.unique1 = (unique2 + 2)) +(8 rows) + -- test DEALLOCATE ALL; DEALLOCATE ALL; SELECT name, statement, parameter_types FROM pg_prepared_statements diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 63d26d44fc..7caef83412 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -164,6 +164,35 @@ SELECT f1 AS "Correlated Field", f2 AS "Second Field" 3 | 3 (6 rows) +EXPLAIN (COSTS OFF) SELECT f1 AS "Correlated Field", f3 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN + (SELECT f2 FROM SUBSELECT_TBL WHERE CAST(upper.f2 AS float) = f3); + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Hash Join + Hash Cond: ((upper.f1 = subselect_tbl.f2) AND ((upper.f2)::double precision = subselect_tbl.f3)) + -> Seq Scan on subselect_tbl upper + -> Hash + -> HashAggregate + Group Key: subselect_tbl.f2, subselect_tbl.f3 + -> Seq Scan on subselect_tbl +(7 rows) + +-- Still doesn't work for NOT IN +EXPLAIN (COSTS OFF) SELECT f1 AS "Correlated Field", f3 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 NOT IN + (SELECT f2 FROM SUBSELECT_TBL WHERE CAST(upper.f2 AS float) = f3); + QUERY PLAN +------------------------------------------------------- + Seq Scan on subselect_tbl upper + Filter: (NOT (SubPlan 1)) + SubPlan 1 + -> Seq Scan on subselect_tbl + Filter: ((upper.f2)::double precision = f3) +(5 rows) + SELECT f1 AS "Correlated Field", f3 AS "Second Field" FROM SUBSELECT_TBL upper WHERE f1 IN @@ -177,6 +206,346 @@ SELECT f1 AS "Correlated Field", f3 AS "Second Field" 3 | 3 (5 rows) +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN + (SELECT b.f2 FROM subselect_tbl b WHERE b.f2 = a.f1) +; -- Optimizer removes excess clause + QUERY PLAN +----------------------------------------------- + Hash Join + Hash Cond: (a.f1 = b.f2) + -> Seq Scan on subselect_tbl a + -> Hash + -> HashAggregate + Group Key: b.f2, b.f2 + -> Seq Scan on subselect_tbl b +(7 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM subselect_tbl a +WHERE a.f1 IN (SELECT b.f2*b.f1/b.f3+2 FROM subselect_tbl b WHERE b.f3 = a.f2) +; -- a bit more complex targetlist expression shouldn't cut off the optimization + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------------- + Hash Join + Hash Cond: (((a.f1)::double precision = ((((b.f2 * b.f1))::double precision / b.f3) + '2'::double precision)) AND ((a.f2)::double precision = b.f3)) + -> Seq Scan on subselect_tbl a + -> Hash + -> HashAggregate + Group Key: ((((b.f2 * b.f1))::double precision / b.f3) + '2'::double precision), b.f3 + -> Seq Scan on subselect_tbl b +(7 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM subselect_tbl a +WHERE (a.f1,a.f3) IN (SELECT b.f2,b.f1 FROM subselect_tbl b WHERE b.f3 = a.f2) +; -- Two variables in a target list + QUERY PLAN +---------------------------------------------------------------------------------------------------------- + Hash Join + Hash Cond: ((a.f1 = b.f2) AND (a.f3 = (b.f1)::double precision) AND ((a.f2)::double precision = b.f3)) + -> Seq Scan on subselect_tbl a + -> Hash + -> HashAggregate + Group Key: b.f2, (b.f1)::double precision, b.f3 + -> Seq Scan on subselect_tbl b +(7 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM subselect_tbl a +WHERE (a.f1,a.f3) IN (SELECT b.f2,b.f1*2 FROM subselect_tbl b WHERE b.f3 = a.f2) +; -- Expression as an element of composite type shouldn't cut off the optimization + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- + Hash Join + Hash Cond: ((a.f1 = b.f2) AND (a.f3 = ((b.f1 * 2))::double precision) AND ((a.f2)::double precision = b.f3)) + -> Seq Scan on subselect_tbl a + -> Hash + -> HashAggregate + Group Key: b.f2, ((b.f1 * 2))::double precision, b.f3 + -> Seq Scan on subselect_tbl b +(7 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM subselect_tbl a +WHERE a.f1 IN (SELECT b.f2 FROM subselect_tbl b WHERE b.f3 = a.f2 AND a.f1 = b.f3 AND b.f3 <> 12) +; -- Two expressions with correlated variables + QUERY PLAN +------------------------------------------------------------------------- + Hash Join + Hash Cond: ((b.f2 = a.f1) AND (b.f3 = (a.f2)::double precision)) + -> HashAggregate + Group Key: b.f2, b.f3, b.f3 + -> Seq Scan on subselect_tbl b + Filter: (f3 <> '12'::double precision) + -> Hash + -> Seq Scan on subselect_tbl a + Filter: ((f2)::double precision = (f1)::double precision) +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM subselect_tbl a +WHERE a.f1 IN (SELECT b.f2 FROM subselect_tbl b WHERE b.f3 = a.f2 AND a.f2 = b.f3 AND b.f1 < 12) +; -- Two expressions with correlated variables relates on one upper variable. + QUERY PLAN +-------------------------------------------------------------------- + Hash Join + Hash Cond: ((a.f1 = b.f2) AND ((a.f2)::double precision = b.f3)) + -> Seq Scan on subselect_tbl a + -> Hash + -> HashAggregate + Group Key: b.f2, b.f3, b.f3 + -> Seq Scan on subselect_tbl b + Filter: (f1 < 12) +(8 rows) + +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT b.f2 FROM subselect_tbl b WHERE b.f2 = a.f2 AND b.f2 < 12 + GROUP BY (b.f2) + ) +; -- Pull clauses without unnesting the query. XXX: It reduces performance in most use cases, doesn't it? + QUERY PLAN +----------------------------------------- + Hash Join + Hash Cond: (b.f2 = a.f1) + -> HashAggregate + Group Key: b.f2 + -> Seq Scan on subselect_tbl b + Filter: (f2 < 12) + -> Hash + -> Seq Scan on subselect_tbl a + Filter: (f1 = f2) +(9 rows) + +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT b.f2 FROM ( + SELECT x AS f1, x+1 AS f2, x+2 AS f3 FROM generate_series(1,10) x + ) b WHERE b.f2 = a.f2 AND b.f1 BETWEEN 12 AND 14 + UNION ALL + SELECT c.f1 FROM subselect_tbl c + WHERE c.f2 = a.f2 AND c.f1 BETWEEN 12 AND 14 + ) +; -- Disallow flattening of union all + QUERY PLAN +------------------------------------------------------------------------ + Seq Scan on subselect_tbl a + Filter: (SubPlan 1) + SubPlan 1 + -> Append + -> Function Scan on generate_series x + Filter: ((x >= 12) AND (x <= 14) AND ((x + 1) = a.f2)) + -> Seq Scan on subselect_tbl c + Filter: ((f1 >= 12) AND (f1 <= 14) AND (f2 = a.f2)) +(8 rows) + +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT b.f1 FROM subselect_tbl b JOIN subselect_tbl c ON (b.f1 = c.f2) + WHERE c.f2 = a.f2 AND c.f1 BETWEEN 12 AND 14 + ) +; -- XXX: Could we flatten such subquery? + QUERY PLAN +--------------------------------------------------------------------- + Seq Scan on subselect_tbl a + Filter: (SubPlan 1) + SubPlan 1 + -> Nested Loop + -> Seq Scan on subselect_tbl c + Filter: ((f1 >= 12) AND (f1 <= 14) AND (f2 = a.f2)) + -> Seq Scan on subselect_tbl b + Filter: (f1 = a.f2) +(8 rows) + +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT b.f1 FROM subselect_tbl b, subselect_tbl c + WHERE b.f1 = c.f2 AND c.f2 = a.f2 AND c.f1 IS NOT NULL + ) +; -- TODO: Could we flatten such subquery? + QUERY PLAN +------------------------------------------------------------------ + Seq Scan on subselect_tbl a + Filter: (SubPlan 1) + SubPlan 1 + -> Nested Loop + -> Seq Scan on subselect_tbl b + Filter: (f1 = a.f2) + -> Materialize + -> Seq Scan on subselect_tbl c + Filter: ((f1 IS NOT NULL) AND (f2 = a.f2)) +(9 rows) + +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE (a.f1,f2) IN ( + SELECT b.f2, avg(f3) FROM subselect_tbl b WHERE b.f2 = a.f2 AND b.f2 < 12 + GROUP BY (b.f2) + ) +; -- Doesn't support unnesting with aggregate functions + QUERY PLAN +----------------------------------------------------- + Seq Scan on subselect_tbl a + Filter: (SubPlan 1) + SubPlan 1 + -> GroupAggregate + Group Key: b.f2 + -> Seq Scan on subselect_tbl b + Filter: ((f2 < 12) AND (f2 = a.f2)) +(7 rows) + +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + WITH cte AS ( + SELECT * FROM subselect_tbl c WHERE f1 < 42 AND f2 = a.f1 + ) + SELECT b.f2 FROM cte b WHERE b.f2 = a.f2 AND b.f2 < 12 + ) +; -- Give up optimization if CTE in subquery contains links to upper relation. + QUERY PLAN +------------------------------------------------------------------- + Seq Scan on subselect_tbl a + Filter: (SubPlan 1) + SubPlan 1 + -> Result + One-Time Filter: (a.f1 = a.f2) + -> Seq Scan on subselect_tbl c + Filter: ((f1 < 42) AND (f2 < 12) AND (f2 = a.f2)) +(7 rows) + +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + WITH cte AS ( + SELECT * FROM subselect_tbl c WHERE f1 < 42 + ) + SELECT b.f2 FROM cte b WHERE b.f2 = a.f2 AND b.f2 < 12 + ) +; -- Correlated subquery with trivial CTE can be pulled up + QUERY PLAN +------------------------------------------------- + Hash Semi Join + Hash Cond: (a.f1 = c.f2) + -> Seq Scan on subselect_tbl a + Filter: (f1 = f2) + -> Hash + -> Seq Scan on subselect_tbl c + Filter: ((f1 < 42) AND (f2 < 12)) +(7 rows) + +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE (a.f1,a.f3) IN ( + SELECT b.f2, avg(b.f3) OVER (PARTITION BY b.f2) + FROM subselect_tbl b WHERE b.f2 = a.f2 AND b.f2 < 12 + ) +; -- Doesn't support unnesting with window functions in target list + QUERY PLAN +----------------------------------------------------- + Seq Scan on subselect_tbl a + Filter: (SubPlan 1) + SubPlan 1 + -> WindowAgg + -> Seq Scan on subselect_tbl b + Filter: ((f2 < 12) AND (f2 = a.f2)) +(6 rows) + +-- A having qual, group clause and so on, with links to upper relation variable +-- cut off the optimization because another case we must rewrite the subquery +-- as a lateral TargetEntry and arrange these links. +-- But now, machinery of convert_ANY_sublink_to_join() isn't prepared for such +-- complex work and it would induce additional complex code. +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT b.f2 + FROM subselect_tbl b WHERE b.f2 = a.f2 AND b.f2 < 12 + GROUP BY (b.f2) HAVING b.f2 > a.f3 + ) +; + QUERY PLAN +----------------------------------------------------------------------------------------- + Seq Scan on subselect_tbl a + Filter: (SubPlan 1) + SubPlan 1 + -> Group + Group Key: b.f2 + -> Seq Scan on subselect_tbl b + Filter: ((f2 < 12) AND (f2 = a.f2) AND ((f2)::double precision > a.f3)) +(7 rows) + +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT b.f2 FROM ( + SELECT x AS f1, x+1 AS f2, x+2 AS f3 FROM generate_series(1,a.f1) x + ) b WHERE b.f2 = a.f2 AND b.f1 < 12 + ) +; -- Don't allow links to upper query in FROM section of subquery + QUERY PLAN +--------------------------------------------------- + Seq Scan on subselect_tbl a + Filter: (SubPlan 1) + SubPlan 1 + -> Function Scan on generate_series x + Filter: ((x < 12) AND ((x + 1) = a.f2)) +(5 rows) + +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT a.f1 FROM subselect_tbl b WHERE b.f2 = a.f2 AND b.f2 < 12 + GROUP BY (a.f1) + ) +; -- GROUP BY contains link to upper relation + QUERY PLAN +----------------------------------------------------- + Seq Scan on subselect_tbl a + Filter: (SubPlan 1) + SubPlan 1 + -> Group + Group Key: a.f1 + -> Seq Scan on subselect_tbl b + Filter: ((f2 < 12) AND (f2 = a.f2)) +(7 rows) + +-- Flatten subquery with not-correlated clauses. The same result set returned +EXPLAIN (COSTS OFF) SELECT f1 AS "Correlated Field", f3 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN + (SELECT f2 FROM SUBSELECT_TBL WHERE CAST(upper.f2 AS float) = f3 AND f2 <> 42); + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Hash Join + Hash Cond: ((upper.f1 = subselect_tbl.f2) AND ((upper.f2)::double precision = subselect_tbl.f3)) + -> Seq Scan on subselect_tbl upper + -> Hash + -> HashAggregate + Group Key: subselect_tbl.f2, subselect_tbl.f3 + -> Seq Scan on subselect_tbl + Filter: (f2 <> 42) +(8 rows) + +SELECT f1 AS "Correlated Field", f3 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN + (SELECT f2 FROM SUBSELECT_TBL WHERE CAST(upper.f2 AS float) = f3 AND f2 <> 42); + Correlated Field | Second Field +------------------+-------------- + 2 | 4 + 3 | 5 + 1 | 1 + 2 | 2 + 3 | 3 +(5 rows) + SELECT f1 AS "Correlated Field", f3 AS "Second Field" FROM SUBSELECT_TBL upper WHERE f3 IN (SELECT upper.f1 + f2 FROM SUBSELECT_TBL diff --git a/src/test/regress/sql/prepare.sql b/src/test/regress/sql/prepare.sql index c6098dc95c..8a8164ee99 100644 --- a/src/test/regress/sql/prepare.sql +++ b/src/test/regress/sql/prepare.sql @@ -78,6 +78,13 @@ PREPARE q8 AS SELECT name, statement, parameter_types, result_types FROM pg_prepared_statements ORDER BY name; +-- The optimization on unnesting of correlated subqueries should work +PREPARE q9(name,int) AS + SELECT * FROM tenk1 upper WHERE unique1 IN ( + SELECT sub.unique2 FROM tenk1 sub + WHERE sub.stringu1 = $1 AND sub.unique1 = upper.unique2 + $2); +EXPLAIN (COSTS OFF) EXECUTE q9('abc',2); + -- test DEALLOCATE ALL; DEALLOCATE ALL; SELECT name, statement, parameter_types FROM pg_prepared_statements diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 40276708c9..b29036e236 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -67,11 +67,151 @@ SELECT f1 AS "Correlated Field", f2 AS "Second Field" FROM SUBSELECT_TBL upper WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1); +EXPLAIN (COSTS OFF) SELECT f1 AS "Correlated Field", f3 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN + (SELECT f2 FROM SUBSELECT_TBL WHERE CAST(upper.f2 AS float) = f3); +-- Still doesn't work for NOT IN +EXPLAIN (COSTS OFF) SELECT f1 AS "Correlated Field", f3 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 NOT IN + (SELECT f2 FROM SUBSELECT_TBL WHERE CAST(upper.f2 AS float) = f3); + SELECT f1 AS "Correlated Field", f3 AS "Second Field" FROM SUBSELECT_TBL upper WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE CAST(upper.f2 AS float) = f3); +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN + (SELECT b.f2 FROM subselect_tbl b WHERE b.f2 = a.f1) +; -- Optimizer removes excess clause +EXPLAIN (COSTS OFF) +SELECT * FROM subselect_tbl a +WHERE a.f1 IN (SELECT b.f2*b.f1/b.f3+2 FROM subselect_tbl b WHERE b.f3 = a.f2) +; -- a bit more complex targetlist expression shouldn't cut off the optimization +EXPLAIN (COSTS OFF) +SELECT * FROM subselect_tbl a +WHERE (a.f1,a.f3) IN (SELECT b.f2,b.f1 FROM subselect_tbl b WHERE b.f3 = a.f2) +; -- Two variables in a target list +EXPLAIN (COSTS OFF) +SELECT * FROM subselect_tbl a +WHERE (a.f1,a.f3) IN (SELECT b.f2,b.f1*2 FROM subselect_tbl b WHERE b.f3 = a.f2) +; -- Expression as an element of composite type shouldn't cut off the optimization +EXPLAIN (COSTS OFF) +SELECT * FROM subselect_tbl a +WHERE a.f1 IN (SELECT b.f2 FROM subselect_tbl b WHERE b.f3 = a.f2 AND a.f1 = b.f3 AND b.f3 <> 12) +; -- Two expressions with correlated variables +EXPLAIN (COSTS OFF) +SELECT * FROM subselect_tbl a +WHERE a.f1 IN (SELECT b.f2 FROM subselect_tbl b WHERE b.f3 = a.f2 AND a.f2 = b.f3 AND b.f1 < 12) +; -- Two expressions with correlated variables relates on one upper variable. +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT b.f2 FROM subselect_tbl b WHERE b.f2 = a.f2 AND b.f2 < 12 + GROUP BY (b.f2) + ) +; -- Pull clauses without unnesting the query. XXX: It reduces performance in most use cases, doesn't it? + +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT b.f2 FROM ( + SELECT x AS f1, x+1 AS f2, x+2 AS f3 FROM generate_series(1,10) x + ) b WHERE b.f2 = a.f2 AND b.f1 BETWEEN 12 AND 14 + UNION ALL + SELECT c.f1 FROM subselect_tbl c + WHERE c.f2 = a.f2 AND c.f1 BETWEEN 12 AND 14 + ) +; -- Disallow flattening of union all +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT b.f1 FROM subselect_tbl b JOIN subselect_tbl c ON (b.f1 = c.f2) + WHERE c.f2 = a.f2 AND c.f1 BETWEEN 12 AND 14 + ) +; -- XXX: Could we flatten such subquery? +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT b.f1 FROM subselect_tbl b, subselect_tbl c + WHERE b.f1 = c.f2 AND c.f2 = a.f2 AND c.f1 IS NOT NULL + ) +; -- TODO: Could we flatten such subquery? +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE (a.f1,f2) IN ( + SELECT b.f2, avg(f3) FROM subselect_tbl b WHERE b.f2 = a.f2 AND b.f2 < 12 + GROUP BY (b.f2) + ) +; -- Doesn't support unnesting with aggregate functions +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + WITH cte AS ( + SELECT * FROM subselect_tbl c WHERE f1 < 42 AND f2 = a.f1 + ) + SELECT b.f2 FROM cte b WHERE b.f2 = a.f2 AND b.f2 < 12 + ) +; -- Give up optimization if CTE in subquery contains links to upper relation. +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + WITH cte AS ( + SELECT * FROM subselect_tbl c WHERE f1 < 42 + ) + SELECT b.f2 FROM cte b WHERE b.f2 = a.f2 AND b.f2 < 12 + ) +; -- Correlated subquery with trivial CTE can be pulled up +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE (a.f1,a.f3) IN ( + SELECT b.f2, avg(b.f3) OVER (PARTITION BY b.f2) + FROM subselect_tbl b WHERE b.f2 = a.f2 AND b.f2 < 12 + ) +; -- Doesn't support unnesting with window functions in target list + +-- A having qual, group clause and so on, with links to upper relation variable +-- cut off the optimization because another case we must rewrite the subquery +-- as a lateral TargetEntry and arrange these links. +-- But now, machinery of convert_ANY_sublink_to_join() isn't prepared for such +-- complex work and it would induce additional complex code. +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT b.f2 + FROM subselect_tbl b WHERE b.f2 = a.f2 AND b.f2 < 12 + GROUP BY (b.f2) HAVING b.f2 > a.f3 + ) +; +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT b.f2 FROM ( + SELECT x AS f1, x+1 AS f2, x+2 AS f3 FROM generate_series(1,a.f1) x + ) b WHERE b.f2 = a.f2 AND b.f1 < 12 + ) +; -- Don't allow links to upper query in FROM section of subquery +EXPLAIN (COSTS OFF) + SELECT * FROM subselect_tbl a + WHERE a.f1 IN ( + SELECT a.f1 FROM subselect_tbl b WHERE b.f2 = a.f2 AND b.f2 < 12 + GROUP BY (a.f1) + ) +; -- GROUP BY contains link to upper relation + +-- Flatten subquery with not-correlated clauses. The same result set returned +EXPLAIN (COSTS OFF) SELECT f1 AS "Correlated Field", f3 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN + (SELECT f2 FROM SUBSELECT_TBL WHERE CAST(upper.f2 AS float) = f3 AND f2 <> 42); +SELECT f1 AS "Correlated Field", f3 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN + (SELECT f2 FROM SUBSELECT_TBL WHERE CAST(upper.f2 AS float) = f3 AND f2 <> 42); + SELECT f1 AS "Correlated Field", f3 AS "Second Field" FROM SUBSELECT_TBL upper WHERE f3 IN (SELECT upper.f1 + f2 FROM SUBSELECT_TBL -- 2.34.1