From 17f99ea9ed8eb5321f87db1fb39dc577860f675c Mon Sep 17 00:00:00 2001 From: Richard Guo Date: Sat, 25 Apr 2026 16:45:14 +0900 Subject: [PATCH v2 2/2] Consider collation when proving subquery uniqueness rel_is_distinct_for()'s RTE_SUBQUERY branch passed only the equality operator from each join clause to query_is_distinct_for(), discarding the operator's input collation. query_is_distinct_for() then verified opfamily compatibility but never checked collations, so a DISTINCT / GROUP BY / set-op operating under one collation was trusted to prove uniqueness for a comparison performed under an unrelated collation. As with the recent fix in relation_has_unique_index_for(), this is unsound for nondeterministic collations and yields wrong query results in any optimization that consumes the proof. Fix by carrying each clause's operator input collation into query_is_distinct_for() and validating it at every check-site against the subquery target expression's collation. --- src/backend/optimizer/plan/analyzejoins.c | 125 ++++++------ src/include/nodes/pathnodes.h | 14 ++ src/include/optimizer/planmain.h | 2 +- .../regress/expected/collate.icu.utf8.out | 181 ++++++++++++++++++ src/test/regress/sql/collate.icu.utf8.sql | 58 ++++++ src/tools/pgindent/typedefs.list | 1 + 6 files changed, 326 insertions(+), 55 deletions(-) diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index 03056bdf3e0..95ec8b136d5 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -68,7 +68,7 @@ static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved); static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel); static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list, List **extra_clauses); -static Oid distinct_col_search(int colno, List *colnos, List *opids); +static DistinctColInfo *distinct_col_search(int colno, List *distinct_cols); static bool is_innerrel_unique_for(PlannerInfo *root, Relids joinrelids, Relids outerrelids, @@ -1032,15 +1032,17 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list, { Index relid = rel->relid; Query *subquery = root->simple_rte_array[relid]->subquery; - List *colnos = NIL; - List *opids = NIL; + List *distinct_cols = NIL; ListCell *l; /* - * Build the argument lists for query_is_distinct_for: a list of - * output column numbers that the query needs to be distinct over, and - * a list of equality operators that the output columns need to be - * distinct according to. + * Build the argument list for query_is_distinct_for: a list of + * DistinctColInfo entries, each holding an output column number that + * the query needs to be distinct over, the equality operator that the + * column needs to be distinct according to, and that operator's input + * collation. The collation matters because the subquery's own + * DISTINCT / GROUP BY / set-op proves uniqueness under its own + * collation, which need not agree with the operator's. * * (XXX we are not considering restriction clauses attached to the * subquery; is that worth doing?) @@ -1048,18 +1050,18 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list, foreach(l, clause_list) { RestrictInfo *rinfo = lfirst_node(RestrictInfo, l); - Oid op; + OpExpr *opexpr; Var *var; + DistinctColInfo *dcinfo; /* - * Get the equality operator we need uniqueness according to. - * (This might be a cross-type operator and thus not exactly the - * same operator the subquery would consider; that's all right - * since query_is_distinct_for can resolve such cases.) The - * caller's mergejoinability test should have selected only - * OpExprs. + * The caller's mergejoinability test should have selected only + * OpExprs. The operator might be a cross-type operator and thus + * not exactly the same operator the subquery would consider; + * that's all right since query_is_distinct_for can resolve such + * cases. */ - op = castNode(OpExpr, rinfo->clause)->opno; + opexpr = castNode(OpExpr, rinfo->clause); /* caller identified the inner side for us */ if (rinfo->outer_is_left) @@ -1083,11 +1085,14 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list, var->varno != relid || var->varlevelsup != 0) continue; - colnos = lappend_int(colnos, var->varattno); - opids = lappend_oid(opids, op); + dcinfo = palloc(sizeof(DistinctColInfo)); + dcinfo->colno = var->varattno; + dcinfo->opid = opexpr->opno; + dcinfo->collid = opexpr->inputcollid; + distinct_cols = lappend(distinct_cols, dcinfo); } - if (query_is_distinct_for(subquery, colnos, opids)) + if (query_is_distinct_for(subquery, distinct_cols)) return true; } return false; @@ -1131,25 +1136,32 @@ query_supports_distinctness(Query *query) * query is a not-yet-planned subquery (in current usage, it's always from * a subquery RTE, which the planner avoids scribbling on). * - * colnos is an integer list of output column numbers (resno's). We are - * interested in whether rows consisting of just these columns are certain - * to be distinct. "Distinctness" is defined according to whether the - * corresponding upper-level equality operators listed in opids would think - * the values are distinct. (Note: the opids entries could be cross-type - * operators, and thus not exactly the equality operators that the subquery - * would use itself. We use equality_ops_are_compatible() to check - * compatibility. That looks at opfamily membership for index AMs that have - * declared that they support consistent equality semantics within an - * opfamily, and so should give trustworthy answers for all operators that we - * might need to deal with here.) + * distinct_cols is a list of DistinctColInfo, one per requested output column. + * Each entry names the subquery output column number we want distinct, the + * upper-level equality operator we'll compare values with, and that operator's + * input collation. We are interested in whether rows consisting of just these + * columns are certain to be distinct. + * + * "Distinctness" is defined according to whether the corresponding upper-level + * equality operators would think the values are distinct. (Note: the opids + * entries could be cross-type operators, and thus not exactly the equality + * operators that the subquery would use itself. We use + * equality_ops_are_compatible() to check compatibility. That looks at + * opfamily membership for index AMs that have declared that they support + * consistent equality semantics within an opfamily, and so should give + * trustworthy answers for all operators that we might need to deal with here.) + * + * The collid must also be compatible with the collation the subquery's own + * DISTINCT/GROUP BY/set-op uses to deduplicate the column, else the subquery's + * distinctness does not carry over to the caller's equality semantics. + * Compatible means either the same collation or both deterministic (in which + * case both reduce equality to byte-equality; see CREATE COLLATION). */ bool -query_is_distinct_for(Query *query, List *colnos, List *opids) +query_is_distinct_for(Query *query, List *distinct_cols) { ListCell *l; - Oid opid; - - Assert(list_length(colnos) == list_length(opids)); + DistinctColInfo *dcinfo; /* * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the @@ -1165,9 +1177,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) TargetEntry *tle = get_sortgroupclause_tle(sgc, query->targetList); - opid = distinct_col_search(tle->resno, colnos, opids); - if (!OidIsValid(opid) || - !equality_ops_are_compatible(opid, sgc->eqop)) + dcinfo = distinct_col_search(tle->resno, distinct_cols); + if (dcinfo == NULL || + !equality_ops_are_compatible(dcinfo->opid, sgc->eqop) || + !collations_are_compatible(dcinfo->collid, + exprCollation((Node *) tle->expr))) break; /* exit early if no match */ } if (l == NULL) /* had matches for all? */ @@ -1196,9 +1210,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) TargetEntry *tle = get_sortgroupclause_tle(sgc, query->targetList); - opid = distinct_col_search(tle->resno, colnos, opids); - if (!OidIsValid(opid) || - !equality_ops_are_compatible(opid, sgc->eqop)) + dcinfo = distinct_col_search(tle->resno, distinct_cols); + if (dcinfo == NULL || + !equality_ops_are_compatible(dcinfo->opid, sgc->eqop) || + !collations_are_compatible(dcinfo->collid, + exprCollation((Node *) tle->expr))) break; /* exit early if no match */ } if (l == NULL) /* had matches for all? */ @@ -1268,9 +1284,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) sgc = (SortGroupClause *) lfirst(lg); lg = lnext(topop->groupClauses, lg); - opid = distinct_col_search(tle->resno, colnos, opids); - if (!OidIsValid(opid) || - !equality_ops_are_compatible(opid, sgc->eqop)) + dcinfo = distinct_col_search(tle->resno, distinct_cols); + if (dcinfo == NULL || + !equality_ops_are_compatible(dcinfo->opid, sgc->eqop) || + !collations_are_compatible(dcinfo->collid, + exprCollation((Node *) tle->expr))) break; /* exit early if no match */ } if (l == NULL) /* had matches for all? */ @@ -1292,22 +1310,21 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) /* * distinct_col_search - subroutine for query_is_distinct_for * - * If colno is in colnos, return the corresponding element of opids, - * else return InvalidOid. (Ordinarily colnos would not contain duplicates, - * but if it does, we arbitrarily select the first match.) + * If colno matches the colno field of an entry in distinct_cols, return a + * pointer to that entry; else return NULL. (Ordinarily distinct_cols would + * not contain duplicate colnos, but if it does, we arbitrarily select the + * first match.) */ -static Oid -distinct_col_search(int colno, List *colnos, List *opids) +static DistinctColInfo * +distinct_col_search(int colno, List *distinct_cols) { - ListCell *lc1, - *lc2; - - forboth(lc1, colnos, lc2, opids) + foreach_ptr(DistinctColInfo, dcinfo, distinct_cols) { - if (colno == lfirst_int(lc1)) - return lfirst_oid(lc2); + if (dcinfo->colno == colno) + return dcinfo; } - return InvalidOid; + + return NULL; } diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 693b879f76d..27a2c6815b7 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -3375,6 +3375,20 @@ typedef struct RowIdentityVarInfo Relids rowidrels; /* RTE indexes of target rels using this */ } RowIdentityVarInfo; +/* + * One element of the list passed to query_is_distinct_for(). Each entry + * names a subquery output column that the caller needs to be distinct over, + * plus the upper-level equality operator and its input collation, so that + * the subquery's own DISTINCT/GROUP BY/set-op clauses can be compared for + * compatibility. + */ +typedef struct DistinctColInfo +{ + int colno; /* subquery output column resno */ + Oid opid; /* upper-level equality operator */ + Oid collid; /* input collation of opid */ +} DistinctColInfo; + /* * For each distinct placeholder expression generated during planning, we * store a PlaceHolderInfo node in the PlannerInfo node's placeholder_list. diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index d0dc3761b13..71c043a25e8 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -111,7 +111,7 @@ extern void match_foreign_keys_to_quals(PlannerInfo *root); extern List *remove_useless_joins(PlannerInfo *root, List *joinlist); extern void reduce_unique_semijoins(PlannerInfo *root); extern bool query_supports_distinctness(Query *query); -extern bool query_is_distinct_for(Query *query, List *colnos, List *opids); +extern bool query_is_distinct_for(Query *query, List *distinct_cols); extern bool innerrel_is_unique(PlannerInfo *root, Relids joinrelids, Relids outerrelids, RelOptInfo *innerrel, JoinType jointype, List *restrictlist, bool force_cache); diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index db87062b4a3..46d4917e77c 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1777,6 +1777,187 @@ ORDER BY 1; ghi (4 rows) +-- +-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness +-- under a different collation, so the planner must not use such a proof for +-- any optimization. +-- +-- Ensure that we do not use inner-unique join execution +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + QUERY PLAN +--------------------------------------------------------------------------- + Sort + Output: t1.x, test3cs.x + Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive + -> Hash Join + Output: t1.x, test3cs.x + Hash Cond: ((t1.x)::text = (test3cs.x)::text) + -> Seq Scan on collate_tests.test1cs t1 + Output: t1.x + -> Hash + Output: test3cs.x + -> HashAggregate + Output: test3cs.x + Group Key: test3cs.x + -> Seq Scan on collate_tests.test3cs + Output: test3cs.x +(15 rows) + +SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + x | x +-----+----- + abc | abc + abc | ABC + ABC | abc + ABC | ABC + def | def + ghi | ghi +(6 rows) + +-- Same with GROUP BY +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + QUERY PLAN +--------------------------------------------------------------------------- + Sort + Output: t1.x, test3cs.x + Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive + -> Hash Join + Output: t1.x, test3cs.x + Hash Cond: ((t1.x)::text = (test3cs.x)::text) + -> Seq Scan on collate_tests.test1cs t1 + Output: t1.x + -> Hash + Output: test3cs.x + -> HashAggregate + Output: test3cs.x + Group Key: test3cs.x + -> Seq Scan on collate_tests.test3cs + Output: test3cs.x +(15 rows) + +SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + x | x +-----+----- + abc | abc + abc | ABC + ABC | abc + ABC | ABC + def | def + ghi | ghi +(6 rows) + +-- Same with set-op +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + QUERY PLAN +--------------------------------------------------------------------------- + Sort + Output: t1.x, test3cs.x + Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive + -> Hash Join + Output: t1.x, test3cs.x + Hash Cond: ((test3cs.x)::text = (t1.x)::text) + -> HashAggregate + Output: test3cs.x + Group Key: test3cs.x + -> Append + -> Seq Scan on collate_tests.test3cs + Output: test3cs.x + -> Seq Scan on collate_tests.test3cs test3cs_1 + Output: test3cs_1.x + -> Hash + Output: t1.x + -> Seq Scan on collate_tests.test1cs t1 + Output: t1.x +(18 rows) + +SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + x | x +-----+----- + abc | abc + abc | ABC + ABC | abc + ABC | ABC + def | def + ghi | ghi +(6 rows) + +-- Ensure that left-join is not removed +EXPLAIN (COSTS OFF) +SELECT t1.* FROM test3cs t1 + LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive +ORDER BY 1; + QUERY PLAN +----------------------------------------------- + Sort + Sort Key: t1.x COLLATE case_sensitive + -> Hash Left Join + Hash Cond: (t1.x = (test3cs.x)::text) + -> Seq Scan on test3cs t1 + -> Hash + -> HashAggregate + Group Key: test3cs.x + -> Seq Scan on test3cs +(9 rows) + +SELECT t1.* FROM test3cs t1 + LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive +ORDER BY 1; + x +----- + abc + abc + ABC + ABC + def + ghi +(6 rows) + +-- Ensure that semijoin is not reduced to innerjoin +EXPLAIN (COSTS OFF) +SELECT * FROM test3cs t1 + WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2 + WHERE t1.x = t2.x COLLATE case_insensitive) +ORDER BY 1; + QUERY PLAN +------------------------------------------------------- + Sort + Sort Key: t1.x COLLATE case_sensitive + -> Hash Semi Join + Hash Cond: ((t1.x)::text = (test3cs.x)::text) + -> Seq Scan on test3cs t1 + -> Hash + -> HashAggregate + Group Key: test3cs.x + -> Seq Scan on test3cs +(9 rows) + +SELECT * FROM test3cs t1 + WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2 + WHERE t1.x = t2.x COLLATE case_insensitive) +ORDER BY 1; + x +----- + abc + ABC + def + ghi +(4 rows) + CREATE TABLE test1ci (x text COLLATE case_insensitive); CREATE TABLE test2ci (x text COLLATE case_insensitive); CREATE TABLE test3ci (x text COLLATE case_insensitive); diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 95c16859afa..8179b1436f5 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -657,6 +657,64 @@ SELECT * FROM test3cs t1 WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive) ORDER BY 1; +-- +-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness +-- under a different collation, so the planner must not use such a proof for +-- any optimization. +-- + +-- Ensure that we do not use inner-unique join execution +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +-- Same with GROUP BY +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +-- Same with set-op +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +-- Ensure that left-join is not removed +EXPLAIN (COSTS OFF) +SELECT t1.* FROM test3cs t1 + LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive +ORDER BY 1; + +SELECT t1.* FROM test3cs t1 + LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive +ORDER BY 1; + +-- Ensure that semijoin is not reduced to innerjoin +EXPLAIN (COSTS OFF) +SELECT * FROM test3cs t1 + WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2 + WHERE t1.x = t2.x COLLATE case_insensitive) +ORDER BY 1; + +SELECT * FROM test3cs t1 + WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2 + WHERE t1.x = t2.x COLLATE case_insensitive) +ORDER BY 1; + CREATE TABLE test1ci (x text COLLATE case_insensitive); CREATE TABLE test2ci (x text COLLATE case_insensitive); CREATE TABLE test3ci (x text COLLATE case_insensitive); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 9f1dd55213d..e08cfb7b61b 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -688,6 +688,7 @@ DiscardMode DiscardStmt DispatchOption DistanceValue +DistinctColInfo DistinctExpr DoState DoStmt -- 2.39.5 (Apple Git-154)