From 17f99ea9ed8eb5321f87db1fb39dc577860f675c Mon Sep 17 00:00:00 2001
From: Richard Guo <guofenglinux@gmail.com>
Date: Sat, 25 Apr 2026 16:45:14 +0900
Subject: [PATCH v2 2/2] Consider collation when proving subquery uniqueness

rel_is_distinct_for()'s RTE_SUBQUERY branch passed only the equality
operator from each join clause to query_is_distinct_for(), discarding
the operator's input collation.  query_is_distinct_for() then verified
opfamily compatibility but never checked collations, so a DISTINCT /
GROUP BY / set-op operating under one collation was trusted to prove
uniqueness for a comparison performed under an unrelated collation.
As with the recent fix in relation_has_unique_index_for(), this is
unsound for nondeterministic collations and yields wrong query results
in any optimization that consumes the proof.

Fix by carrying each clause's operator input collation into
query_is_distinct_for() and validating it at every check-site against
the subquery target expression's collation.
---
 src/backend/optimizer/plan/analyzejoins.c     | 125 ++++++------
 src/include/nodes/pathnodes.h                 |  14 ++
 src/include/optimizer/planmain.h              |   2 +-
 .../regress/expected/collate.icu.utf8.out     | 181 ++++++++++++++++++
 src/test/regress/sql/collate.icu.utf8.sql     |  58 ++++++
 src/tools/pgindent/typedefs.list              |   1 +
 6 files changed, 326 insertions(+), 55 deletions(-)

diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index 03056bdf3e0..95ec8b136d5 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -68,7 +68,7 @@ static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved);
 static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel);
 static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel,
 								List *clause_list, List **extra_clauses);
-static Oid	distinct_col_search(int colno, List *colnos, List *opids);
+static DistinctColInfo *distinct_col_search(int colno, List *distinct_cols);
 static bool is_innerrel_unique_for(PlannerInfo *root,
 								   Relids joinrelids,
 								   Relids outerrelids,
@@ -1032,15 +1032,17 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list,
 	{
 		Index		relid = rel->relid;
 		Query	   *subquery = root->simple_rte_array[relid]->subquery;
-		List	   *colnos = NIL;
-		List	   *opids = NIL;
+		List	   *distinct_cols = NIL;
 		ListCell   *l;
 
 		/*
-		 * Build the argument lists for query_is_distinct_for: a list of
-		 * output column numbers that the query needs to be distinct over, and
-		 * a list of equality operators that the output columns need to be
-		 * distinct according to.
+		 * Build the argument list for query_is_distinct_for: a list of
+		 * DistinctColInfo entries, each holding an output column number that
+		 * the query needs to be distinct over, the equality operator that the
+		 * column needs to be distinct according to, and that operator's input
+		 * collation.  The collation matters because the subquery's own
+		 * DISTINCT / GROUP BY / set-op proves uniqueness under its own
+		 * collation, which need not agree with the operator's.
 		 *
 		 * (XXX we are not considering restriction clauses attached to the
 		 * subquery; is that worth doing?)
@@ -1048,18 +1050,18 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list,
 		foreach(l, clause_list)
 		{
 			RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
-			Oid			op;
+			OpExpr	   *opexpr;
 			Var		   *var;
+			DistinctColInfo *dcinfo;
 
 			/*
-			 * Get the equality operator we need uniqueness according to.
-			 * (This might be a cross-type operator and thus not exactly the
-			 * same operator the subquery would consider; that's all right
-			 * since query_is_distinct_for can resolve such cases.)  The
-			 * caller's mergejoinability test should have selected only
-			 * OpExprs.
+			 * The caller's mergejoinability test should have selected only
+			 * OpExprs.  The operator might be a cross-type operator and thus
+			 * not exactly the same operator the subquery would consider;
+			 * that's all right since query_is_distinct_for can resolve such
+			 * cases.
 			 */
-			op = castNode(OpExpr, rinfo->clause)->opno;
+			opexpr = castNode(OpExpr, rinfo->clause);
 
 			/* caller identified the inner side for us */
 			if (rinfo->outer_is_left)
@@ -1083,11 +1085,14 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list,
 				var->varno != relid || var->varlevelsup != 0)
 				continue;
 
-			colnos = lappend_int(colnos, var->varattno);
-			opids = lappend_oid(opids, op);
+			dcinfo = palloc(sizeof(DistinctColInfo));
+			dcinfo->colno = var->varattno;
+			dcinfo->opid = opexpr->opno;
+			dcinfo->collid = opexpr->inputcollid;
+			distinct_cols = lappend(distinct_cols, dcinfo);
 		}
 
-		if (query_is_distinct_for(subquery, colnos, opids))
+		if (query_is_distinct_for(subquery, distinct_cols))
 			return true;
 	}
 	return false;
@@ -1131,25 +1136,32 @@ query_supports_distinctness(Query *query)
  * query is a not-yet-planned subquery (in current usage, it's always from
  * a subquery RTE, which the planner avoids scribbling on).
  *
- * colnos is an integer list of output column numbers (resno's).  We are
- * interested in whether rows consisting of just these columns are certain
- * to be distinct.  "Distinctness" is defined according to whether the
- * corresponding upper-level equality operators listed in opids would think
- * the values are distinct.  (Note: the opids entries could be cross-type
- * operators, and thus not exactly the equality operators that the subquery
- * would use itself.  We use equality_ops_are_compatible() to check
- * compatibility.  That looks at opfamily membership for index AMs that have
- * declared that they support consistent equality semantics within an
- * opfamily, and so should give trustworthy answers for all operators that we
- * might need to deal with here.)
+ * distinct_cols is a list of DistinctColInfo, one per requested output column.
+ * Each entry names the subquery output column number we want distinct, the
+ * upper-level equality operator we'll compare values with, and that operator's
+ * input collation.  We are interested in whether rows consisting of just these
+ * columns are certain to be distinct.
+ *
+ * "Distinctness" is defined according to whether the corresponding upper-level
+ * equality operators would think the values are distinct.  (Note: the opids
+ * entries could be cross-type operators, and thus not exactly the equality
+ * operators that the subquery would use itself.  We use
+ * equality_ops_are_compatible() to check compatibility.  That looks at
+ * opfamily membership for index AMs that have declared that they support
+ * consistent equality semantics within an opfamily, and so should give
+ * trustworthy answers for all operators that we might need to deal with here.)
+ *
+ * The collid must also be compatible with the collation the subquery's own
+ * DISTINCT/GROUP BY/set-op uses to deduplicate the column, else the subquery's
+ * distinctness does not carry over to the caller's equality semantics.
+ * Compatible means either the same collation or both deterministic (in which
+ * case both reduce equality to byte-equality; see CREATE COLLATION).
  */
 bool
-query_is_distinct_for(Query *query, List *colnos, List *opids)
+query_is_distinct_for(Query *query, List *distinct_cols)
 {
 	ListCell   *l;
-	Oid			opid;
-
-	Assert(list_length(colnos) == list_length(opids));
+	DistinctColInfo *dcinfo;
 
 	/*
 	 * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
@@ -1165,9 +1177,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 			TargetEntry *tle = get_sortgroupclause_tle(sgc,
 													   query->targetList);
 
-			opid = distinct_col_search(tle->resno, colnos, opids);
-			if (!OidIsValid(opid) ||
-				!equality_ops_are_compatible(opid, sgc->eqop))
+			dcinfo = distinct_col_search(tle->resno, distinct_cols);
+			if (dcinfo == NULL ||
+				!equality_ops_are_compatible(dcinfo->opid, sgc->eqop) ||
+				!collations_are_compatible(dcinfo->collid,
+										   exprCollation((Node *) tle->expr)))
 				break;			/* exit early if no match */
 		}
 		if (l == NULL)			/* had matches for all? */
@@ -1196,9 +1210,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 			TargetEntry *tle = get_sortgroupclause_tle(sgc,
 													   query->targetList);
 
-			opid = distinct_col_search(tle->resno, colnos, opids);
-			if (!OidIsValid(opid) ||
-				!equality_ops_are_compatible(opid, sgc->eqop))
+			dcinfo = distinct_col_search(tle->resno, distinct_cols);
+			if (dcinfo == NULL ||
+				!equality_ops_are_compatible(dcinfo->opid, sgc->eqop) ||
+				!collations_are_compatible(dcinfo->collid,
+										   exprCollation((Node *) tle->expr)))
 				break;			/* exit early if no match */
 		}
 		if (l == NULL)			/* had matches for all? */
@@ -1268,9 +1284,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 				sgc = (SortGroupClause *) lfirst(lg);
 				lg = lnext(topop->groupClauses, lg);
 
-				opid = distinct_col_search(tle->resno, colnos, opids);
-				if (!OidIsValid(opid) ||
-					!equality_ops_are_compatible(opid, sgc->eqop))
+				dcinfo = distinct_col_search(tle->resno, distinct_cols);
+				if (dcinfo == NULL ||
+					!equality_ops_are_compatible(dcinfo->opid, sgc->eqop) ||
+					!collations_are_compatible(dcinfo->collid,
+											   exprCollation((Node *) tle->expr)))
 					break;		/* exit early if no match */
 			}
 			if (l == NULL)		/* had matches for all? */
@@ -1292,22 +1310,21 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 /*
  * distinct_col_search - subroutine for query_is_distinct_for
  *
- * If colno is in colnos, return the corresponding element of opids,
- * else return InvalidOid.  (Ordinarily colnos would not contain duplicates,
- * but if it does, we arbitrarily select the first match.)
+ * If colno matches the colno field of an entry in distinct_cols, return a
+ * pointer to that entry; else return NULL.  (Ordinarily distinct_cols would
+ * not contain duplicate colnos, but if it does, we arbitrarily select the
+ * first match.)
  */
-static Oid
-distinct_col_search(int colno, List *colnos, List *opids)
+static DistinctColInfo *
+distinct_col_search(int colno, List *distinct_cols)
 {
-	ListCell   *lc1,
-			   *lc2;
-
-	forboth(lc1, colnos, lc2, opids)
+	foreach_ptr(DistinctColInfo, dcinfo, distinct_cols)
 	{
-		if (colno == lfirst_int(lc1))
-			return lfirst_oid(lc2);
+		if (dcinfo->colno == colno)
+			return dcinfo;
 	}
-	return InvalidOid;
+
+	return NULL;
 }
 
 
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 693b879f76d..27a2c6815b7 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -3375,6 +3375,20 @@ typedef struct RowIdentityVarInfo
 	Relids		rowidrels;		/* RTE indexes of target rels using this */
 } RowIdentityVarInfo;
 
+/*
+ * One element of the list passed to query_is_distinct_for().  Each entry
+ * names a subquery output column that the caller needs to be distinct over,
+ * plus the upper-level equality operator and its input collation, so that
+ * the subquery's own DISTINCT/GROUP BY/set-op clauses can be compared for
+ * compatibility.
+ */
+typedef struct DistinctColInfo
+{
+	int			colno;			/* subquery output column resno */
+	Oid			opid;			/* upper-level equality operator */
+	Oid			collid;			/* input collation of opid */
+} DistinctColInfo;
+
 /*
  * For each distinct placeholder expression generated during planning, we
  * store a PlaceHolderInfo node in the PlannerInfo node's placeholder_list.
diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h
index d0dc3761b13..71c043a25e8 100644
--- a/src/include/optimizer/planmain.h
+++ b/src/include/optimizer/planmain.h
@@ -111,7 +111,7 @@ extern void match_foreign_keys_to_quals(PlannerInfo *root);
 extern List *remove_useless_joins(PlannerInfo *root, List *joinlist);
 extern void reduce_unique_semijoins(PlannerInfo *root);
 extern bool query_supports_distinctness(Query *query);
-extern bool query_is_distinct_for(Query *query, List *colnos, List *opids);
+extern bool query_is_distinct_for(Query *query, List *distinct_cols);
 extern bool innerrel_is_unique(PlannerInfo *root,
 							   Relids joinrelids, Relids outerrelids, RelOptInfo *innerrel,
 							   JoinType jointype, List *restrictlist, bool force_cache);
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index db87062b4a3..46d4917e77c 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1777,6 +1777,187 @@ ORDER BY 1;
  ghi
 (4 rows)
 
+--
+-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness
+-- under a different collation, so the planner must not use such a proof for
+-- any optimization.
+--
+-- Ensure that we do not use inner-unique join execution
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
+ Sort
+   Output: t1.x, test3cs.x
+   Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive
+   ->  Hash Join
+         Output: t1.x, test3cs.x
+         Hash Cond: ((t1.x)::text = (test3cs.x)::text)
+         ->  Seq Scan on collate_tests.test1cs t1
+               Output: t1.x
+         ->  Hash
+               Output: test3cs.x
+               ->  HashAggregate
+                     Output: test3cs.x
+                     Group Key: test3cs.x
+                     ->  Seq Scan on collate_tests.test3cs
+                           Output: test3cs.x
+(15 rows)
+
+SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+  x  |  x  
+-----+-----
+ abc | abc
+ abc | ABC
+ ABC | abc
+ ABC | ABC
+ def | def
+ ghi | ghi
+(6 rows)
+
+-- Same with GROUP BY
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
+ Sort
+   Output: t1.x, test3cs.x
+   Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive
+   ->  Hash Join
+         Output: t1.x, test3cs.x
+         Hash Cond: ((t1.x)::text = (test3cs.x)::text)
+         ->  Seq Scan on collate_tests.test1cs t1
+               Output: t1.x
+         ->  Hash
+               Output: test3cs.x
+               ->  HashAggregate
+                     Output: test3cs.x
+                     Group Key: test3cs.x
+                     ->  Seq Scan on collate_tests.test3cs
+                           Output: test3cs.x
+(15 rows)
+
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+  x  |  x  
+-----+-----
+ abc | abc
+ abc | ABC
+ ABC | abc
+ ABC | ABC
+ def | def
+ ghi | ghi
+(6 rows)
+
+-- Same with set-op
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
+ Sort
+   Output: t1.x, test3cs.x
+   Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive
+   ->  Hash Join
+         Output: t1.x, test3cs.x
+         Hash Cond: ((test3cs.x)::text = (t1.x)::text)
+         ->  HashAggregate
+               Output: test3cs.x
+               Group Key: test3cs.x
+               ->  Append
+                     ->  Seq Scan on collate_tests.test3cs
+                           Output: test3cs.x
+                     ->  Seq Scan on collate_tests.test3cs test3cs_1
+                           Output: test3cs_1.x
+         ->  Hash
+               Output: t1.x
+               ->  Seq Scan on collate_tests.test1cs t1
+                     Output: t1.x
+(18 rows)
+
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+  x  |  x  
+-----+-----
+ abc | abc
+ abc | ABC
+ ABC | abc
+ ABC | ABC
+ def | def
+ ghi | ghi
+(6 rows)
+
+-- Ensure that left-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+                  QUERY PLAN                   
+-----------------------------------------------
+ Sort
+   Sort Key: t1.x COLLATE case_sensitive
+   ->  Hash Left Join
+         Hash Cond: (t1.x = (test3cs.x)::text)
+         ->  Seq Scan on test3cs t1
+         ->  Hash
+               ->  HashAggregate
+                     Group Key: test3cs.x
+                     ->  Seq Scan on test3cs
+(9 rows)
+
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+  x  
+-----
+ abc
+ abc
+ ABC
+ ABC
+ def
+ ghi
+(6 rows)
+
+-- Ensure that semijoin is not reduced to innerjoin
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
+                WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+                      QUERY PLAN                       
+-------------------------------------------------------
+ Sort
+   Sort Key: t1.x COLLATE case_sensitive
+   ->  Hash Semi Join
+         Hash Cond: ((t1.x)::text = (test3cs.x)::text)
+         ->  Seq Scan on test3cs t1
+         ->  Hash
+               ->  HashAggregate
+                     Group Key: test3cs.x
+                     ->  Seq Scan on test3cs
+(9 rows)
+
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
+                WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+  x  
+-----
+ abc
+ ABC
+ def
+ ghi
+(4 rows)
+
 CREATE TABLE test1ci (x text COLLATE case_insensitive);
 CREATE TABLE test2ci (x text COLLATE case_insensitive);
 CREATE TABLE test3ci (x text COLLATE case_insensitive);
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 95c16859afa..8179b1436f5 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -657,6 +657,64 @@ SELECT * FROM test3cs t1
   WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive)
 ORDER BY 1;
 
+--
+-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness
+-- under a different collation, so the planner must not use such a proof for
+-- any optimization.
+--
+
+-- Ensure that we do not use inner-unique join execution
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+-- Same with GROUP BY
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+-- Same with set-op
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+-- Ensure that left-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+
+-- Ensure that semijoin is not reduced to innerjoin
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
+                WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
+                WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+
 CREATE TABLE test1ci (x text COLLATE case_insensitive);
 CREATE TABLE test2ci (x text COLLATE case_insensitive);
 CREATE TABLE test3ci (x text COLLATE case_insensitive);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 9f1dd55213d..e08cfb7b61b 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -688,6 +688,7 @@ DiscardMode
 DiscardStmt
 DispatchOption
 DistanceValue
+DistinctColInfo
 DistinctExpr
 DoState
 DoStmt
-- 
2.39.5 (Apple Git-154)