From 9449c09688d542c4dc201ee866f67d67304cff98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=B8=80=E6=8C=83?= <yizhi.fzh@alibaba-inc.com>
Date: Wed, 4 Mar 2020 14:33:56 +0800
Subject: [PATCH v2] [PATCH] Erase the distinct path if the result is unique by
 catalog

For a single relation, the result is known to be unique if any one of the
following is true:
1. The primary key is in the target list.
2. A unique key is in the target list and all of its columns are not null.
3. The columns in the GROUP BY clause are also in the target list.

For a join of relations, the result is known to be unique if every relation
in the jointree yields a unique result set; the final result is then unique
as well, regardless of the join method.
---
 .../postgres_fdw/expected/postgres_fdw.out    |  28 +-
 src/backend/optimizer/path/costsize.c         |   1 +
 src/backend/optimizer/plan/analyzejoins.c     | 184 +++++++++++-
 src/backend/optimizer/plan/planner.c          |  27 ++
 src/backend/optimizer/util/plancat.c          |   9 +
 src/backend/utils/misc/guc.c                  |  10 +
 src/include/nodes/pathnodes.h                 |   1 +
 src/include/optimizer/cost.h                  |   1 +
 src/include/optimizer/planmain.h              |   2 +
 src/test/regress/expected/aggregates.out      |  13 +-
 src/test/regress/expected/join.out            |  16 +-
 src/test/regress/expected/select_distinct.out | 276 ++++++++++++++++++
 src/test/regress/expected/sysviews.out        |   3 +-
 src/test/regress/sql/select_distinct.sql      |  84 ++++++
 14 files changed, 619 insertions(+), 36 deletions(-)

diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 84fd3ad2e0..215f10bf7d 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -2902,22 +2902,20 @@ select sum(c1%3), sum(distinct c1%3 order by c1%3) filter (where c1%3 < 2), c2 f
 -- Outer query is aggregation query
 explain (verbose, costs off)
 select distinct (select count(*) filter (where t2.c2 = 6 and t2.c1 < 10) from ft1 t1 where t1.c1 = 6) from ft2 t2 where t2.c2 % 6 = 0 order by 1;
-                                                          QUERY PLAN                                                          
-------------------------------------------------------------------------------------------------------------------------------
- Unique
+                                                       QUERY PLAN                                                       
+------------------------------------------------------------------------------------------------------------------------
+ Sort
    Output: ((SubPlan 1))
-   ->  Sort
-         Output: ((SubPlan 1))
-         Sort Key: ((SubPlan 1))
-         ->  Foreign Scan
-               Output: (SubPlan 1)
-               Relations: Aggregate on (public.ft2 t2)
-               Remote SQL: SELECT count(*) FILTER (WHERE ((c2 = 6) AND ("C 1" < 10))) FROM "S 1"."T 1" WHERE (((c2 % 6) = 0))
-               SubPlan 1
-                 ->  Foreign Scan on public.ft1 t1
-                       Output: (count(*) FILTER (WHERE ((t2.c2 = 6) AND (t2.c1 < 10))))
-                       Remote SQL: SELECT NULL FROM "S 1"."T 1" WHERE (("C 1" = 6))
-(13 rows)
+   Sort Key: ((SubPlan 1))
+   ->  Foreign Scan
+         Output: (SubPlan 1)
+         Relations: Aggregate on (public.ft2 t2)
+         Remote SQL: SELECT count(*) FILTER (WHERE ((c2 = 6) AND ("C 1" < 10))) FROM "S 1"."T 1" WHERE (((c2 % 6) = 0))
+         SubPlan 1
+           ->  Foreign Scan on public.ft1 t1
+                 Output: (count(*) FILTER (WHERE ((t2.c2 = 6) AND (t2.c1 < 10))))
+                 Remote SQL: SELECT NULL FROM "S 1"."T 1" WHERE (("C 1" = 6))
+(11 rows)
 
 select distinct (select count(*) filter (where t2.c2 = 6 and t2.c1 < 10) from ft1 t1 where t1.c1 = 6) from ft2 t2 where t2.c2 % 6 = 0 order by 1;
  count 
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index b5a0033721..dde16b5d44 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -138,6 +138,7 @@ bool		enable_partitionwise_aggregate = false;
 bool		enable_parallel_append = true;
 bool		enable_parallel_hash = true;
 bool		enable_partition_pruning = true;
+bool		enable_distinct_elimination = true;
 
 typedef struct
 {
diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index d0ff660284..dee152af29 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -30,6 +30,7 @@
 #include "optimizer/paths.h"
 #include "optimizer/planmain.h"
 #include "optimizer/tlist.h"
+#include "parser/parsetree.h"
 #include "utils/lsyscache.h"
 
 /* local functions */
@@ -47,7 +48,8 @@ static bool is_innerrel_unique_for(PlannerInfo *root,
 								   RelOptInfo *innerrel,
 								   JoinType jointype,
 								   List *restrictlist);
-
+static void transform_colno_for_subquery(Query *query, List *colnos, List *opids,
+										List **sub_colnos, List **sub_opids);
 
 /*
  * remove_useless_joins
@@ -801,9 +803,18 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 		if (l == NULL)			/* had matches for all? */
 			return true;
 	}
+	return query_is_distinct_agg(query, colnos, opids);
+}
+
+
+bool
+query_is_distinct_agg(Query *query, List *colnos, List *opids)
+{
+	ListCell   *l;
+	Oid			opid;
 
 	/*
-	 * Otherwise, a set-returning function in the query's targetlist can
+	 * a set-returning function in the query's targetlist can
 	 * result in returning duplicate rows, despite any grouping that might
 	 * occur before tlist evaluation.  (If all tlist SRFs are within GROUP BY
 	 * columns, it would be safe because they'd be expanded before grouping.
@@ -901,7 +912,6 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 				return true;
 		}
 	}
-
 	/*
 	 * XXX Are there any other cases in which we can easily see the result
 	 * must be distinct?
@@ -913,6 +923,174 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 	return false;
 }
 
+/*
+ * scan_non_semi_anti_relids
+ *
+ * Walk the jointree and add to *relids the rtindex of every RangeTblRef
+ * that is reached.  The right-hand input of a SEMI or ANTI join is not
+ * descended into, so relations found only there are left out.
+ */
+static void
+scan_non_semi_anti_relids(Node *jtnode, Relids *relids)
+{
+	if (jtnode == NULL)
+		return;
+
+	switch (nodeTag(jtnode))
+	{
+		case T_RangeTblRef:
+			*relids = bms_add_member(*relids, ((RangeTblRef *) jtnode)->rtindex);
+			break;
+		case T_FromExpr:
+			{
+				ListCell   *cell;
+
+				foreach(cell, ((FromExpr *) jtnode)->fromlist)
+					scan_non_semi_anti_relids(lfirst(cell), relids);
+			}
+			break;
+		case T_JoinExpr:
+			{
+				JoinExpr   *join = (JoinExpr *) jtnode;
+
+				scan_non_semi_anti_relids(join->larg, relids);
+				if (!(join->jointype == JOIN_SEMI || join->jointype == JOIN_ANTI))
+					scan_non_semi_anti_relids(join->rarg, relids);
+			}
+			break;
+		default:
+			elog(ERROR, "unrecognized node type: %d", (int) nodeTag(jtnode));
+			break;
+	}
+}
+
+/*
+ * transform_colno_for_subquery - for each resno in 'colnos', append the
+ * varattno of the Var in that targetlist entry and the paired operator OID.
+ */
+static void
+transform_colno_for_subquery(Query *query, List *colnos, List *opids,
+							List **sub_colnos, List **sub_opids)
+{
+	ListCell   *colcell, *opcell;
+
+	forboth(colcell, colnos, opcell, opids)
+	{
+		TargetEntry *entry = get_tle_by_resno(query->targetList, lfirst_int(colcell));
+		Assert(IsA(entry->expr, Var));
+		*sub_colnos = lappend_int(*sub_colnos, ((Var *) entry->expr)->varattno);
+		*sub_opids = lappend_oid(*sub_opids, lfirst_oid(opcell));
+	}
+}
+
+/*
+ * query_distinct_through_join
+ *		Decide whether the join result of root->parse is already unique on
+ *		the targetlist columns listed in 'colnos' (resnos), compared with the
+ *		equality operators in 'opids'.
+ *
+ * If every relation in the jointree yields a unique result on its own share
+ * of those columns, the join result is unique as well.  Relations on the
+ * right-hand side of a semi/anti join are not examined.
+ */
+bool
+query_distinct_through_join(PlannerInfo *root, List *colnos, List *opids)
+{
+	Query	   *query = root->parse;
+	Relids		non_semi_anti_relids = NULL;
+	int			max_rt_index = list_length(query->rtable) + 1;
+	List	  **non_null_expr_per_table;	/* for relation_has_unique_index_for */
+	List	  **non_null_colno_per_table;	/* for query_is_distinct_for */
+	List	  **non_null_opids_per_table;	/* for both of the above */
+	Bitmapset **non_null_varno_per_table;	/* not-null attnos, per rtindex */
+	int			rt_index;
+	ListCell   *lc1, *lc2;
+
+	/* Collect rtindexes, leaving out right-hand sides of semi/anti joins */
+	scan_non_semi_anti_relids((Node *) query->jointree, &non_semi_anti_relids);
+
+	non_null_varno_per_table = palloc0(max_rt_index * sizeof(Bitmapset *));
+	non_null_expr_per_table = palloc0(max_rt_index * sizeof(List *));
+	non_null_opids_per_table = palloc0(max_rt_index * sizeof(List *));
+	non_null_colno_per_table = palloc0(max_rt_index * sizeof(List *));
+
+	/* Record the columns that the top-level quals force to be non-null */
+	foreach(lc1, find_nonnullable_vars(query->jointree->quals))
+	{
+		Var		   *var = (Var *) lfirst(lc1);
+
+		/* Negative (system column) attnos are not valid Bitmapset members */
+		if (!IsA(var, Var) || IS_SPECIAL_VARNO(var->varno) ||
+			var->varattno < 0)
+			continue;
+		non_null_varno_per_table[var->varno] =
+			bms_add_member(non_null_varno_per_table[var->varno], var->varattno);
+	}
+
+	/* Add catalog NOT NULL columns (bms_join would recycle the rel's own set) */
+	rt_index = -1;
+	while ((rt_index = bms_next_member(non_semi_anti_relids, rt_index)) >= 0)
+	{
+		non_null_varno_per_table[rt_index] =
+			bms_add_members(non_null_varno_per_table[rt_index],
+							root->simple_rel_array[rt_index]->not_null_cols_relids);
+	}
+
+	/* Keep only the not-null plain-Var columns, split up per relation */
+	forboth(lc1, colnos, lc2, opids)
+	{
+		int			colno = lfirst_int(lc1);
+		TargetEntry *tle = get_tle_by_resno(query->targetList, colno);
+		Var		   *var;
+
+		if (tle == NULL || !IsA(tle->expr, Var))
+			continue;
+		var = (Var *) tle->expr;
+		/* bms_is_member raises an error for negative (system column) attnos */
+		if (var->varattno < 0 ||
+			!bms_is_member(var->varattno, non_null_varno_per_table[var->varno]))
+			continue;
+		non_null_expr_per_table[var->varno] =
+			lappend(non_null_expr_per_table[var->varno], var);
+		non_null_opids_per_table[var->varno] =
+			lappend_oid(non_null_opids_per_table[var->varno], lfirst_oid(lc2));
+		non_null_colno_per_table[var->varno] =
+			lappend_int(non_null_colno_per_table[var->varno], colno);
+	}
+
+	/* Every relation must be unique on its own columns; otherwise give up */
+	rt_index = -1;
+	while ((rt_index = bms_next_member(non_semi_anti_relids, rt_index)) >= 0)
+	{
+		RangeTblEntry *rte = root->simple_rte_array[rt_index];
+		RelOptInfo *rel = root->simple_rel_array[rt_index];
+
+		if (rte->rtekind == RTE_RELATION)
+		{
+			if (!relation_has_unique_index_for(root, rel, NIL,
+											   non_null_expr_per_table[rt_index],
+											   non_null_opids_per_table[rt_index]))
+				return false;
+		}
+		else if (rte->rtekind == RTE_SUBQUERY &&
+				 query_supports_distinctness(rte->subquery))
+		{
+			List	   *subquery_colnos = NIL;
+			List	   *subquery_opids = NIL;
+
+			transform_colno_for_subquery(query,
+										 non_null_colno_per_table[rt_index],
+										 non_null_opids_per_table[rt_index],
+										 &subquery_colnos, &subquery_opids);
+			if (!query_is_distinct_for(rte->subquery, subquery_colnos, subquery_opids))
+				return false;
+		}
+		else
+			return false;
+	}
+	return true;
+}
+
 /*
  * distinct_col_search - subroutine for query_is_distinct_for
  *
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index d6f2153593..9d56e6c88e 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4737,6 +4737,33 @@ create_distinct_paths(PlannerInfo *root,
 	Path	   *path;
 	ListCell   *lc;
 
+	if (enable_distinct_elimination)
+	{
+		List	   *colnos = NIL;
+		List	   *opnos = NIL;
+		ListCell   *dlc;
+
+		Assert(parse->distinctClause != NIL);
+
+		/* Gather the resno and equality operator of each plain-Var column */
+		foreach(dlc, parse->distinctClause)
+		{
+			SortGroupClause *sgc = lfirst_node(SortGroupClause, dlc);
+			/* tleSortGroupRef is a ressortgroupref, not a resno */
+			TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList);
+
+			/* x being not null says nothing about f(x), so skip non-Vars */
+			if (tle->resjunk || !IsA(tle->expr, Var))
+				continue;
+			colnos = lappend_int(colnos, tle->resno);
+			opnos = lappend_oid(opnos, sgc->eqop);
+		}
+
+		if ((query_supports_distinctness(parse) &&
+			 query_is_distinct_agg(parse, colnos, opnos)) ||
+			query_distinct_through_join(root, colnos, opnos))
+			return input_rel;
+	}
 	/* For now, do all work in the (DISTINCT, NULL) upperrel */
 	distinct_rel = fetch_upper_rel(root, UPPERREL_DISTINCT, NULL);
 
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index d82fc5ab8b..e57b456d9b 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -117,6 +117,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	Relation	relation;
 	bool		hasindex;
 	List	   *indexinfos = NIL;
+	int        i;
 
 	/*
 	 * We need not lock the relation since it was already locked, either by
@@ -460,6 +461,14 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	if (inhparent && relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
 		set_relation_partition_info(root, rel, relation);
 
+	Assert(rel->not_null_cols_relids == NULL);
+	for(i = 0; i < relation->rd_att->natts;  i++)
+	{
+		if (!relation->rd_att->attrs[i].attnotnull)
+			continue;
+		rel->not_null_cols_relids = bms_add_member(rel->not_null_cols_relids, i+1);
+	}
+
 	table_close(relation, NoLock);
 
 	/*
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index e44f71e991..fa798dd564 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1064,6 +1064,16 @@ static struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_distinct_elimination", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the planner's removal of unnecessary DISTINCT clauses."),
+			gettext_noop("Allows the query planner to remove a DISTINCT clause when the result is already known to be unique."),
+			GUC_EXPLAIN
+		},
+		&enable_distinct_elimination,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
 			gettext_noop("Enables genetic query optimization."),
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 3d3be197e0..51db013f5d 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -687,6 +687,7 @@ typedef struct RelOptInfo
 	PlannerInfo *subroot;		/* if subquery */
 	List	   *subplan_params; /* if subquery */
 	int			rel_parallel_workers;	/* wanted number of parallel workers */
+	Relids     not_null_cols_relids; /* not null cols by catalogs,starts with 1 */
 
 	/* Information about foreign tables and foreign joins */
 	Oid			serverid;		/* identifies server for the table or join */
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index cb012ba198..4fa5d32df6 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -64,6 +64,7 @@ extern PGDLLIMPORT bool enable_partitionwise_aggregate;
 extern PGDLLIMPORT bool enable_parallel_append;
 extern PGDLLIMPORT bool enable_parallel_hash;
 extern PGDLLIMPORT bool enable_partition_pruning;
+extern PGDLLIMPORT bool enable_distinct_elimination;
 extern PGDLLIMPORT int constraint_exclusion;
 
 extern double index_pages_fetched(double tuples_fetched, BlockNumber pages,
diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h
index eab486a621..ebd4f24577 100644
--- a/src/include/optimizer/planmain.h
+++ b/src/include/optimizer/planmain.h
@@ -100,6 +100,8 @@ extern List *remove_useless_joins(PlannerInfo *root, List *joinlist);
 extern void reduce_unique_semijoins(PlannerInfo *root);
 extern bool query_supports_distinctness(Query *query);
 extern bool query_is_distinct_for(Query *query, List *colnos, List *opids);
+extern bool query_is_distinct_agg(Query *query, List *colnos, List *opids);
+extern bool query_distinct_through_join(PlannerInfo *root, List *colnos, List *opids);
 extern bool innerrel_is_unique(PlannerInfo *root,
 							   Relids joinrelids, Relids outerrelids, RelOptInfo *innerrel,
 							   JoinType jointype, List *restrictlist, bool force_cache);
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index f457b5b150..6712571578 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -870,14 +870,12 @@ explain (costs off)
   select distinct max(unique2) from tenk1;
                              QUERY PLAN                              
 ---------------------------------------------------------------------
- HashAggregate
-   Group Key: $0
+ Result
    InitPlan 1 (returns $0)
      ->  Limit
            ->  Index Only Scan Backward using tenk1_unique2 on tenk1
                  Index Cond: (unique2 IS NOT NULL)
-   ->  Result
-(7 rows)
+(5 rows)
 
 select distinct max(unique2) from tenk1;
  max  
@@ -1036,7 +1034,7 @@ explain (costs off)
   select distinct min(f1), max(f1) from minmaxtest;
                                          QUERY PLAN                                          
 ---------------------------------------------------------------------------------------------
- Unique
+ Result
    InitPlan 1 (returns $0)
      ->  Limit
            ->  Merge Append
@@ -1059,10 +1057,7 @@ explain (costs off)
                  ->  Index Only Scan using minmaxtest2i on minmaxtest2 minmaxtest_8
                        Index Cond: (f1 IS NOT NULL)
                  ->  Index Only Scan Backward using minmaxtest3i on minmaxtest3 minmaxtest_9
-   ->  Sort
-         Sort Key: ($0), ($1)
-         ->  Result
-(26 rows)
+(23 rows)
 
 select distinct min(f1), max(f1) from minmaxtest;
  min | max 
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 761376b007..3f6595d53b 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -4433,17 +4433,17 @@ select d.* from d left join (select * from b group by b.id, b.c_id) s
 explain (costs off)
 select d.* from d left join (select distinct * from b) s
   on d.a = s.id;
-              QUERY PLAN              
---------------------------------------
- Merge Right Join
-   Merge Cond: (b.id = d.a)
-   ->  Unique
-         ->  Sort
-               Sort Key: b.id, b.c_id
-               ->  Seq Scan on b
+           QUERY PLAN            
+---------------------------------
+ Merge Left Join
+   Merge Cond: (d.a = s.id)
    ->  Sort
          Sort Key: d.a
          ->  Seq Scan on d
+   ->  Sort
+         Sort Key: s.id
+         ->  Subquery Scan on s
+               ->  Seq Scan on b
 (9 rows)
 
 -- check join removal works when uniqueness of the join condition is enforced
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index f3696c6d1d..73729c8606 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -244,3 +244,279 @@ SELECT null IS NOT DISTINCT FROM null as "yes";
  t
 (1 row)
 
+create table select_distinct_a(pk1 int, pk2 char(20),  uk1 char(20) not null,  uk2 int, e int, primary key(pk1, pk2));
+create unique index select_distinct_a_uk on select_distinct_a(uk1, uk2);
+create table select_distinct_b(a int, b char(20), pk1 char(20), pk2 int, e int, primary key(pk1, pk2));
+-- distinct erased since (pk1, pk2)
+explain (costs off) select distinct * from select_distinct_a;
+          QUERY PLAN           
+-------------------------------
+ Seq Scan on select_distinct_a
+(1 row)
+
+-- distinct can't be reased since since we required all the uk must be not null
+explain (costs off) select distinct uk1, uk2 from select_distinct_a;
+             QUERY PLAN              
+-------------------------------------
+ HashAggregate
+   Group Key: uk1, uk2
+   ->  Seq Scan on select_distinct_a
+(3 rows)
+
+-- distinct ereased since uk + not null
+explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 is not null;
+          QUERY PLAN           
+-------------------------------
+ Seq Scan on select_distinct_a
+   Filter: (uk2 IS NOT NULL)
+(2 rows)
+
+explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 > 1;
+          QUERY PLAN           
+-------------------------------
+ Seq Scan on select_distinct_a
+   Filter: (uk2 > 1)
+(2 rows)
+
+-- distinct erased due to group by
+explain select distinct e from select_distinct_a group by e;
+                                QUERY PLAN                                
+--------------------------------------------------------------------------
+ HashAggregate  (cost=14.88..16.88 rows=200 width=4)
+   Group Key: e
+   ->  Seq Scan on select_distinct_a  (cost=0.00..13.90 rows=390 width=4)
+(3 rows)
+
+-- distinct erased due to the restirctinfo
+explain select distinct uk1 from select_distinct_a where pk1 = 1 and pk2 = 'c';
+                                           QUERY PLAN                                            
+-------------------------------------------------------------------------------------------------
+ Index Scan using select_distinct_a_pkey on select_distinct_a  (cost=0.15..8.17 rows=1 width=84)
+   Index Cond: ((pk1 = 1) AND (pk2 = 'c'::bpchar))
+(2 rows)
+
+-- test join
+set enable_mergejoin to off;
+set enable_hashjoin to off;
+insert into select_distinct_a values(1, 'a', 'a', 0, 1), (1, 'b', 'A', 0, 2), (3, 'c', 'c', 0, 3);
+insert into select_distinct_b values(1, 'a', 'a', 0, 1), (4, 'd', 'd', 0, 4), (1, 'e', 'e', 0, 5);
+-- Cartesian join
+explain (costs off) select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null;
+                 QUERY PLAN                  
+---------------------------------------------
+ Nested Loop
+   ->  Seq Scan on select_distinct_b b
+   ->  Materialize
+         ->  Seq Scan on select_distinct_a a
+               Filter: (uk2 IS NOT NULL)
+(5 rows)
+
+select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null order by 1, 2, 3, 4;
+         uk1          | uk2 |         pk1          | pk2 
+----------------------+-----+----------------------+-----
+ a                    |   0 | a                    |   0
+ a                    |   0 | d                    |   0
+ a                    |   0 | e                    |   0
+ A                    |   0 | a                    |   0
+ A                    |   0 | d                    |   0
+ A                    |   0 | e                    |   0
+ c                    |   0 | a                    |   0
+ c                    |   0 | d                    |   0
+ c                    |   0 | e                    |   0
+(9 rows)
+
+-- left join
+explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a);
+                                    QUERY PLAN                                     
+-----------------------------------------------------------------------------------
+ Nested Loop Left Join  (cost=0.00..2310.28 rows=760 width=176)
+   Join Filter: (a.pk1 = b.a)
+   ->  Seq Scan on select_distinct_a a  (cost=0.00..13.90 rows=390 width=88)
+   ->  Materialize  (cost=0.00..15.85 rows=390 width=92)
+         ->  Seq Scan on select_distinct_b b  (cost=0.00..13.90 rows=390 width=92)
+(5 rows)
+
+select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;;
+ pk1 |         pk2          |         pk1          | pk2 
+-----+----------------------+----------------------+-----
+   1 | a                    | a                    |   0
+   1 | a                    | e                    |   0
+   1 | b                    | a                    |   0
+   1 | b                    | e                    |   0
+   3 | c                    |                      |    
+(5 rows)
+
+-- right join
+explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a);
+                                                  QUERY PLAN                                                  
+--------------------------------------------------------------------------------------------------------------
+ Nested Loop Left Join  (cost=0.15..140.88 rows=760 width=176)
+   ->  Seq Scan on select_distinct_b b  (cost=0.00..13.90 rows=390 width=92)
+   ->  Index Only Scan using select_distinct_a_pkey on select_distinct_a a  (cost=0.15..0.31 rows=2 width=88)
+         Index Cond: (pk1 = b.a)
+(4 rows)
+
+select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;
+ pk1 |         pk2          |         pk1          | pk2 
+-----+----------------------+----------------------+-----
+   1 | a                    | a                    |   0
+   1 | a                    | e                    |   0
+   1 | b                    | a                    |   0
+   1 | b                    | e                    |   0
+     |                      | d                    |   0
+(5 rows)
+
+-- full join
+explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a);
+                                    QUERY PLAN                                     
+-----------------------------------------------------------------------------------
+ Hash Full Join  (cost=10000000018.77..10000000060.26 rows=760 width=176)
+   Hash Cond: (a.pk1 = b.a)
+   ->  Seq Scan on select_distinct_a a  (cost=0.00..13.90 rows=390 width=88)
+   ->  Hash  (cost=13.90..13.90 rows=390 width=92)
+         ->  Seq Scan on select_distinct_b b  (cost=0.00..13.90 rows=390 width=92)
+(5 rows)
+
+select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;
+ pk1 |         pk2          |         pk1          | pk2 
+-----+----------------------+----------------------+-----
+   1 | a                    | a                    |   0
+   1 | a                    | e                    |   0
+   1 | b                    | a                    |   0
+   1 | b                    | e                    |   0
+   3 | c                    |                      |    
+     |                      | d                    |   0
+(6 rows)
+
+-- distinct can't be erased since b.pk2 is missed
+explain select distinct a.pk1, a.pk2, b.pk1 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a);
+                                          QUERY PLAN                                           
+-----------------------------------------------------------------------------------------------
+ Unique  (cost=10000000096.63..10000000104.23 rows=760 width=172)
+   ->  Sort  (cost=10000000096.63..10000000098.53 rows=760 width=172)
+         Sort Key: a.pk1, a.pk2, b.pk1
+         ->  Hash Full Join  (cost=10000000018.77..10000000060.26 rows=760 width=172)
+               Hash Cond: (a.pk1 = b.a)
+               ->  Seq Scan on select_distinct_a a  (cost=0.00..13.90 rows=390 width=88)
+               ->  Hash  (cost=13.90..13.90 rows=390 width=88)
+                     ->  Seq Scan on select_distinct_b b  (cost=0.00..13.90 rows=390 width=88)
+(8 rows)
+
+-- Semi/anti join
+explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 in (select a from select_distinct_b);
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
+ Nested Loop
+   ->  HashAggregate
+         Group Key: select_distinct_b.a
+         ->  Seq Scan on select_distinct_b
+   ->  Index Only Scan using select_distinct_a_pkey on select_distinct_a
+         Index Cond: (pk1 = select_distinct_b.a)
+(6 rows)
+
+explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 not in (select a from select_distinct_b);
+              QUERY PLAN               
+---------------------------------------
+ Seq Scan on select_distinct_a
+   Filter: (NOT (hashed SubPlan 1))
+   SubPlan 1
+     ->  Seq Scan on select_distinct_b
+(4 rows)
+
+-- we also can handle some limited subquery
+explain select distinct * from select_distinct_a a,  (select a from select_distinct_b group by a) b where a.pk1 = b.a;
+                                                QUERY PLAN                                                
+----------------------------------------------------------------------------------------------------------
+ Nested Loop  (cost=15.02..107.38 rows=390 width=184)
+   ->  HashAggregate  (cost=14.88..16.88 rows=200 width=4)
+         Group Key: select_distinct_b.a
+         ->  Seq Scan on select_distinct_b  (cost=0.00..13.90 rows=390 width=4)
+   ->  Index Scan using select_distinct_a_pkey on select_distinct_a a  (cost=0.15..0.42 rows=2 width=180)
+         Index Cond: (pk1 = select_distinct_b.a)
+(6 rows)
+
+select distinct * from select_distinct_a a,  (select a from select_distinct_b group by a) b where a.pk1 = b.a order by 1, 2, 3;
+ pk1 |         pk2          |         uk1          | uk2 | e | a 
+-----+----------------------+----------------------+-----+---+---
+   1 | a                    | a                    |   0 | 1 | 1
+   1 | b                    | A                    |   0 | 2 | 1
+(2 rows)
+
+explain select distinct * from select_distinct_a a,  (select distinct a from select_distinct_b) b where a.pk1 = b.a;
+                                                QUERY PLAN                                                
+----------------------------------------------------------------------------------------------------------
+ Nested Loop  (cost=15.02..107.38 rows=390 width=184)
+   ->  HashAggregate  (cost=14.88..16.88 rows=200 width=4)
+         Group Key: select_distinct_b.a
+         ->  Seq Scan on select_distinct_b  (cost=0.00..13.90 rows=390 width=4)
+   ->  Index Scan using select_distinct_a_pkey on select_distinct_a a  (cost=0.15..0.42 rows=2 width=180)
+         Index Cond: (pk1 = select_distinct_b.a)
+(6 rows)
+
+select distinct * from select_distinct_a a, (select distinct a from select_distinct_b) b where a.pk1 = b.a order by 1 ,2, 3;
+ pk1 |         pk2          |         uk1          | uk2 | e | a 
+-----+----------------------+----------------------+-----+---+---
+   1 | a                    | a                    |   0 | 1 | 1
+   1 | b                    | A                    |   0 | 2 | 1
+(2 rows)
+
+-- Distinct On
+-- can't erase since pk2 is missed
+explain (costs off) select distinct on(pk1) pk1, pk2 from select_distinct_a;
+                QUERY PLAN                 
+-------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: pk1
+         ->  Seq Scan on select_distinct_a
+(4 rows)
+
+-- ok to erase
+explain (costs off) select distinct on(pk1, pk2) pk1, pk2 from select_distinct_a;
+          QUERY PLAN           
+-------------------------------
+ Seq Scan on select_distinct_a
+(1 row)
+
+-- test some view.
+create view distinct_v1 as select distinct uk1, uk2 from select_distinct_a where uk2 is not null;
+explain (costs off) select * from distinct_v1;
+          QUERY PLAN           
+-------------------------------
+ Seq Scan on select_distinct_a
+   Filter: (uk2 IS NOT NULL)
+(2 rows)
+
+alter table select_distinct_a alter column uk1 drop not null;
+explain (costs off) select * from distinct_v1;
+                        QUERY PLAN                         
+-----------------------------------------------------------
+ HashAggregate
+   Group Key: select_distinct_a.uk1, select_distinct_a.uk2
+   ->  Seq Scan on select_distinct_a
+         Filter: (uk2 IS NOT NULL)
+(4 rows)
+
+alter table select_distinct_a alter column uk1 set not null;
+-- test generic plan
+prepare pt as select * from distinct_v1;
+explain (costs off) execute pt;
+          QUERY PLAN           
+-------------------------------
+ Seq Scan on select_distinct_a
+   Filter: (uk2 IS NOT NULL)
+(2 rows)
+
+alter table select_distinct_a alter column uk1 drop not null;
+explain (costs off) execute pt;
+                        QUERY PLAN                         
+-----------------------------------------------------------
+ HashAggregate
+   Group Key: select_distinct_a.uk1, select_distinct_a.uk2
+   ->  Seq Scan on select_distinct_a
+         Filter: (uk2 IS NOT NULL)
+(4 rows)
+
+drop view distinct_v1;
+drop table select_distinct_a;
+drop table select_distinct_b;
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index a1c90eb905..e053214f9d 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -73,6 +73,7 @@ select name, setting from pg_settings where name like 'enable%';
               name              | setting 
 --------------------------------+---------
  enable_bitmapscan              | on
+ enable_distinct_elimination    | on
  enable_gathermerge             | on
  enable_hashagg                 | on
  enable_hashjoin                | on
@@ -89,7 +90,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(17 rows)
+(18 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.  We can't test their contents in any great detail
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index a605e86449..813361ad89 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -73,3 +73,87 @@ SELECT 1 IS NOT DISTINCT FROM 2 as "no";
 SELECT 2 IS NOT DISTINCT FROM 2 as "yes";
 SELECT 2 IS NOT DISTINCT FROM null as "no";
 SELECT null IS NOT DISTINCT FROM null as "yes";
+create table select_distinct_a(pk1 int, pk2 char(20),  uk1 char(20) not null,  uk2 int, e int, primary key(pk1, pk2));
+create unique index select_distinct_a_uk on select_distinct_a(uk1, uk2);
+create table select_distinct_b(a int, b char(20), pk1 char(20), pk2 int, e int, primary key(pk1, pk2));
+
+-- distinct erased since (pk1, pk2)
+explain (costs off) select distinct * from select_distinct_a;
+
+-- distinct can't be reased since since we required all the uk must be not null
+explain (costs off) select distinct uk1, uk2 from select_distinct_a;
+
+-- distinct ereased since uk + not null
+explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 is not null;
+explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 > 1;
+
+-- distinct erased due to group by
+explain select distinct e from select_distinct_a group by e;
+
+-- distinct erased due to the restirctinfo
+explain select distinct uk1 from select_distinct_a where pk1 = 1 and pk2 = 'c';
+
+-- test join
+set enable_mergejoin to off;
+set enable_hashjoin to off;
+
+insert into select_distinct_a values(1, 'a', 'a', 0, 1), (1, 'b', 'A', 0, 2), (3, 'c', 'c', 0, 3);
+insert into select_distinct_b values(1, 'a', 'a', 0, 1), (4, 'd', 'd', 0, 4), (1, 'e', 'e', 0, 5);
+
+-- Cartesian join
+explain (costs off) select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null;
+select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null order by 1, 2, 3, 4;
+
+
+-- left join
+explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a);
+select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;;
+
+-- right join
+explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a);
+select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;
+
+-- full join
+explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a);
+select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;
+
+-- distinct can't be erased since b.pk2 is missed
+explain select distinct a.pk1, a.pk2, b.pk1 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a);
+
+
+-- Semi/anti join
+explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 in (select a from select_distinct_b);
+explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 not in (select a from select_distinct_b);
+
+-- we also can handle some limited subquery
+explain select distinct * from select_distinct_a a,  (select a from select_distinct_b group by a) b where a.pk1 = b.a;
+select distinct * from select_distinct_a a,  (select a from select_distinct_b group by a) b where a.pk1 = b.a order by 1, 2, 3;
+
+explain select distinct * from select_distinct_a a,  (select distinct a from select_distinct_b) b where a.pk1 = b.a;
+select distinct * from select_distinct_a a, (select distinct a from select_distinct_b) b where a.pk1 = b.a order by 1 ,2, 3;
+
+-- Distinct On
+-- can't erase since pk2 is missed
+explain (costs off) select distinct on(pk1) pk1, pk2 from select_distinct_a;
+-- ok to erase
+explain (costs off) select distinct on(pk1, pk2) pk1, pk2 from select_distinct_a;
+
+
+-- test some view.
+create view distinct_v1 as select distinct uk1, uk2 from select_distinct_a where uk2 is not null;
+explain (costs off) select * from distinct_v1;
+
+alter table select_distinct_a alter column uk1 drop not null;
+explain (costs off) select * from distinct_v1;
+
+alter table select_distinct_a alter column uk1 set not null;
+
+-- test generic plan
+prepare pt as select * from distinct_v1;
+explain (costs off) execute pt;
+alter table select_distinct_a alter column uk1 drop not null;
+explain (costs off) execute pt;
+
+drop view distinct_v1;
+drop table select_distinct_a;
+drop table select_distinct_b;
-- 
2.21.0