From 9449c09688d542c4dc201ee866f67d67304cff98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=80=E6=8C=83?= Date: Wed, 4 Mar 2020 14:33:56 +0800 Subject: [PATCH v2] [PATCH] Erase the distinct path if the result is unique by catalog For a single relation, we can tell it by any one of the following is true: 1. The pk is in the target list. 2. The uk is in the target list and the columns is not null 3. The columns in group-by clause is also in the target list for relation join, we can tell it by: if every relation in the jointree yield a unique result set,then the final result is unique as well regardless the join method. --- .../postgres_fdw/expected/postgres_fdw.out | 28 +- src/backend/optimizer/path/costsize.c | 1 + src/backend/optimizer/plan/analyzejoins.c | 184 +++++++++++- src/backend/optimizer/plan/planner.c | 27 ++ src/backend/optimizer/util/plancat.c | 9 + src/backend/utils/misc/guc.c | 10 + src/include/nodes/pathnodes.h | 1 + src/include/optimizer/cost.h | 1 + src/include/optimizer/planmain.h | 2 + src/test/regress/expected/aggregates.out | 13 +- src/test/regress/expected/join.out | 16 +- src/test/regress/expected/select_distinct.out | 276 ++++++++++++++++++ src/test/regress/expected/sysviews.out | 3 +- src/test/regress/sql/select_distinct.sql | 84 ++++++ 14 files changed, 619 insertions(+), 36 deletions(-) diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 84fd3ad2e0..215f10bf7d 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -2902,22 +2902,20 @@ select sum(c1%3), sum(distinct c1%3 order by c1%3) filter (where c1%3 < 2), c2 f -- Outer query is aggregation query explain (verbose, costs off) select distinct (select count(*) filter (where t2.c2 = 6 and t2.c1 < 10) from ft1 t1 where t1.c1 = 6) from ft2 t2 where t2.c2 % 6 = 0 order by 1; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------- - Unique + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ + Sort Output: ((SubPlan 1)) - -> Sort - Output: ((SubPlan 1)) - Sort Key: ((SubPlan 1)) - -> Foreign Scan - Output: (SubPlan 1) - Relations: Aggregate on (public.ft2 t2) - Remote SQL: SELECT count(*) FILTER (WHERE ((c2 = 6) AND ("C 1" < 10))) FROM "S 1"."T 1" WHERE (((c2 % 6) = 0)) - SubPlan 1 - -> Foreign Scan on public.ft1 t1 - Output: (count(*) FILTER (WHERE ((t2.c2 = 6) AND (t2.c1 < 10)))) - Remote SQL: SELECT NULL FROM "S 1"."T 1" WHERE (("C 1" = 6)) -(13 rows) + Sort Key: ((SubPlan 1)) + -> Foreign Scan + Output: (SubPlan 1) + Relations: Aggregate on (public.ft2 t2) + Remote SQL: SELECT count(*) FILTER (WHERE ((c2 = 6) AND ("C 1" < 10))) FROM "S 1"."T 1" WHERE (((c2 % 6) = 0)) + SubPlan 1 + -> Foreign Scan on public.ft1 t1 + Output: (count(*) FILTER (WHERE ((t2.c2 = 6) AND (t2.c1 < 10)))) + Remote SQL: SELECT NULL FROM "S 1"."T 1" WHERE (("C 1" = 6)) +(11 rows) select distinct (select count(*) filter (where t2.c2 = 6 and t2.c1 < 10) from ft1 t1 where t1.c1 = 6) from ft2 t2 where t2.c2 % 6 = 0 order by 1; count diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index b5a0033721..dde16b5d44 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -138,6 +138,7 @@ bool enable_partitionwise_aggregate = false; bool enable_parallel_append = true; bool enable_parallel_hash = true; bool enable_partition_pruning = true; +bool enable_distinct_elimination = true; typedef struct { diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index d0ff660284..dee152af29 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -30,6 +30,7 @@ #include "optimizer/paths.h" #include "optimizer/planmain.h" #include "optimizer/tlist.h" +#include "parser/parsetree.h" #include "utils/lsyscache.h" /* local functions */ @@ -47,7 +48,8 @@ static bool is_innerrel_unique_for(PlannerInfo *root, RelOptInfo *innerrel, JoinType jointype, List *restrictlist); - +static void transform_colno_for_subquery(Query *query, List *colnos, List *opids, + List **sub_colnos, List **sub_opids); /* * remove_useless_joins @@ -801,9 +803,18 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) if (l == NULL) /* had matches for all? */ return true; } + return query_is_distinct_agg(query, colnos, opids); +} + + +bool +query_is_distinct_agg(Query *query, List *colnos, List *opids) +{ + ListCell *l; + Oid opid; /* - * Otherwise, a set-returning function in the query's targetlist can + * a set-returning function in the query's targetlist can * result in returning duplicate rows, despite any grouping that might * occur before tlist evaluation. (If all tlist SRFs are within GROUP BY * columns, it would be safe because they'd be expanded before grouping. @@ -901,7 +912,6 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) return true; } } - /* * XXX Are there any other cases in which we can easily see the result * must be distinct? @@ -913,6 +923,174 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) return false; } +/* + * scan_non_semi_anti_relids + * + * Scan jointree to get rid of right table of semi/anti join rtindex. + */ +static void +scan_non_semi_anti_relids(Node* jtnode, Relids* relids) +{ + if (jtnode == NULL) + return; + + if (IsA(jtnode, RangeTblRef)) + { + int varno = ((RangeTblRef *) jtnode)->rtindex; + + *relids = bms_add_member(*relids, varno); + } + else if (IsA(jtnode, FromExpr)) + { + FromExpr *f = (FromExpr *) jtnode; + ListCell *l; + + foreach(l, f->fromlist) + scan_non_semi_anti_relids(lfirst(l), relids); + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; + + scan_non_semi_anti_relids(j->larg, relids); + if (j->jointype != JOIN_SEMI && j->jointype != JOIN_ANTI) + { + scan_non_semi_anti_relids(j->rarg, relids); + } + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); + +} + +/* + * transform_colno_for_subquery + */ +static void +transform_colno_for_subquery(Query *query, List *colnos, List *opids, + List **sub_colnos, List **sub_opids) +{ + ListCell *lc1, *lc2; + TargetEntry *tle; + + forboth(lc1, colnos, lc2, opids) + { + tle = get_tle_by_resno(query->targetList, lfirst_int(lc1)); + Assert(IsA(tle->expr, Var)); + *sub_colnos = lappend_int(*sub_colnos, ((Var*)tle->expr)->varattno); + *sub_opids = lappend_oid(*sub_opids, lfirst_oid(lc2)); + } +} + +/* + * query_distinct_through_join + * If every relation yields a unique result in the join, so the join result + * is unqiue as well. We need to distinguish right table in semi/anti + * join, which we don't care. + */ +bool +query_distinct_through_join(PlannerInfo *root, List *colnos, List *opids) +{ + Query *query = root->parse; + Relids non_semi_anti_relids = NULL; + + /* Used for relation_has_unique_for */ + List **non_null_expr_per_table = NULL; + /* Used for query_is_distinct_for */ + List **non_null_colno_per_table = NULL; + /* Used for both as above*/ + List **non_null_opids_per_table = NULL; + /* Not null info from restrictinfo and catalog */ + Bitmapset **non_null_varno_per_table = NULL; + + int rt_index; + ListCell *lc1, *lc2; + RangeTblEntry *rte; + RelOptInfo *rel; + int max_rt_index = list_length(query->rtable) + 1; + + /* Remove the relids for the right table in semi/anti join */ + scan_non_semi_anti_relids((Node*)query->jointree, &non_semi_anti_relids); + + non_null_varno_per_table = palloc0(max_rt_index * sizeof(Bitmapset *)); + + foreach(lc1, find_nonnullable_vars(query->jointree->quals)) + { + Var *var; + if (!IsA(lfirst(lc1), Var)) + continue; + var = lfirst_node(Var, lc1); + if (var->varno == INNER_VAR || + var->varno == OUTER_VAR || + var->varno == INDEX_VAR) + continue; + non_null_varno_per_table[var->varno] = bms_add_member( + non_null_varno_per_table[var->varno], var->varattno); + } + + /* Add the non null info in catalog */ + rt_index = -1; + while ((rt_index = bms_next_member(non_semi_anti_relids, rt_index)) >= 0 ) + { + non_null_varno_per_table[rt_index] = bms_join(non_null_varno_per_table[rt_index], + root->simple_rel_array[rt_index]->not_null_cols_relids); + } + + non_null_expr_per_table = palloc0(max_rt_index * sizeof(List *)); + non_null_opids_per_table = palloc0(max_rt_index * sizeof(List *)); + non_null_colno_per_table = palloc0(max_rt_index * sizeof(List *)); + + /* Filter out the nullable columns and split them per table*/ + forboth(lc1, colnos, lc2, opids) + { + int colno = lfirst_int(lc1); + TargetEntry *tle = get_tle_by_resno(query->targetList, colno); + Var *var = NULL; + if (!IsA(tle->expr, Var)) + continue; + var = (Var *)tle->expr; + if (!bms_is_member(var->varattno, non_null_varno_per_table[var->varno])) + continue; + non_null_expr_per_table[var->varno] = lappend( + non_null_expr_per_table[var->varno], tle->expr); + non_null_opids_per_table[var->varno] = lappend_oid( + non_null_opids_per_table[var->varno], lfirst_oid(lc2)); + non_null_colno_per_table[var->varno] = lappend_int( + non_null_colno_per_table[var->varno], + colno); + } + + /* Check if every relation yields a unqiue result, if anyone doesn't return false */ + rt_index = -1; + while ((rt_index = bms_next_member(non_semi_anti_relids, rt_index)) >= 0 ) + { + rte = root->simple_rte_array[rt_index]; + rel = root->simple_rel_array[rt_index]; + if (rte->rtekind == RTE_RELATION && + relation_has_unique_index_for(root, rel, NIL, + non_null_expr_per_table[rt_index], + non_null_opids_per_table[rt_index])) + continue; + if (rte->rtekind == RTE_SUBQUERY && + query_supports_distinctness(rte->subquery)) + { + List *subquery_colnos = NIL; + List *subquery_opids = NIL; + transform_colno_for_subquery(root->parse, + non_null_colno_per_table[rt_index], + non_null_opids_per_table[rt_index], + &subquery_colnos, + &subquery_opids); + if (query_is_distinct_for(rte->subquery, subquery_colnos, subquery_opids)) + continue; + return false; + } + return false; + } + return true; +} + /* * distinct_col_search - subroutine for query_is_distinct_for * diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index d6f2153593..9d56e6c88e 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4737,6 +4737,33 @@ create_distinct_paths(PlannerInfo *root, Path *path; ListCell *lc; + if (enable_distinct_elimination) + { + List *colnos = NIL; + List *opnos = NIL; + ListCell *lc; + + Assert(parse->distinctClause != NIL); + + foreach(lc, parse->distinctClause) + { + SortGroupClause *sgc = lfirst_node(SortGroupClause, lc); + int idx = sgc->tleSortGroupRef; + TargetEntry *tle = get_tle_by_resno(parse->targetList, idx); + if (tle->resjunk) + continue; + /* even column x is not null, f(x) may be null as well, so ignore it */ + if (!IsA(tle->expr, Var)) + continue; + colnos = lappend_int(colnos, idx); + opnos = lappend_oid(opnos, sgc->eqop); + } + + if ((query_supports_distinctness(parse) + && query_is_distinct_agg(parse, colnos, opnos)) || + query_distinct_through_join(root, colnos, opnos)) + return input_rel; + } /* For now, do all work in the (DISTINCT, NULL) upperrel */ distinct_rel = fetch_upper_rel(root, UPPERREL_DISTINCT, NULL); diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index d82fc5ab8b..e57b456d9b 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -117,6 +117,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, Relation relation; bool hasindex; List *indexinfos = NIL; + int i; /* * We need not lock the relation since it was already locked, either by @@ -460,6 +461,14 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, if (inhparent && relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) set_relation_partition_info(root, rel, relation); + Assert(rel->not_null_cols_relids == NULL); + for(i = 0; i < relation->rd_att->natts; i++) + { + if (!relation->rd_att->attrs[i].attnotnull) + continue; + rel->not_null_cols_relids = bms_add_member(rel->not_null_cols_relids, i+1); + } + table_close(relation, NoLock); /* diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index e44f71e991..fa798dd564 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1064,6 +1064,16 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_distinct_elimination", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables plan-time and run-time unique elimination."), + gettext_noop("Allows the query planner to remove the uncecessary distinct clause."), + GUC_EXPLAIN + }, + &enable_distinct_elimination, + true, + NULL, NULL, NULL + }, { {"geqo", PGC_USERSET, QUERY_TUNING_GEQO, gettext_noop("Enables genetic query optimization."), diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 3d3be197e0..51db013f5d 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -687,6 +687,7 @@ typedef struct RelOptInfo PlannerInfo *subroot; /* if subquery */ List *subplan_params; /* if subquery */ int rel_parallel_workers; /* wanted number of parallel workers */ + Relids not_null_cols_relids; /* not null cols by catalogs,starts with 1 */ /* Information about foreign tables and foreign joins */ Oid serverid; /* identifies server for the table or join */ diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index cb012ba198..4fa5d32df6 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -64,6 +64,7 @@ extern PGDLLIMPORT bool enable_partitionwise_aggregate; extern PGDLLIMPORT bool enable_parallel_append; extern PGDLLIMPORT bool enable_parallel_hash; extern PGDLLIMPORT bool enable_partition_pruning; +extern PGDLLIMPORT bool enable_distinct_elimination; extern PGDLLIMPORT int constraint_exclusion; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index eab486a621..ebd4f24577 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -100,6 +100,8 @@ extern List *remove_useless_joins(PlannerInfo *root, List *joinlist); extern void reduce_unique_semijoins(PlannerInfo *root); extern bool query_supports_distinctness(Query *query); extern bool query_is_distinct_for(Query *query, List *colnos, List *opids); +extern bool query_is_distinct_agg(Query *query, List *colnos, List *opids); +extern bool query_distinct_through_join(PlannerInfo *root, List *colnos, List *opids); extern bool innerrel_is_unique(PlannerInfo *root, Relids joinrelids, Relids outerrelids, RelOptInfo *innerrel, JoinType jointype, List *restrictlist, bool force_cache); diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index f457b5b150..6712571578 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -870,14 +870,12 @@ explain (costs off) select distinct max(unique2) from tenk1; QUERY PLAN --------------------------------------------------------------------- - HashAggregate - Group Key: $0 + Result InitPlan 1 (returns $0) -> Limit -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) - -> Result -(7 rows) +(5 rows) select distinct max(unique2) from tenk1; max @@ -1036,7 +1034,7 @@ explain (costs off) select distinct min(f1), max(f1) from minmaxtest; QUERY PLAN --------------------------------------------------------------------------------------------- - Unique + Result InitPlan 1 (returns $0) -> Limit -> Merge Append @@ -1059,10 +1057,7 @@ explain (costs off) -> Index Only Scan using minmaxtest2i on minmaxtest2 minmaxtest_8 Index Cond: (f1 IS NOT NULL) -> Index Only Scan Backward using minmaxtest3i on minmaxtest3 minmaxtest_9 - -> Sort - Sort Key: ($0), ($1) - -> Result -(26 rows) +(23 rows) select distinct min(f1), max(f1) from minmaxtest; min | max diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 761376b007..3f6595d53b 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -4433,17 +4433,17 @@ select d.* from d left join (select * from b group by b.id, b.c_id) s explain (costs off) select d.* from d left join (select distinct * from b) s on d.a = s.id; - QUERY PLAN --------------------------------------- - Merge Right Join - Merge Cond: (b.id = d.a) - -> Unique - -> Sort - Sort Key: b.id, b.c_id - -> Seq Scan on b + QUERY PLAN +--------------------------------- + Merge Left Join + Merge Cond: (d.a = s.id) -> Sort Sort Key: d.a -> Seq Scan on d + -> Sort + Sort Key: s.id + -> Subquery Scan on s + -> Seq Scan on b (9 rows) -- check join removal works when uniqueness of the join condition is enforced diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out index f3696c6d1d..73729c8606 100644 --- a/src/test/regress/expected/select_distinct.out +++ b/src/test/regress/expected/select_distinct.out @@ -244,3 +244,279 @@ SELECT null IS NOT DISTINCT FROM null as "yes"; t (1 row) +create table select_distinct_a(pk1 int, pk2 char(20), uk1 char(20) not null, uk2 int, e int, primary key(pk1, pk2)); +create unique index select_distinct_a_uk on select_distinct_a(uk1, uk2); +create table select_distinct_b(a int, b char(20), pk1 char(20), pk2 int, e int, primary key(pk1, pk2)); +-- distinct erased since (pk1, pk2) +explain (costs off) select distinct * from select_distinct_a; + QUERY PLAN +------------------------------- + Seq Scan on select_distinct_a +(1 row) + +-- distinct can't be reased since since we required all the uk must be not null +explain (costs off) select distinct uk1, uk2 from select_distinct_a; + QUERY PLAN +------------------------------------- + HashAggregate + Group Key: uk1, uk2 + -> Seq Scan on select_distinct_a +(3 rows) + +-- distinct ereased since uk + not null +explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 is not null; + QUERY PLAN +------------------------------- + Seq Scan on select_distinct_a + Filter: (uk2 IS NOT NULL) +(2 rows) + +explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 > 1; + QUERY PLAN +------------------------------- + Seq Scan on select_distinct_a + Filter: (uk2 > 1) +(2 rows) + +-- distinct erased due to group by +explain select distinct e from select_distinct_a group by e; + QUERY PLAN +-------------------------------------------------------------------------- + HashAggregate (cost=14.88..16.88 rows=200 width=4) + Group Key: e + -> Seq Scan on select_distinct_a (cost=0.00..13.90 rows=390 width=4) +(3 rows) + +-- distinct erased due to the restirctinfo +explain select distinct uk1 from select_distinct_a where pk1 = 1 and pk2 = 'c'; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Index Scan using select_distinct_a_pkey on select_distinct_a (cost=0.15..8.17 rows=1 width=84) + Index Cond: ((pk1 = 1) AND (pk2 = 'c'::bpchar)) +(2 rows) + +-- test join +set enable_mergejoin to off; +set enable_hashjoin to off; +insert into select_distinct_a values(1, 'a', 'a', 0, 1), (1, 'b', 'A', 0, 2), (3, 'c', 'c', 0, 3); +insert into select_distinct_b values(1, 'a', 'a', 0, 1), (4, 'd', 'd', 0, 4), (1, 'e', 'e', 0, 5); +-- Cartesian join +explain (costs off) select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null; + QUERY PLAN +--------------------------------------------- + Nested Loop + -> Seq Scan on select_distinct_b b + -> Materialize + -> Seq Scan on select_distinct_a a + Filter: (uk2 IS NOT NULL) +(5 rows) + +select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null order by 1, 2, 3, 4; + uk1 | uk2 | pk1 | pk2 +----------------------+-----+----------------------+----- + a | 0 | a | 0 + a | 0 | d | 0 + a | 0 | e | 0 + A | 0 | a | 0 + A | 0 | d | 0 + A | 0 | e | 0 + c | 0 | a | 0 + c | 0 | d | 0 + c | 0 | e | 0 +(9 rows) + +-- left join +explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a); + QUERY PLAN +----------------------------------------------------------------------------------- + Nested Loop Left Join (cost=0.00..2310.28 rows=760 width=176) + Join Filter: (a.pk1 = b.a) + -> Seq Scan on select_distinct_a a (cost=0.00..13.90 rows=390 width=88) + -> Materialize (cost=0.00..15.85 rows=390 width=92) + -> Seq Scan on select_distinct_b b (cost=0.00..13.90 rows=390 width=92) +(5 rows) + +select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;; + pk1 | pk2 | pk1 | pk2 +-----+----------------------+----------------------+----- + 1 | a | a | 0 + 1 | a | e | 0 + 1 | b | a | 0 + 1 | b | e | 0 + 3 | c | | +(5 rows) + +-- right join +explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a); + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Nested Loop Left Join (cost=0.15..140.88 rows=760 width=176) + -> Seq Scan on select_distinct_b b (cost=0.00..13.90 rows=390 width=92) + -> Index Only Scan using select_distinct_a_pkey on select_distinct_a a (cost=0.15..0.31 rows=2 width=88) + Index Cond: (pk1 = b.a) +(4 rows) + +select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4; + pk1 | pk2 | pk1 | pk2 +-----+----------------------+----------------------+----- + 1 | a | a | 0 + 1 | a | e | 0 + 1 | b | a | 0 + 1 | b | e | 0 + | | d | 0 +(5 rows) + +-- full join +explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a); + QUERY PLAN +----------------------------------------------------------------------------------- + Hash Full Join (cost=10000000018.77..10000000060.26 rows=760 width=176) + Hash Cond: (a.pk1 = b.a) + -> Seq Scan on select_distinct_a a (cost=0.00..13.90 rows=390 width=88) + -> Hash (cost=13.90..13.90 rows=390 width=92) + -> Seq Scan on select_distinct_b b (cost=0.00..13.90 rows=390 width=92) +(5 rows) + +select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4; + pk1 | pk2 | pk1 | pk2 +-----+----------------------+----------------------+----- + 1 | a | a | 0 + 1 | a | e | 0 + 1 | b | a | 0 + 1 | b | e | 0 + 3 | c | | + | | d | 0 +(6 rows) + +-- distinct can't be erased since b.pk2 is missed +explain select distinct a.pk1, a.pk2, b.pk1 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a); + QUERY PLAN +----------------------------------------------------------------------------------------------- + Unique (cost=10000000096.63..10000000104.23 rows=760 width=172) + -> Sort (cost=10000000096.63..10000000098.53 rows=760 width=172) + Sort Key: a.pk1, a.pk2, b.pk1 + -> Hash Full Join (cost=10000000018.77..10000000060.26 rows=760 width=172) + Hash Cond: (a.pk1 = b.a) + -> Seq Scan on select_distinct_a a (cost=0.00..13.90 rows=390 width=88) + -> Hash (cost=13.90..13.90 rows=390 width=88) + -> Seq Scan on select_distinct_b b (cost=0.00..13.90 rows=390 width=88) +(8 rows) + +-- Semi/anti join +explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 in (select a from select_distinct_b); + QUERY PLAN +------------------------------------------------------------------------- + Nested Loop + -> HashAggregate + Group Key: select_distinct_b.a + -> Seq Scan on select_distinct_b + -> Index Only Scan using select_distinct_a_pkey on select_distinct_a + Index Cond: (pk1 = select_distinct_b.a) +(6 rows) + +explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 not in (select a from select_distinct_b); + QUERY PLAN +--------------------------------------- + Seq Scan on select_distinct_a + Filter: (NOT (hashed SubPlan 1)) + SubPlan 1 + -> Seq Scan on select_distinct_b +(4 rows) + +-- we also can handle some limited subquery +explain select distinct * from select_distinct_a a, (select a from select_distinct_b group by a) b where a.pk1 = b.a; + QUERY PLAN +---------------------------------------------------------------------------------------------------------- + Nested Loop (cost=15.02..107.38 rows=390 width=184) + -> HashAggregate (cost=14.88..16.88 rows=200 width=4) + Group Key: select_distinct_b.a + -> Seq Scan on select_distinct_b (cost=0.00..13.90 rows=390 width=4) + -> Index Scan using select_distinct_a_pkey on select_distinct_a a (cost=0.15..0.42 rows=2 width=180) + Index Cond: (pk1 = select_distinct_b.a) +(6 rows) + +select distinct * from select_distinct_a a, (select a from select_distinct_b group by a) b where a.pk1 = b.a order by 1, 2, 3; + pk1 | pk2 | uk1 | uk2 | e | a +-----+----------------------+----------------------+-----+---+--- + 1 | a | a | 0 | 1 | 1 + 1 | b | A | 0 | 2 | 1 +(2 rows) + +explain select distinct * from select_distinct_a a, (select distinct a from select_distinct_b) b where a.pk1 = b.a; + QUERY PLAN +---------------------------------------------------------------------------------------------------------- + Nested Loop (cost=15.02..107.38 rows=390 width=184) + -> HashAggregate (cost=14.88..16.88 rows=200 width=4) + Group Key: select_distinct_b.a + -> Seq Scan on select_distinct_b (cost=0.00..13.90 rows=390 width=4) + -> Index Scan using select_distinct_a_pkey on select_distinct_a a (cost=0.15..0.42 rows=2 width=180) + Index Cond: (pk1 = select_distinct_b.a) +(6 rows) + +select distinct * from select_distinct_a a, (select distinct a from select_distinct_b) b where a.pk1 = b.a order by 1 ,2, 3; + pk1 | pk2 | uk1 | uk2 | e | a +-----+----------------------+----------------------+-----+---+--- + 1 | a | a | 0 | 1 | 1 + 1 | b | A | 0 | 2 | 1 +(2 rows) + +-- Distinct On +-- can't erase since pk2 is missed +explain (costs off) select distinct on(pk1) pk1, pk2 from select_distinct_a; + QUERY PLAN +------------------------------------------- + Unique + -> Sort + Sort Key: pk1 + -> Seq Scan on select_distinct_a +(4 rows) + +-- ok to erase +explain (costs off) select distinct on(pk1, pk2) pk1, pk2 from select_distinct_a; + QUERY PLAN +------------------------------- + Seq Scan on select_distinct_a +(1 row) + +-- test some view. +create view distinct_v1 as select distinct uk1, uk2 from select_distinct_a where uk2 is not null; +explain select * from distinct_v1; + QUERY PLAN +--------------------------------------------------------------------- + Seq Scan on select_distinct_a (cost=0.00..13.90 rows=388 width=88) + Filter: (uk2 IS NOT NULL) +(2 rows) + +alter table select_distinct_a alter column uk1 drop not null; +explain select * from distinct_v1; + QUERY PLAN +--------------------------------------------------------------------------- + HashAggregate (cost=15.84..17.84 rows=200 width=88) + Group Key: select_distinct_a.uk1, select_distinct_a.uk2 + -> Seq Scan on select_distinct_a (cost=0.00..13.90 rows=388 width=88) + Filter: (uk2 IS NOT NULL) +(4 rows) + +alter table select_distinct_a alter column uk1 set not null; +-- test generic plan +prepare pt as select * from distinct_v1; +explain execute pt; + QUERY PLAN +--------------------------------------------------------------------- + Seq Scan on select_distinct_a (cost=0.00..13.90 rows=388 width=88) + Filter: (uk2 IS NOT NULL) +(2 rows) + +alter table select_distinct_a alter column uk1 drop not null; +explain execute pt; + QUERY PLAN +--------------------------------------------------------------------------- + HashAggregate (cost=15.84..17.84 rows=200 width=88) + Group Key: select_distinct_a.uk1, select_distinct_a.uk2 + -> Seq Scan on select_distinct_a (cost=0.00..13.90 rows=388 width=88) + Filter: (uk2 IS NOT NULL) +(4 rows) + +drop view distinct_v1; +drop table select_distinct_a; +drop table select_distinct_b; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index a1c90eb905..e053214f9d 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -73,6 +73,7 @@ select name, setting from pg_settings where name like 'enable%'; name | setting --------------------------------+--------- enable_bitmapscan | on + enable_distinct_elimination | on enable_gathermerge | on enable_hashagg | on enable_hashjoin | on @@ -89,7 +90,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(17 rows) +(18 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql index a605e86449..813361ad89 100644 --- a/src/test/regress/sql/select_distinct.sql +++ b/src/test/regress/sql/select_distinct.sql @@ -73,3 +73,87 @@ SELECT 1 IS NOT DISTINCT FROM 2 as "no"; SELECT 2 IS NOT DISTINCT FROM 2 as "yes"; SELECT 2 IS NOT DISTINCT FROM null as "no"; SELECT null IS NOT DISTINCT FROM null as "yes"; +create table select_distinct_a(pk1 int, pk2 char(20), uk1 char(20) not null, uk2 int, e int, primary key(pk1, pk2)); +create unique index select_distinct_a_uk on select_distinct_a(uk1, uk2); +create table select_distinct_b(a int, b char(20), pk1 char(20), pk2 int, e int, primary key(pk1, pk2)); + +-- distinct erased since (pk1, pk2) +explain (costs off) select distinct * from select_distinct_a; + +-- distinct can't be reased since since we required all the uk must be not null +explain (costs off) select distinct uk1, uk2 from select_distinct_a; + +-- distinct ereased since uk + not null +explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 is not null; +explain (costs off) select distinct uk1, uk2 from select_distinct_a where uk2 > 1; + +-- distinct erased due to group by +explain select distinct e from select_distinct_a group by e; + +-- distinct erased due to the restirctinfo +explain select distinct uk1 from select_distinct_a where pk1 = 1 and pk2 = 'c'; + +-- test join +set enable_mergejoin to off; +set enable_hashjoin to off; + +insert into select_distinct_a values(1, 'a', 'a', 0, 1), (1, 'b', 'A', 0, 2), (3, 'c', 'c', 0, 3); +insert into select_distinct_b values(1, 'a', 'a', 0, 1), (4, 'd', 'd', 0, 4), (1, 'e', 'e', 0, 5); + +-- Cartesian join +explain (costs off) select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null; +select distinct a.uk1, a.uk2, b.pk1, b.pk2 from select_distinct_a a, select_distinct_b b where a.uk2 is not null order by 1, 2, 3, 4; + + +-- left join +explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a); +select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a left join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4;; + +-- right join +explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a); +select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a right join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4; + +-- full join +explain select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a); +select distinct a.pk1, a.pk2, b.pk1, b.pk2 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a) order by 1, 2, 3, 4; + +-- distinct can't be erased since b.pk2 is missed +explain select distinct a.pk1, a.pk2, b.pk1 from select_distinct_a a full outer join select_distinct_b b on (a.pk1 = b.a); + + +-- Semi/anti join +explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 in (select a from select_distinct_b); +explain (costs off) select distinct pk1, pk2 from select_distinct_a where pk1 not in (select a from select_distinct_b); + +-- we also can handle some limited subquery +explain select distinct * from select_distinct_a a, (select a from select_distinct_b group by a) b where a.pk1 = b.a; +select distinct * from select_distinct_a a, (select a from select_distinct_b group by a) b where a.pk1 = b.a order by 1, 2, 3; + +explain select distinct * from select_distinct_a a, (select distinct a from select_distinct_b) b where a.pk1 = b.a; +select distinct * from select_distinct_a a, (select distinct a from select_distinct_b) b where a.pk1 = b.a order by 1 ,2, 3; + +-- Distinct On +-- can't erase since pk2 is missed +explain (costs off) select distinct on(pk1) pk1, pk2 from select_distinct_a; +-- ok to erase +explain (costs off) select distinct on(pk1, pk2) pk1, pk2 from select_distinct_a; + + +-- test some view. +create view distinct_v1 as select distinct uk1, uk2 from select_distinct_a where uk2 is not null; +explain select * from distinct_v1; + +alter table select_distinct_a alter column uk1 drop not null; +explain select * from distinct_v1; + +alter table select_distinct_a alter column uk1 set not null; + +-- test generic plan +prepare pt as select * from distinct_v1; +explain execute pt; +alter table select_distinct_a alter column uk1 drop not null; +explain execute pt; + +drop view distinct_v1; +drop table select_distinct_a; +drop table select_distinct_b; -- 2.21.0