From ecb14a529bc91a2a806e00f93be6402fef52b879 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=B8=80=E6=8C=83?=
Date: Wed, 6 May 2020 16:32:28 +0800
Subject: [PATCH v8 5/7] If the group by clause is unique and we have an
 aggregation function, we treat the input as sorted without an explicit sort
 since each group has only one row.

---
 src/backend/commands/explain.c           |   4 +
 src/backend/executor/nodeAgg.c           |   6 ++
 src/backend/optimizer/plan/createplan.c  |   3 +-
 src/backend/optimizer/plan/planner.c     | 127 +++++++++++++++--------
 src/include/nodes/nodes.h                |   3 +-
 src/include/nodes/pathnodes.h            |   1 +
 src/include/nodes/plannodes.h            |   1 +
 src/test/regress/expected/aggregates.out |  42 ++++++++
 src/test/regress/sql/aggregates.sql      |  17 +++
 9 files changed, 159 insertions(+), 45 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 5695802081..a7b38cfc08 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1929,6 +1929,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			show_agg_keys(castNode(AggState, planstate), ancestors, es);
 			show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
 			show_hashagg_info((AggState *) planstate, es);
+			if (es->format != EXPLAIN_FORMAT_TEXT ||
+				(es->verbose && ((Agg *) plan)->input_unique))
+				ExplainPropertyBool("Input Unique",
+								   ((Agg *) plan)->input_unique, es);
 			if (plan->qual)
 				show_instrumentation_count("Rows Removed by Filter", 1,
 										   planstate, es);
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 9f4229de60..2d5493c744 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -2145,6 +2145,12 @@ ExecAgg(PlanState *pstate)
 		case AGG_SORTED:
 			result = agg_retrieve_direct(node);
 			break;
+		case AGG_UNIQUE:
+			/* AGG_UNIQUE is translated to AGG_SORTED; handle it here
+			 * to keep the compiler quiet.
+ */ + Assert(false); + break; } if (!TupIsNull(result)) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 9941dfe65e..0049d22227 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -6353,7 +6353,7 @@ make_agg(List *tlist, List *qual, /* Reduce to long, but 'ware overflow! */ numGroups = (long) Min(dNumGroups, (double) LONG_MAX); - node->aggstrategy = aggstrategy; + node->aggstrategy = aggstrategy == AGG_UNIQUE ? AGG_SORTED : aggstrategy; node->aggsplit = aggsplit; node->numCols = numGroupCols; node->grpColIdx = grpColIdx; @@ -6364,6 +6364,7 @@ make_agg(List *tlist, List *qual, node->aggParams = NULL; /* SS_finalize_plan() will fill this */ node->groupingSets = groupingSets; node->chain = chain; + node->input_unique = aggstrategy == AGG_UNIQUE; plan->qual = qual; plan->targetlist = tlist; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 74dcd0ff3d..1e72411d51 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -3852,51 +3852,61 @@ create_grouping_paths(PlannerInfo *root, int flags = 0; GroupPathExtraData extra; - /* - * Determine whether it's possible to perform sort-based - * implementations of grouping. (Note that if groupClause is empty, - * grouping_is_sortable() is trivially true, and all the - * pathkeys_contained_in() tests will succeed too, so that we'll - * consider every surviving input path.) - * - * If we have grouping sets, we might be able to sort some but not all - * of them; in this case, we need can_sort to be true as long as we - * must consider any sorted-input plan. 
- */ - if ((gd && gd->rollups != NIL) - || grouping_is_sortable(parse->groupClause)) - flags |= GROUPING_CAN_USE_SORT; + if (group_unique_input) + { + /* In this case we don't need to set other flags */ + Assert(parse->groupClause != NIL); + Assert(gd == NULL); + flags |= GROUPING_INPUT_UNIQUE; + } + else + { + /* + * Determine whether it's possible to perform sort-based + * implementations of grouping. (Note that if groupClause is empty, + * grouping_is_sortable() is trivially true, and all the + * pathkeys_contained_in() tests will succeed too, so that we'll + * consider every surviving input path.) + * + * If we have grouping sets, we might be able to sort some but not all + * of them; in this case, we need can_sort to be true as long as we + * must consider any sorted-input plan. + */ + if ((gd && gd->rollups != NIL) + || grouping_is_sortable(parse->groupClause)) + flags |= GROUPING_CAN_USE_SORT; - /* - * Determine whether we should consider hash-based implementations of - * grouping. - * - * Hashed aggregation only applies if we're grouping. If we have - * grouping sets, some groups might be hashable but others not; in - * this case we set can_hash true as long as there is nothing globally - * preventing us from hashing (and we should therefore consider plans - * with hashes). - * - * Executor doesn't support hashed aggregation with DISTINCT or ORDER - * BY aggregates. (Doing so would imply storing *all* the input - * values in the hash table, and/or running many sorts in parallel, - * either of which seems like a certain loser.) We similarly don't - * support ordered-set aggregates in hashed aggregation, but that case - * is also included in the numOrderedAggs count. - * - * Note: grouping_is_hashable() is much more expensive to check than - * the other gating conditions, so we want to do it last. - */ - if ((parse->groupClause != NIL && - agg_costs->numOrderedAggs == 0 && - (gd ? 
gd->any_hashable : grouping_is_hashable(parse->groupClause)))) - flags |= GROUPING_CAN_USE_HASH; + /* + * Determine whether we should consider hash-based implementations of + * grouping. + * + * Hashed aggregation only applies if we're grouping. If we have + * grouping sets, some groups might be hashable but others not; in + * this case we set can_hash true as long as there is nothing globally + * preventing us from hashing (and we should therefore consider plans + * with hashes). + * + * Executor doesn't support hashed aggregation with DISTINCT or ORDER + * BY aggregates. (Doing so would imply storing *all* the input + * values in the hash table, and/or running many sorts in parallel, + * either of which seems like a certain loser.) We similarly don't + * support ordered-set aggregates in hashed aggregation, but that case + * is also included in the numOrderedAggs count. + * + * Note: grouping_is_hashable() is much more expensive to check than + * the other gating conditions, so we want to do it last. + */ + if ((parse->groupClause != NIL && + agg_costs->numOrderedAggs == 0 && + (gd ? gd->any_hashable : grouping_is_hashable(parse->groupClause)))) + flags |= GROUPING_CAN_USE_HASH; - /* - * Determine whether partial aggregation is possible. - */ - if (can_partial_agg(root, agg_costs)) - flags |= GROUPING_CAN_PARTIAL_AGG; + /* + * Determine whether partial aggregation is possible. 
+ */ + if (can_partial_agg(root, agg_costs)) + flags |= GROUPING_CAN_PARTIAL_AGG; + } extra.flags = flags; extra.target_parallel_safe = target_parallel_safe; @@ -6524,9 +6534,40 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, ListCell *lc; bool can_hash = (extra->flags & GROUPING_CAN_USE_HASH) != 0; bool can_sort = (extra->flags & GROUPING_CAN_USE_SORT) != 0; + bool group_input_unique = (extra->flags & GROUPING_INPUT_UNIQUE) != 0; List *havingQual = (List *) extra->havingQual; AggClauseCosts *agg_final_costs = &extra->agg_final_costs; + if (group_input_unique) + { + Path *path = input_rel->cheapest_total_path; + add_path(grouped_rel, (Path *) create_agg_path(root, + grouped_rel, + path, + grouped_rel->reltarget, + AGG_UNIQUE, + AGGSPLIT_SIMPLE, + parse->groupClause, + havingQual, + agg_costs, + dNumGroups)); + + if (path != input_rel->cheapest_startup_path) + { + path = input_rel->cheapest_startup_path; + add_path(grouped_rel, (Path *) create_agg_path(root, + grouped_rel, + path, + grouped_rel->reltarget, + AGG_UNIQUE, + AGGSPLIT_SIMPLE, + parse->groupClause, + havingQual, + agg_costs, + dNumGroups)); + } + return; + } if (can_sort) { /* diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 41110ed888..010266ed4b 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -761,7 +761,8 @@ typedef enum AggStrategy AGG_PLAIN, /* simple agg across all input rows */ AGG_SORTED, /* grouped agg, input must be sorted */ AGG_HASHED, /* grouped agg, use internal hashtable */ - AGG_MIXED /* grouped agg, hash and sort both used */ + AGG_MIXED, /* grouped agg, hash and sort both used */ + AGG_UNIQUE /* grouped agg, the group clause is unique */ } AggStrategy; /* diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 575353d86c..9f0cfaf094 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -2475,6 +2475,7 @@ typedef struct JoinPathExtraData #define 
GROUPING_CAN_USE_SORT 0x0001 #define GROUPING_CAN_USE_HASH 0x0002 #define GROUPING_CAN_PARTIAL_AGG 0x0004 +#define GROUPING_INPUT_UNIQUE 0x0008 /* * What kind of partitionwise aggregation is in use? diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 55f363f70c..e8300d9f37 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -828,6 +828,7 @@ typedef struct Agg /* Note: planner provides numGroups & aggParams only in HASHED/MIXED case */ List *groupingSets; /* grouping sets to use */ List *chain; /* chained Agg/Sort nodes */ + bool input_unique; /* The input is unique already */ } Agg; /* ---------------- diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 42bd180895..65912142f8 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -2544,6 +2544,46 @@ set work_mem to default; ----+----+---- (0 rows) +create table agg_unique_1(pk int primary key, b int); +create table agg_unique_2(a int, unsortable_col xid); +insert into agg_unique_2 values(1, '1'), (2, '2'), (2, '1'); +explain (costs off, verbose) select pk, sum(b) from agg_unique_1 +group by pk; + QUERY PLAN +--------------------------------------- + GroupAggregate + Output: pk, sum(b) + Group Key: agg_unique_1.pk + Input Unique: true + -> Seq Scan on public.agg_unique_1 + Output: pk, b +(6 rows) + +explain (costs off, verbose) select unsortable_col, count(*) +from (select distinct unsortable_col from agg_unique_2) t +group by unsortable_col; + QUERY PLAN +------------------------------------------------------------------- + GroupAggregate + Output: agg_unique_2.unsortable_col, count(*) + Group Key: agg_unique_2.unsortable_col + Input Unique: true + -> HashAggregate + Output: agg_unique_2.unsortable_col + Group Key: agg_unique_2.unsortable_col + -> Seq Scan on public.agg_unique_2 + Output: agg_unique_2.a, agg_unique_2.unsortable_col +(9 rows) + +select 
unsortable_col, count(*) +from (select distinct unsortable_col from agg_unique_2) t +group by unsortable_col; + unsortable_col | count +----------------+------- + 2 | 1 + 1 | 1 +(2 rows) + drop table agg_group_1; drop table agg_group_2; drop table agg_group_3; @@ -2552,3 +2592,5 @@ drop table agg_hash_1; drop table agg_hash_2; drop table agg_hash_3; drop table agg_hash_4; +drop table agg_unique_1; +drop table agg_unique_2; diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 3446c3e9fd..3dc40b13d5 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -1159,6 +1159,21 @@ set work_mem to default; union all (select * from agg_group_4 except select * from agg_hash_4); +create table agg_unique_1(pk int primary key, b int); +create table agg_unique_2(a int, unsortable_col xid); +insert into agg_unique_2 values(1, '1'), (2, '2'), (2, '1'); + +explain (costs off, verbose) select pk, sum(b) from agg_unique_1 +group by pk; + +explain (costs off, verbose) select unsortable_col, count(*) +from (select distinct unsortable_col from agg_unique_2) t +group by unsortable_col; + +select unsortable_col, count(*) +from (select distinct unsortable_col from agg_unique_2) t +group by unsortable_col; + drop table agg_group_1; drop table agg_group_2; drop table agg_group_3; @@ -1167,3 +1182,5 @@ drop table agg_hash_1; drop table agg_hash_2; drop table agg_hash_3; drop table agg_hash_4; +drop table agg_unique_1; +drop table agg_unique_2; -- 2.21.0