From 59d6fa8fd1dbccd60cf4a72663157fc531dbd5f9 Mon Sep 17 00:00:00 2001
From: erthalion <9erthalion6@gmail.com>
Date: Fri, 29 Mar 2019 15:20:22 +0100
Subject: [PATCH v12 3/3] Reorder by values distribution
---
src/backend/optimizer/path/pathkeys.c | 218 +++++++++++++++++++++++++++++++
src/backend/optimizer/plan/planner.c | 33 ++++-
src/backend/utils/misc/guc.c | 21 ++-
src/include/optimizer/paths.h | 10 ++
src/test/regress/expected/aggregates.out | 16 +--
5 files changed, 288 insertions(+), 10 deletions(-)
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 65e53ef854..ac79bc4975 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -24,6 +24,7 @@
#include "nodes/plannodes.h"
#include "optimizer/optimizer.h"
#include "optimizer/pathnode.h"
+#include "optimizer/plancat.h"
#include "optimizer/paths.h"
#include "partitioning/partbounds.h"
#include "utils/lsyscache.h"
@@ -332,6 +333,11 @@ pathkeys_contained_in(List *keys1, List *keys2)
return false;
}
+/* Debug switches (registered as GUCs in guc.c) for GROUP BY key reordering */
+bool debug_group_by_reorder_by_pathkeys = true;
+bool debug_cheapest_group_by = true;
+/* End of GROUP BY reordering debug switches */
+
/*
* Reorder GROUP BY pathkeys and clauses to match order of pathkeys. Function
* returns new lists, original GROUP BY lists stay untouched.
@@ -345,6 +351,9 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
ListCell *key;
int n;
+ if (debug_group_by_reorder_by_pathkeys == false)
+ return 0;
+
if (pathkeys == NIL || *group_pathkeys == NIL)
return 0;
@@ -384,6 +393,215 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
return n;
}
+/*
+ * get_width_multiplier
+ *
+ * Returns the relative cost of comparing two values of expr, based on their
+ * width (never less than sizeof(Datum)): longer values cost more to compare.
+ */
+static double
+get_width_multiplier(PlannerInfo *root, Expr *expr)
+{
+ double width = -1.0;
+
+ if (IsA(expr, RelabelType))
+ expr = (Expr *) ((RelabelType *) expr)->arg;
+
+ /* For a plain Var, try the width recorded in the relation's attr_widths */
+ if (IsA(expr, Var))
+ {
+ Var *var = (Var *) expr;
+
+ if (var->varno > 0 && var->varno < root->simple_rel_array_size)
+ {
+ RelOptInfo *rel = root->simple_rel_array[var->varno];
+
+ if (rel != NULL &&
+ var->varattno >= rel->min_attr &&
+ var->varattno <= rel->max_attr)
+ {
+ int ndx = var->varattno - rel->min_attr;
+
+ if (rel->attr_widths[ndx] > 0)
+ width = rel->attr_widths[ndx];
+ }
+ }
+ }
+
+ /* No per-column width found; fall back to the type's average width */
+ if (width < 0.0)
+ {
+ Node *node = (Node*) expr;
+
+ width = get_typavgwidth(exprType(node), exprTypmod(node));
+ }
+
+ /*
+ * Every value is passed around as a Datum, so no comparison can be cheaper
+ * than one on a Datum-sized value; clamp the result to sizeof(Datum).
+ */
+ if (width <= sizeof(Datum))
+ return sizeof(Datum);
+
+ return width;
+}
+
+/*
+ * Reorder the tail of group pathkeys by descending estimated uniqueness to
+ * speed up sorting. Sets *group_pathkeys / *group_clauses to newly built lists;
+ * the old lists stay untouched. The first n_preordered keys keep their order.
+ */
+void
+get_cheapest_group_keys_order(PlannerInfo *root, double nrows,
+ List *target_list,
+ List **group_pathkeys, List **group_clauses,
+ int n_preordered)
+{
+ struct
+ {
+ PathKey *pathkey;
+ SortGroupClause *sgc;
+ Node *pathkeyExpr;
+ }
+ *keys, tmp;
+ int nkeys = list_length(*group_pathkeys) - n_preordered;
+ List *pathkeyExprList = NIL,
+ *new_group_pathkeys = NIL,
+ *new_group_clauses = NIL;
+ ListCell *cell;
+ int i = 0, n_keys_to_est;
+
+ if (!debug_cheapest_group_by)
+ return;
+
+ if (nkeys < 2)
+ return; /* nothing to do */
+
+ /*
+ * Nothing to do here: the group clauses were already reordered to match
+ * ORDER BY in preprocess_groupclause.
+ */
+ if (n_preordered == 0 && root->sort_pathkeys)
+ return;
+
+ keys = palloc(nkeys * sizeof(*keys));
+
+ /*
+ * Collect each non-preordered pathkey with its clause and target expression
+ */
+ for_each_cell(cell, list_nth_cell(*group_pathkeys, n_preordered))
+ {
+ PathKey *pathkey = (PathKey *) lfirst(cell);
+
+ keys[i].pathkey = pathkey;
+ keys[i].sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+ *group_clauses);
+ keys[i].pathkeyExpr = get_sortgroupclause_expr(keys[i].sgc,
+ target_list);
+ i++;
+ }
+
+ /*
+ * Find the cheapest-to-sort order of columns. First pick the column with
+ * the largest number of groups, then the best pair (whose first column was
+ * fixed by the previous step), then the best triple, and so on.
+ */
+ for(n_keys_to_est = 1; n_keys_to_est <= nkeys - 1; n_keys_to_est++)
+ {
+ ListCell *tail_cell;
+ int best_i = 0;
+ double best_est_num_groups = -1;
+
+ /* extend the list of columns by a placeholder and remember its cell */
+ pathkeyExprList = lappend(pathkeyExprList, NULL);
+ tail_cell = list_tail(pathkeyExprList);
+
+ /*
+ * Find the best last column (largest cost-scaled group estimate); earlier ones
+ * are chosen. NOTE(review): confirm sortop is valid and per_tuple is nonzero.
+ */
+ for(i = n_keys_to_est - 1; i < nkeys; i++)
+ {
+ double est_num_groups;
+ Expr *expr = (Expr *) keys[i].pathkeyExpr;
+ PathKey *pathkey = keys[i].pathkey;
+ EquivalenceMember *em = (EquivalenceMember *)
+ linitial(pathkey->pk_eclass->ec_members);
+
+ lfirst(tail_cell) = keys[i].pathkeyExpr;
+ est_num_groups = estimate_num_groups(root, pathkeyExprList,
+ nrows, NULL);
+ est_num_groups /= get_width_multiplier(root, expr);
+
+ if (em->em_datatype != InvalidOid)
+ {
+ Oid sortop;
+ QualCost costs;
+ costs.startup = costs.per_tuple = 0;
+
+ sortop = get_opfamily_member(pathkey->pk_opfamily,
+ em->em_datatype, em->em_datatype,
+ pathkey->pk_strategy);
+ add_function_cost(root, get_opcode(sortop), NULL, &costs);
+
+ est_num_groups /= costs.per_tuple;
+ }
+
+ if (est_num_groups > best_est_num_groups)
+ {
+ best_est_num_groups = est_num_groups;
+ best_i = i;
+ }
+ }
+
+ /* Fix the best candidate at position n_keys_to_est - 1 (swap into place) */
+ lfirst(tail_cell) = keys[best_i].pathkeyExpr;
+ if (best_i != n_keys_to_est - 1)
+ {
+ tmp = keys[n_keys_to_est - 1];
+ keys[n_keys_to_est - 1] = keys[best_i];
+ keys[best_i] = tmp;
+ }
+ }
+ list_free(pathkeyExprList);
+
+ /*
+ * Build the result lists: preordered keys first, in their original order,
+ * then the keys array, which is now ordered for the cheapest sort.
+ */
+ i = 0;
+ foreach(cell, *group_pathkeys)
+ {
+ PathKey *pathkey;
+ SortGroupClause *sgc;
+
+ if (i < n_preordered)
+ {
+ pathkey = (PathKey *) lfirst(cell);
+ sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+ *group_clauses);
+ }
+ else
+ {
+ pathkey = keys[i - n_preordered].pathkey;
+ sgc = keys[i - n_preordered].sgc;
+ }
+
+ new_group_pathkeys = lappend(new_group_pathkeys, pathkey);
+ new_group_clauses = lappend(new_group_clauses, sgc);
+
+ i++;
+ }
+
+ pfree(keys);
+
+ /* Append the remaining GROUP BY clauses that are not in the new list yet */
+ new_group_clauses = list_concat_unique_ptr(new_group_clauses, *group_clauses);
+
+ *group_pathkeys = new_group_pathkeys;
+ *group_clauses = new_group_clauses;
+}
+
/*
* get_cheapest_path_for_pathkeys
* Find the cheapest path (according to the specified criterion) that
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 247e39d6ff..d480551d3c 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -6389,7 +6389,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
bool is_sorted;
List *group_pathkeys = root->group_pathkeys,
*group_clauses = parse->groupClause;
- int n_preordered_groups;
+ int n_preordered_groups = 0;
if (parse->groupingSets)
{
@@ -6413,11 +6413,20 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
{
/* Sort the cheapest-total path if it isn't already sorted */
if (!is_sorted)
+ {
+ if (!parse->groupingSets)
+ get_cheapest_group_keys_order(root,
+ path->rows,
+ extra->targetList,
+ &group_pathkeys,
+ &group_clauses,
+ n_preordered_groups);
path = (Path *) create_sort_path(root,
grouped_rel,
path,
group_pathkeys,
-1.0);
+ }
/* Now decide what to stick atop it */
if (parse->groupingSets)
@@ -6491,6 +6500,12 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
{
if (path != partially_grouped_rel->cheapest_total_path)
continue;
+ get_cheapest_group_keys_order(root,
+ path->rows,
+ extra->targetList,
+ &group_pathkeys,
+ &group_clauses,
+ n_preordered_groups);
path = (Path *) create_sort_path(root,
grouped_rel,
path,
@@ -6765,11 +6780,19 @@ create_partial_grouping_paths(PlannerInfo *root,
{
/* Sort the cheapest partial path, if it isn't already */
if (!is_sorted)
+ {
+ get_cheapest_group_keys_order(root,
+ path->rows,
+ extra->targetList,
+ &group_pathkeys,
+ &group_clauses,
+ n_preordered_groups);
path = (Path *) create_sort_path(root,
partially_grouped_rel,
path,
group_pathkeys,
-1.0);
+ }
if (parse->hasAggs)
add_path(partially_grouped_rel, (Path *)
@@ -6816,11 +6839,19 @@ create_partial_grouping_paths(PlannerInfo *root,
/* Sort the cheapest partial path, if it isn't already */
if (!is_sorted)
+ {
+ get_cheapest_group_keys_order(root,
+ path->rows,
+ extra->targetList,
+ &group_pathkeys,
+ &group_clauses,
+ n_preordered_groups);
path = (Path *) create_sort_path(root,
partially_grouped_rel,
path,
group_pathkeys,
-1.0);
+ }
if (parse->hasAggs)
add_partial_path(partially_grouped_rel, (Path *)
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index f7f726b5ae..f13ea2ab5d 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1952,7 +1952,26 @@ static struct config_bool ConfigureNamesBool[] =
false,
NULL, NULL, NULL
},
-
+/* Debug switches for GROUP BY key reordering (flags defined in pathkeys.c) */
+ {
+ {"debug_group_by_reorder_by_pathkeys", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("enable reorder GROUP BY by pathkeys"),
+ NULL
+ },
+ &debug_group_by_reorder_by_pathkeys,
+ true,
+ NULL, NULL, NULL
+ },
+ {
+ {"debug_enable_cheapest_group_by", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("find a cheapest order of columns in GROUP BY."),
+ NULL
+ },
+ &debug_cheapest_group_by,
+ true,
+ NULL, NULL, NULL
+ },
+/* NOTE(review): the two GUC names are inconsistent ("debug_enable_" vs "debug_") */
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index faf6449f4d..fe4c067369 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -186,6 +186,16 @@ extern bool pathkeys_contained_in(List *keys1, List *keys2);
extern int group_keys_reorder_by_pathkeys(List *pathkeys,
List **group_pathkeys,
List **group_clauses);
+/* Debug switches for GROUP BY key reordering; defined in pathkeys.c */
+extern bool debug_group_by_reorder_by_pathkeys;
+extern bool debug_cheapest_group_by;
+/* Reorders GROUP BY keys after the first n_preordered ones for a cheaper sort */
+extern void get_cheapest_group_keys_order(PlannerInfo *root,
+ double nrows,
+ List *target_list,
+ List **group_pathkeys,
+ List **group_clauses,
+ int n_preordered);
extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
Relids required_outer,
CostSelector cost_criterion,
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 265c996d5e..9afc1a827d 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -2228,9 +2228,9 @@ SELECT count(*) FROM btg GROUP BY v, p;
QUERY PLAN
-----------------------------
GroupAggregate
- Group Key: v, p
+ Group Key: p, v
-> Sort
- Sort Key: v, p
+ Sort Key: p, v
-> Seq Scan on btg
(5 rows)
@@ -2239,9 +2239,9 @@ SELECT count(*) FROM btg GROUP BY v, p, c;
QUERY PLAN
-----------------------------
GroupAggregate
- Group Key: v, p, c
+ Group Key: p, c, v
-> Sort
- Sort Key: v, p, c
+ Sort Key: p, c, v
-> Seq Scan on btg
(5 rows)
@@ -2261,9 +2261,9 @@ SELECT count(*) FROM btg GROUP BY v, p, d, c;
QUERY PLAN
------------------------------
GroupAggregate
- Group Key: v, p, d, c
+ Group Key: p, d, c, v
-> Sort
- Sort Key: v, p, d, c
+ Sort Key: p, d, c, v
-> Seq Scan on btg
(5 rows)
@@ -2318,9 +2318,9 @@ SELECT count(*) FROM btg GROUP BY p, d, e;
QUERY PLAN
-----------------------------
GroupAggregate
- Group Key: p, d, e
+ Group Key: p, e, d
-> Sort
- Sort Key: p, d, e
+ Sort Key: p, e, d
-> Seq Scan on btg
(5 rows)
--
2.16.4