From b9ceb880a2377ae2570a3fb2aa4f862d5b8846fd Mon Sep 17 00:00:00 2001 From: Ashutosh Bapat Date: Tue, 7 Feb 2017 16:04:03 +0530 Subject: [PATCH 08/11] Partition-wise join Implement partition-wise join for join between single level partitioned tables. The details of this technique can be found in optimizer/README, where most of the implementation has been explained. We obtain clauses applicable to a child-join by translating corresponding clauses of the parent. Because child-join can be computed by different combinations of joining child relations, a given clause is required to be translated multiple times. In order to reduce the memory consumption, we keep a repository of child-clauses derived from a parent clause and search in that repository before translating. Tests for semi-join, those forcing a merge join for child-join and those testing lateral join will crash with this patch. The tests testing joins with partition pruning will also fail. Those crashes and failures are because the existing code does not expect a child-join to appear in certain cases. Following patches will fix that code. --- src/backend/nodes/copyfuncs.c | 9 + src/backend/optimizer/README | 53 ++++ src/backend/optimizer/path/allpaths.c | 305 ++++++++++++++++--- src/backend/optimizer/path/costsize.c | 3 + src/backend/optimizer/path/joinpath.c | 21 +- src/backend/optimizer/path/joinrels.c | 421 ++++++++++++++++++++++++++ src/backend/optimizer/plan/createplan.c | 227 +++++++++++++- src/backend/optimizer/prep/prepunion.c | 160 ++++++++++ src/backend/optimizer/util/pathnode.c | 113 +++++++ src/backend/optimizer/util/placeholder.c | 55 ++++ src/backend/optimizer/util/relnode.c | 266 +++++++++++++++- src/backend/utils/misc/guc.c | 28 ++ src/include/nodes/nodes.h | 1 + src/include/nodes/relation.h | 57 ++++ src/include/optimizer/cost.h | 5 + src/include/optimizer/pathnode.h | 6 + src/include/optimizer/paths.h | 5 + src/include/optimizer/placeholder.h | 2 + src/include/optimizer/prep.h | 8 + src/test/regress/expected/partition_join.out | 4 + src/test/regress/expected/sysviews.out | 29 +- src/test/regress/sql/partition_join.sql | 5 + 22 files changed, 1713 insertions(+), 70 deletions(-) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 30d733e..72c021e 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -2070,6 +2070,15 @@ _copyRestrictInfo(const RestrictInfo *from) COPY_SCALAR_FIELD(left_bucketsize); COPY_SCALAR_FIELD(right_bucketsize); + /* + * Do not copy parent_rinfo and child_rinfos because 1. they create a + * circular dependency between child and parent RestrictInfo 2. dropping + * those links just means that we loose some memory optimizations. 3. There + * is a possibility that the child and parent RestrictInfots themselves may + * have got copied and thus the old links may no longer be valid. The + * caller may set up those links itself, if needed. + */ + return newnode; } diff --git a/src/backend/optimizer/README b/src/backend/optimizer/README index fc0fca4..7565ae4 100644 --- a/src/backend/optimizer/README +++ b/src/backend/optimizer/README @@ -1076,3 +1076,56 @@ be desirable to postpone the Gather stage until as near to the top of the plan as possible. Expanding the range of cases in which more work can be pushed below the Gather (and costing them accurately) is likely to keep us busy for a long time to come. + +Partition-wise joins +-------------------- +A join between two similarly partitioned tables can be broken down into joins +between their matching partitions if there exists an equi-join condition +between the partition keys of the joining tables. The equi-join between +partition keys implies that for a given row in a given partition of a given +partitioned table, its joining row, if exists, should exist only in the +matching partition of the other partitioned table; no row from non-matching +partitions in the other partitioned table can join with the given row from the +first table. This condition allows the join between partitioned table to be +broken into joins between the matching partitions. The resultant join is +partitioned in the same way as the joining relations, thus allowing an N-way +join between similarly partitioned tables having equi-join condition between +their partition keys to be broken down into N-way joins between their matching +partitions. This technique of breaking down a join between partition tables +into join between their partitions is called partition-wise join. We will use +term "partitioned relation" for both partitioned table as well as join between +partitioned tables which can use partition-wise join technique. + +Partitioning properties of a partitioned table are stored in +PartitionSchemeData structure. Planner maintains a list of canonical partition +schemes (distinct PartitionSchemeData objects) so that any two partitioned +relations with same partitioning scheme share the same PartitionSchemeData +object. This reduces memory consumed by PartitionSchemeData objects and makes +it easy to compare the partition schemes of joining relations. RelOptInfos of +partitioned relations hold partition key expressions and the RelOptInfos of +the partition relations of that relation. + +Partition-wise joins are planned in two phases + +1. First phase creates the RelOptInfos for joins between matching partitions, +henceforth referred to as child-joins. The number of paths created for a +child-join i.e. join between partitions is same as the number of paths created +for join between parents. That number grows exponentially with the number of +base relations being joined. The time and memory consumed to create paths for +each child-join will be proporional to the number of partitions. This will not +scale well with thousands of partitions. Instead of that we estimate +partition-wise join cost based on the costs of sampled child-joins. We choose +child-joins with higher sizes to have realistic estimates. If the number of +sampled child-joins is same as the number of live child-joins, we create append +paths as we know costs of all required child-joins. Otherwise we create +PartitionJoinPaths with cost estimates based on the costs of sampled +child-joins. While creating append paths or PartitionJoin paths we create paths +for all the different possible parameterizations and pathkeys available in the +sampled child-joins. + +2. If PartitionJoinPath emerges as the best possible path, we create paths for +each unsampled child-join. From every child-join we choose the cheapest path +with same parameterization or pathkeys as the PartitionJoinPath. This path is +converted into a plan and all the child-join plans are combined using an Append +or MergeAppend plan as appropriate. We use a fresh memory context for planning +each unsampled child-join, thus reducing memory consumption. diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 0eb56f3..1adf6ba 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -15,6 +15,7 @@ #include "postgres.h" +#include "miscadmin.h" #include #include @@ -93,8 +94,8 @@ static void set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, static void set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); static void generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel, - List *live_childrels, - List *all_child_pathkeys); + List *live_childrels, List *all_child_pathkeys, + bool partition_join_path); static Path *get_cheapest_parameterized_child_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer); @@ -128,8 +129,8 @@ static void recurse_push_qual(Node *setOp, Query *topquery, static void remove_unused_subquery_outputs(Query *subquery, RelOptInfo *rel); static int compute_parallel_worker(RelOptInfo *rel, BlockNumber pages); static void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, - List *live_childrels); - + List *live_childrels, bool partition_join_path); +static int compare_rel_size(const void *rel1_p, const void *rel2_p); /* * make_one_rel @@ -891,6 +892,12 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, childrel = find_base_rel(root, childRTindex); Assert(childrel->reloptkind == RELOPT_OTHER_MEMBER_REL); + /* Pass top parent's relids down the inheritance hierarchy. */ + if (rel->top_parent_relids) + childrel->top_parent_relids = rel->top_parent_relids; + else + childrel->top_parent_relids = bms_copy(rel->relids); + /* * Two partitioned tables with the same partitioning scheme, have their * partition bounds arranged in the same order. The order of partition @@ -900,10 +907,15 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, * RelOptInfos. Arranging RelOptInfos of partitions in the same order * as their OIDs makes it easy to find the RelOptInfos of matching * partitions for partition-wise join. + * + * For a partitioned tables, individual partitions can participate in + * the pair-wise joins. We need attr_needed data for building + * child-join targetlists. */ if (rel->part_scheme) { int cnt_parts; + AttrNumber attno; for (cnt_parts = 0; cnt_parts < nparts; cnt_parts++) { @@ -913,6 +925,38 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, rel->part_rels[cnt_parts] = childrel; } } + + for (attno = rel->min_attr; attno <= rel->max_attr; attno++) + { + int index = attno - rel->min_attr; + Relids attr_needed = bms_copy(rel->attr_needed[index]); + + /* + * System attributes do not need translation. In such a case, + * the attribute numbers of the parent and the child should + * start from the same minimum attribute. + */ + if (attno <= 0) + { + Assert(rel->min_attr == childrel->min_attr); + childrel->attr_needed[index] = attr_needed; + } + else + { + Var *var = list_nth(appinfo->translated_vars, + attno - 1); + int child_index; + + /* + * Parent Var for a user defined attribute translates to + * child Var. + */ + Assert(IsA(var, Var)); + + child_index = var->varattno - childrel->min_attr; + childrel->attr_needed[child_index] = attr_needed; + } + } } /* @@ -1057,10 +1101,8 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, * PlaceHolderVars.) XXX we do not bother to update the cost or width * fields of childrel->reltarget; not clear if that would be useful. */ - childrel->joininfo = (List *) - adjust_appendrel_attrs(root, - (Node *) rel->joininfo, - appinfo_list); + childrel->joininfo = build_child_clauses(root, rel->joininfo, + appinfo_list); childrel->reltarget->exprs = (List *) adjust_appendrel_attrs(root, (Node *) rel->reltarget->exprs, @@ -1079,14 +1121,6 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, childrel->has_eclass_joins = rel->has_eclass_joins; /* - * Note: we could compute appropriate attr_needed data for the child's - * variables, by transforming the parent's attr_needed through the - * translated_vars mapping. However, currently there's no need - * because attr_needed is only examined for base relations not - * otherrels. So we just leave the child's attr_needed empty. - */ - - /* * If parallelism is allowable for this query in general, see whether * it's allowable for this childrel in particular. But if we've * already decided the appendrel is not parallel-safe as a whole, @@ -1269,10 +1303,9 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, } /* Add Append/MergeAppend paths to the "append" relation. */ - add_paths_to_append_rel(root, rel, live_childrels); + add_paths_to_append_rel(root, rel, live_childrels, false); } - /* * add_paths_to_append_rel * Generate Append/MergeAppend paths for given "append" relation. @@ -1282,20 +1315,44 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, * an append path collecting one path from each non-dummy child with given * parameterization or ordering. Similarly it collects partial paths from * non-dummy children to create partial append paths. + * + * When called on partitioned join relation with partition_join_path = true, it + * adds PartitionJoinPath instead of Merge/Append path. This path is costed + * based on the costs of sampled child-join and is expanded later into + * Merge/Append plan. */ static void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, - List *live_childrels) + List *live_childrels, bool partition_join_path) { List *subpaths = NIL; bool subpaths_valid = true; List *partial_subpaths = NIL; - bool partial_subpaths_valid = true; + bool partial_subpaths_valid; List *all_child_pathkeys = NIL; List *all_child_outers = NIL; ListCell *l; /* + * While creating PartitionJoinPath, we sample paths from only a few child + * relations. Even if all of sampled children have partial paths, it's not + * guaranteed that all the unsampled children will have partial paths. + * Hence we do not create partial PartitionJoinPaths. + */ + partial_subpaths_valid = !partition_join_path ? true : false; + + /* An append relation with all its children dummy is dummy. */ + if (live_childrels == NIL) + { + /* Mark the relation as dummy, if not already done so. */ + if (!IS_DUMMY_REL(rel)) + set_dummy_rel_pathlist(rel); + + /* No more paths need to be added. */ + return; + } + + /* * For every non-dummy child, remember the cheapest path. Also, identify * all pathkeys (orderings) and parameterizations (required_outer sets) * available for the non-dummy member relations. @@ -1394,7 +1451,17 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * if we have zero or one live subpath due to constraint exclusion.) */ if (subpaths_valid) - add_path(rel, (Path *) create_append_path(rel, subpaths, NULL, 0)); + { + Path *path; + + if (partition_join_path) + path = (Path *) create_partition_join_path(root, rel, subpaths, + NULL, NIL); + else + path = (Path *) create_append_path(rel, subpaths, NULL, 0); + + add_path(rel, path); + } /* * Consider an append of partial unordered, unparameterized partial paths. @@ -1405,6 +1472,8 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, ListCell *lc; int parallel_workers = 0; + Assert(!partition_join_path); + /* * Decide on the number of workers to request for this append path. * For now, we just use the maximum value from among the members. It @@ -1431,7 +1500,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, */ if (subpaths_valid) generate_mergeappend_paths(root, rel, live_childrels, - all_child_pathkeys); + all_child_pathkeys, partition_join_path); /* * Build Append paths for each parameterization seen among the child rels. @@ -1472,8 +1541,18 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, } if (subpaths_valid) - add_path(rel, (Path *) - create_append_path(rel, subpaths, required_outer, 0)); + { + Path *path; + + if (partition_join_path) + path = (Path *) create_partition_join_path(root, rel, subpaths, + required_outer, NIL); + else + path = (Path *) create_append_path(rel, subpaths, + required_outer, 0); + + add_path(rel, path); + } } } @@ -1499,11 +1578,16 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * parameterized mergejoin plans, it might be worth adding support for * parameterized MergeAppends to feed such joins. (See notes in * optimizer/README for why that might not ever happen, though.) + * + * When called on partitioned join relation with partition_join_path = true, it + * adds PartitionJoinPath with pathkeys instead of MergeAppend path. This path + * is costed based on the costs of sampled child-join and is expanded later + * into MergeAppend plan. */ static void generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel, List *live_childrels, - List *all_child_pathkeys) + List *all_child_pathkeys, bool partition_join_path) { ListCell *lcp; @@ -1514,6 +1598,7 @@ generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel, List *total_subpaths = NIL; bool startup_neq_total = false; ListCell *lcr; + Path *path; /* Select the child paths for this ordering... */ foreach(lcr, live_childrels) @@ -1560,18 +1645,29 @@ generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel, accumulate_append_subpath(total_subpaths, cheapest_total); } - /* ... and build the MergeAppend paths */ - add_path(rel, (Path *) create_merge_append_path(root, - rel, - startup_subpaths, - pathkeys, - NULL)); + /* ... and build the paths */ + if (partition_join_path) + path = (Path *) create_partition_join_path(root, rel, + startup_subpaths, + NULL, pathkeys); + else + path = (Path *) create_merge_append_path(root, rel, + startup_subpaths, + pathkeys, NULL); + add_path(rel, path); + if (startup_neq_total) - add_path(rel, (Path *) create_merge_append_path(root, - rel, - total_subpaths, - pathkeys, - NULL)); + { + if (partition_join_path) + path = (Path *) create_partition_join_path(root, rel, + total_subpaths, + NULL, pathkeys); + else + path = (Path *) create_merge_append_path(root, rel, + total_subpaths, + pathkeys, NULL); + add_path(rel, path); + } } } @@ -2316,8 +2412,17 @@ standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels) * Run generate_gather_paths() for each just-processed joinrel. We * could not do this earlier because both regular and partial paths * can get added to a particular joinrel at multiple times within - * join_search_one_level. After that, we're done creating paths for - * the joinrel, so run set_cheapest(). + * join_search_one_level. + * + * Similarly, create paths for joinrels which used partition-wise join + * technique. generate_partition_wise_join_paths() creates paths for + * only few of the child-joins with highest sizes. Though we calculate + * size of a child-join only once; when it gets created, it may be + * deemed empty while considering various join orders within + * join_search_one_level. + * + * After that, we're done creating paths for the joinrel, so run + * set_cheapest(). */ foreach(lc, root->join_rel_level[lev]) { @@ -2326,6 +2431,9 @@ standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels) /* Create GatherPaths for any useful partial paths for rel */ generate_gather_paths(root, rel); + /* Create paths for partition-wise joins. */ + generate_partition_wise_join_paths(root, rel); + /* Find and save the cheapest paths for this rel */ set_cheapest(rel); @@ -3011,6 +3119,127 @@ compute_parallel_worker(RelOptInfo *rel, BlockNumber pages) return parallel_workers; } +/* + * Function to compare estimated sizes of two relations to be used with + * qsort(). + */ +static int +compare_rel_size(const void *rel1_p, const void *rel2_p) +{ + RelOptInfo *rel1 = *(RelOptInfo **) rel1_p; + RelOptInfo *rel2 = *(RelOptInfo **) rel2_p; + + return (int) (rel1->rows - rel2->rows); +} + +/* + * generate_partition_wise_join_paths + * + * Create paths representing partition-wise join for given partitioned + * join relation. + * + * The number of paths created for a child-join is same as the number of paths + * created for join between parents. That number grows exponentially with the + * number of base relations being joined. The time and memory consumed to + * create paths for each child-join will be proporional to the number of + * partitions. This will not scale well with thousands of partitions. Instead + * of that we estimate partition-wise join cost based on the costs of sampled + * child-joins. We choose child-joins with higher sizes to have realistic + * estimates. + * + * This must be called after we have considered all joining orders since + * certain join orders may allow us to deem a child-join as dummy. + */ +void +generate_partition_wise_join_paths(PlannerInfo *root, RelOptInfo *rel) +{ + List *sampled_children = NIL; + int cnt_parts; + int num_part_to_plan; + int num_parts; + bool partition_join_path = false; + int num_dummy_parts = 0; + RelOptInfo **ordered_part_rels; + + /* Handle only join relations. */ + if (!IS_JOIN_REL(rel)) + return; + + /* + * If none of the join orders for this relation could use partition-wise + * join technique, the join is not partitioned. Reset the partitioning + * scheme. + */ + if (!rel->part_rels) + rel->part_scheme = NULL; + + /* If the relation is not partitioned or is proven dummy, nothing to do. */ + if (!rel->part_scheme || IS_DUMMY_REL(rel)) + return; + + /* Guard against stack overflow due to overly deep partition hierarchy. */ + check_stack_depth(); + + num_parts = rel->part_scheme->nparts; + + /* Calculate number of child-joins to sample. */ + num_part_to_plan = num_parts * sample_partition_fraction; + if (num_part_to_plan < 1) + num_part_to_plan = 1; + + /* Order the child-join relations by their size. */ + ordered_part_rels = (RelOptInfo **) palloc(sizeof(RelOptInfo *) * + num_parts); + memcpy(ordered_part_rels, rel->part_rels, + sizeof(RelOptInfo *) * num_parts); + qsort(ordered_part_rels, num_parts, sizeof(RelOptInfo *), + compare_rel_size); + + /* + * Create paths for the child-joins for required number of largest + * relations. qsort() returns relations ordered in ascending sizes, so + * start from the end of the array. + */ + for (cnt_parts = num_parts - 1; cnt_parts >= 0; cnt_parts--) + { + RelOptInfo *child_rel = rel->part_rels[cnt_parts]; + + /* Create paths for this child. */ + add_paths_to_child_joinrel(root, rel, cnt_parts); + + /* Dummy children will not be scanned, so ingore those. */ + if (IS_DUMMY_REL(child_rel)) + { + num_dummy_parts++; + continue; + } + +#ifdef OPTIMIZER_DEBUG + debug_print_rel(root, rel); +#endif + + sampled_children = lappend(sampled_children, child_rel); + + if (list_length(sampled_children) >= num_part_to_plan) + break; + } + pfree(ordered_part_rels); + + /* + * If the number of samples is same as the number of live children, an + * append path will do. Otherwise, we will cost the partition-wise join + * based on the sampled children using PartitionJoinPath. + */ + if (num_part_to_plan < num_parts - num_dummy_parts) + partition_join_path = true; + + /* Add paths for partition-wise join based on the sampled children. */ + add_paths_to_append_rel(root, rel, sampled_children, partition_join_path); + + if (sampled_children) + list_free(sampled_children); +} + /***************************************************************************** * DEBUG SUPPORT diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index a43daa7..c720115 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -126,6 +126,9 @@ bool enable_nestloop = true; bool enable_material = true; bool enable_mergejoin = true; bool enable_hashjoin = true; +bool enable_partition_wise_join = true; +double partition_wise_plan_weight = DEFAULT_PARTITION_WISE_PLAN_WEIGHT; +double sample_partition_fraction = DEFAULT_SAMPLE_PARTITION_FRACTION; typedef struct { diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 2897245..f80fb25 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -96,6 +96,19 @@ add_paths_to_joinrel(PlannerInfo *root, JoinPathExtraData extra; bool mergejoin_allowed = true; ListCell *lc; + Relids joinrelids; + + /* + * PlannerInfo doesn't contain the SpecialJoinInfos created for joins + * between child relations, even if there is a SpecialJoinInfo node for + * the join between the topmost parents. Hence while calculating Relids + * set representing the restriction, consider relids of topmost parent + * of partitions. + */ + if (joinrel->reloptkind == RELOPT_OTHER_JOINREL) + joinrelids = joinrel->top_parent_relids; + else + joinrelids = joinrel->relids; extra.restrictlist = restrictlist; extra.mergeclause_list = NIL; @@ -149,16 +162,16 @@ add_paths_to_joinrel(PlannerInfo *root, * join has already been proven legal.) If the SJ is relevant, it * presents constraints for joining to anything not in its RHS. */ - if (bms_overlap(joinrel->relids, sjinfo2->min_righthand) && - !bms_overlap(joinrel->relids, sjinfo2->min_lefthand)) + if (bms_overlap(joinrelids, sjinfo2->min_righthand) && + !bms_overlap(joinrelids, sjinfo2->min_lefthand)) extra.param_source_rels = bms_join(extra.param_source_rels, bms_difference(root->all_baserels, sjinfo2->min_righthand)); /* full joins constrain both sides symmetrically */ if (sjinfo2->jointype == JOIN_FULL && - bms_overlap(joinrel->relids, sjinfo2->min_lefthand) && - !bms_overlap(joinrel->relids, sjinfo2->min_righthand)) + bms_overlap(joinrelids, sjinfo2->min_lefthand) && + !bms_overlap(joinrelids, sjinfo2->min_righthand)) extra.param_source_rels = bms_join(extra.param_source_rels, bms_difference(root->all_baserels, sjinfo2->min_lefthand)); diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 936ee0c..7476e8e 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -14,9 +14,14 @@ */ #include "postgres.h" +#include "miscadmin.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" #include "optimizer/joininfo.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" +#include "optimizer/prep.h" +#include "optimizer/cost.h" #include "utils/memutils.h" @@ -35,6 +40,14 @@ static bool restriction_is_constant_false(List *restrictlist, static void populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, RelOptInfo *joinrel, SpecialJoinInfo *sjinfo, List *restrictlist); +static void try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, + RelOptInfo *rel2, RelOptInfo *joinrel, + SpecialJoinInfo *parent_sjinfo, + List *parent_restrictlist); +static bool have_partkey_equi_join(RelOptInfo *rel1, RelOptInfo *rel2, + JoinType jointype, List *restrictlist); +static int match_expr_to_partition_keys(Expr *expr, RelOptInfo *rel); +static void free_special_join_info(SpecialJoinInfo *sjinfo); /* @@ -731,6 +744,9 @@ make_join_rel(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2) populate_joinrel_with_paths(root, rel1, rel2, joinrel, sjinfo, restrictlist); + /* Apply partition-wise join technique, if possible. */ + try_partition_wise_join(root, rel1, rel2, joinrel, sjinfo, restrictlist); + bms_free(joinrelids); return joinrel; @@ -1269,3 +1285,408 @@ restriction_is_constant_false(List *restrictlist, bool only_pushed_down) } return false; } + +/* Free memory used by SpecialJoinInfo. */ +static void +free_special_join_info(SpecialJoinInfo *sjinfo) +{ + bms_free(sjinfo->min_lefthand); + bms_free(sjinfo->syn_lefthand); + bms_free(sjinfo->syn_righthand); + pfree(sjinfo); +} + +/* + * Assess whether join between given two partitioned relations can be broken + * down into joins between matching partitions; a technique called + * "partition-wise join" + * + * Partition-wise join is possible when a. Joining relations have same + * partitioning scheme b. There exists an equi-join between the partition keys + * of the two relations. + * + * Partition-wise join is planned as follows (details: optimizer/README.) + * + * 1. Create the RelOptInfos for joins between matching partitions i.e + * child-joins and estimate sizes of those. This function is responsible for + * this phase. + * + * 2. Add paths representing partition-wise join. The second phase is + * implemented by generate_partition_wise_join_paths(). In order to save time + * and memory consumed in creating paths for every child-join, we create paths + * for only few child-joins. + * + * 3. Create merge/append plan to combining plans for every child-join, + * creating paths for remaining child-joins. + * + * The RelOptInfo, SpecialJoinInfo and restrictlist for each child join are + * obtained by translating the respective parent join structures. + */ +static void +try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, + RelOptInfo *joinrel, SpecialJoinInfo *parent_sjinfo, + List *parent_restrictlist) +{ + int nparts; + int cnt_parts; + PartitionScheme part_scheme; + PartitionedJoin *partitioned_join; + + /* Guard against stack overflow due to overly deep partition hierarchy. */ + check_stack_depth(); + + /* Nothing to do, if the join relation is not partitioned. */ + if (!joinrel->part_scheme) + return; + + /* + * If any of the joining parent relations is proven empty, either the join + * will be empty (INNER join) or will have the inner side all nullified. We + * take care of such cases when creating join paths for parent relations. + * Nothing to be done here. Also, nothing to do, if the parent join is + * proven empty. + */ + if (IS_DUMMY_REL(rel1) || IS_DUMMY_REL(rel2) || IS_DUMMY_REL(joinrel)) + return; + + /* + * Partitioning scheme in join relation indicates a possibility that the + * join may be partitioned, but it's not necessary that every pair of + * joining relations can use partition-wise join technique. If one of + * joining relations turns out to be unpartitioned, this pair of joining + * relations can not use partition-wise join technique. + */ + if (!rel1->part_scheme || !rel2->part_scheme) + return; + + /* + * If an equi-join condition between the partition keys of the joining + * relations does not exist, this pair of joining relations can not use + * partition-wise technique. + */ + if (!have_partkey_equi_join(rel1, rel2, parent_sjinfo->jointype, + parent_restrictlist)) + return; + + /* + * The partition scheme of the join relation should match that of the + * joining relations. + */ + Assert(joinrel->part_scheme == rel1->part_scheme && + joinrel->part_scheme == rel2->part_scheme); + + /* We should have RelOptInfos of the partitions available. */ + Assert(rel1->part_rels && rel2->part_rels); + + part_scheme = joinrel->part_scheme; + nparts = part_scheme->nparts; + + /* + * We do not store information about valid pairs of joining child + * relations. The pair of joining relations for a child-join can be derived + * from valid pairs of joining parent relations. Amongst the valid pairs of + * parent joining relations, only those which result in partitioned join + * matter for partition-wise join. Remember those so that we can use them + * for creating paths for few child-joins in + * generate_partition_wise_join_paths() later. + */ + partitioned_join = (PartitionedJoin *) palloc(sizeof(PartitionedJoin)); + partitioned_join->rel1 = rel1; + partitioned_join->rel2 = rel2; + partitioned_join->sjinfo = copyObject(parent_sjinfo); + partitioned_join->restrictlist = parent_restrictlist; + joinrel->partitioned_joins = lappend(joinrel->partitioned_joins, + partitioned_join); + + elog(DEBUG3, "join between relations %s and %s is considered for partition-wise join.", + bmsToString(rel1->relids), bmsToString(rel2->relids)); + + /* We are done if child RelOptInfos are already created. */ + if (joinrel->part_rels) + return; + + /* Create all the child RelOptInfos. */ + joinrel->part_rels = (RelOptInfo **) palloc0(sizeof(RelOptInfo *) * nparts); + + /* + * Create child join relations for this partitioned join. While doing so, + * we estimate sizes of these child join relations. These estimates are + * used to find the representative child relations used for costing the + * partition-wise join later. + */ + for (cnt_parts = 0; cnt_parts < nparts; cnt_parts++) + { + RelOptInfo *child_rel1 = rel1->part_rels[cnt_parts]; + RelOptInfo *child_rel2 = rel2->part_rels[cnt_parts]; + SpecialJoinInfo *child_sjinfo; + List *child_restrictlist; + RelOptInfo *child_joinrel; + + /* We should never try to join two overlapping sets of rels. */ + Assert(!bms_overlap(child_rel1->relids, child_rel2->relids)); + + Assert (!joinrel->part_rels[cnt_parts]); + + child_joinrel = build_child_join_rel(root, child_rel1, child_rel2, + joinrel, parent_sjinfo->jointype); + + joinrel->part_rels[cnt_parts] = child_joinrel; + + /* + * Construct restrictions applicable to the child join from + * those applicable to the parent join. + */ + child_restrictlist = build_child_clauses(root, parent_restrictlist, + find_appinfos_by_relids(root, + child_joinrel->relids)); + + /* + * Construct SpecialJoinInfo from parent join relations's + * SpecialJoinInfo. + */ + child_sjinfo = build_child_join_sjinfo(root, parent_sjinfo, + child_rel1->relids, + child_rel2->relids); + + /* + * Set estimates of the child-joinrel's size. + */ + set_joinrel_size_estimates(root, child_joinrel, child_rel1, child_rel2, + child_sjinfo, child_restrictlist); + + /* + * If the child relations themselves are partitioned, try partition-wise join + * recursively. + */ + try_partition_wise_join(root, child_rel1, child_rel2, child_joinrel, + child_sjinfo, child_restrictlist); + + free_special_join_info(child_sjinfo); + child_sjinfo = NULL; + } +} + +/* + * add_paths_to_child_join + * Add paths to 'child_id'th child of given parent join relation. + * + * The function creates paths for given child-join by joining corresponding + * children of every pair of joining parent relations which produces + * partitioned join. Since we create paths only for sampled child-joins, either + * of the children being joined may not have paths. In that case, this function + * is called recursively to populate paths for those. + */ +void +add_paths_to_child_joinrel(PlannerInfo *root, RelOptInfo *parent_joinrel, + int child_id) +{ + ListCell *lc; + RelOptInfo *child_joinrel = parent_joinrel->part_rels[child_id]; + + Assert(IS_JOIN_REL(parent_joinrel)); + + /* If this child relation already has paths, nothing to do. */ + if (child_joinrel->cheapest_total_path) + return; + + /* A dummy relation will have a dummy path as the cheapest path. */ + Assert(!is_dummy_rel(child_joinrel)); + + /* + * For every partitioned join order, calculate paths for the joining + * child relations and then calculate paths for given child. + */ + foreach (lc, parent_joinrel->partitioned_joins) + { + PartitionedJoin *pj = lfirst(lc); + RelOptInfo *rel1 = pj->rel1; + RelOptInfo *rel2 = pj->rel2; + RelOptInfo *child_rel1 = rel1->part_rels[child_id]; + RelOptInfo *child_rel2 = rel2->part_rels[child_id]; + SpecialJoinInfo *child_sjinfo; + List *child_restrictlist; + + /* + * Add paths to joining relation if it is a join itself. + * Paths for child base relations are created in + * set_append_rel_pathlist(). + */ + if (IS_JOIN_REL(pj->rel1)) + add_paths_to_child_joinrel(root, rel1, child_id); + + if (IS_JOIN_REL(pj->rel2)) + add_paths_to_child_joinrel(root, rel2, child_id); + + /* + * Construct SpecialJoinInfo from parent join relations's + * SpecialJoinInfo. + */ + child_sjinfo = build_child_join_sjinfo(root, pj->sjinfo, + child_rel1->relids, + child_rel2->relids); + + + /* + * Construct restrictions applicable to the child join from + * those applicable to the parent join. + */ + child_restrictlist = build_child_clauses(root, pj->restrictlist, + find_appinfos_by_relids(root, + child_joinrel->relids)); + + /* Add paths for child join. */ + populate_joinrel_with_paths(root, rel1->part_rels[child_id], + rel2->part_rels[child_id], child_joinrel, + child_sjinfo, child_restrictlist); + + /* Add partition-wise join paths for partitioned child-joins. */ + generate_partition_wise_join_paths(root, child_joinrel); + + free_special_join_info(child_sjinfo); + child_sjinfo = NULL; + } + + set_cheapest(child_joinrel); +} + +/* + * Returns true if there exists an equi-join condition for each pair of + * partition key from given relations being joined. + */ +static bool +have_partkey_equi_join(RelOptInfo *rel1, RelOptInfo *rel2, + JoinType jointype, List *restrictlist) +{ + PartitionScheme part_scheme = rel1->part_scheme; + ListCell *lc; + int cnt_pks; + int num_pks; + bool *pk_has_clause; + + /* + * This function should be called when the joining relations have same + * partitioning scheme. + */ + Assert(rel1->part_scheme == rel2->part_scheme); + Assert(part_scheme); + + num_pks = part_scheme->partnatts; + + pk_has_clause = (bool *) palloc0(sizeof(bool) * num_pks); + + foreach (lc, restrictlist) + { + RestrictInfo *rinfo = lfirst(lc); + OpExpr *opexpr; + Expr *expr1; + Expr *expr2; + int ipk1; + int ipk2; + + /* If processing an outer join, only use its own join clauses. */ + if (IS_OUTER_JOIN(jointype) && rinfo->is_pushed_down) + continue; + + /* Skip clauses which can not be used for a join. */ + if (!rinfo->can_join) + continue; + + /* Skip clauses which are not equality conditions. */ + if (rinfo->hashjoinoperator == InvalidOid && !rinfo->mergeopfamilies) + continue; + + opexpr = (OpExpr *) rinfo->clause; + Assert(is_opclause(opexpr)); + + + /* Match the operands to the relation. */ + if (bms_is_subset(rinfo->left_relids, rel1->relids) && + bms_is_subset(rinfo->right_relids, rel2->relids)) + { + expr1 = linitial(opexpr->args); + expr2 = lsecond(opexpr->args); + } + else if (bms_is_subset(rinfo->left_relids, rel2->relids) && + bms_is_subset(rinfo->right_relids, rel1->relids)) + { + expr1 = lsecond(opexpr->args); + expr2 = linitial(opexpr->args); + } + else + continue; + + /* Associate matching clauses with partition keys. */ + ipk1 = match_expr_to_partition_keys(expr1, rel1); + ipk2 = match_expr_to_partition_keys(expr2, rel2); + + /* + * If the clause refers to different partition keys from + * both relations, it can not be used for partition-wise join. + */ + if (ipk1 != ipk2) + continue; + + /* + * The clause allows partition-wise join if only it uses the same + * operator family as that specified by the partition key. + */ + if (!list_member_oid(rinfo->mergeopfamilies, + part_scheme->partopfamily[ipk1])) + continue; + + /* Mark the partition key as having an equi-join clause. */ + pk_has_clause[ipk1] = true; + } + + /* Check whether every partition key has an equi-join condition. */ + for (cnt_pks = 0; cnt_pks < num_pks; cnt_pks++) + { + if (!pk_has_clause[cnt_pks]) + { + pfree(pk_has_clause); + return false; + } + } + + pfree(pk_has_clause); + return true; +} + +/* + * Find the partition key from the given relation matching the given + * expression. If found, return the index of the partition key, else return -1. + */ +static int +match_expr_to_partition_keys(Expr *expr, RelOptInfo *rel) +{ + int cnt_pks; + int num_pks; + + /* This function should be called only for partitioned relations. */ + Assert(rel->part_scheme); + + num_pks = rel->part_scheme->partnatts; + + /* + * Remove the relabel decoration. We can assume that there is at most one + * RelabelType node; eval_const_expressions() simplifies multiple + * RelabelType nodes into one. + */ + if (IsA(expr, RelabelType)) + expr = (Expr *) ((RelabelType *) expr)->arg; + + for (cnt_pks = 0; cnt_pks < num_pks; cnt_pks++) + { + List *pkexprs = rel->partexprs[cnt_pks]; + ListCell *lc; + + foreach(lc, pkexprs) + { + Expr *pkexpr = lfirst(lc); + if (equal(pkexpr, expr)) + return cnt_pks; + } + } + + return -1; +} diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 997bdcf..fe6b7f8 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -42,6 +42,7 @@ #include "parser/parse_clause.h" #include "parser/parsetree.h" #include "utils/lsyscache.h" +#include "utils/memutils.h" /* @@ -146,6 +147,8 @@ static CustomScan *create_customscan_plan(PlannerInfo *root, static NestLoop *create_nestloop_plan(PlannerInfo *root, NestPath *best_path); static MergeJoin *create_mergejoin_plan(PlannerInfo *root, MergePath *best_path); static HashJoin *create_hashjoin_plan(PlannerInfo *root, HashPath *best_path); +static Plan *create_partition_join_plan(PlannerInfo *root, + PartitionJoinPath *best_path); static Node *replace_nestloop_params(PlannerInfo *root, Node *expr); static Node *replace_nestloop_params_mutator(Node *node, PlannerInfo *root); static void process_subquery_nestloop_params(PlannerInfo *root, @@ -369,12 +372,20 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags) (JoinPath *) best_path); break; case T_Append: - plan = create_append_plan(root, - (AppendPath *) best_path); + if (IsA(best_path, PartitionJoinPath)) + plan = create_partition_join_plan(root, + (PartitionJoinPath *)best_path); + else + plan = create_append_plan(root, + (AppendPath *) best_path); break; case T_MergeAppend: - plan = create_merge_append_plan(root, - (MergeAppendPath *) best_path); + if (IsA(best_path, PartitionJoinPath)) + plan = create_partition_join_plan(root, + (PartitionJoinPath *)best_path); + else + plan = create_merge_append_plan(root, + (MergeAppendPath *) best_path); break; case T_Result: if (IsA(best_path, ProjectionPath)) @@ -3982,6 +3993,214 @@ create_hashjoin_plan(PlannerInfo *root, return join_plan; } +/* + * create_partition_join_plan + * Creates Merge/Append plan consisting of join plans for child-join. + * + * Returns a Plan node. + */ +static Plan * +create_partition_join_plan(PlannerInfo *root, PartitionJoinPath *best_path) +{ + RelOptInfo *joinrel = best_path->path.parent; + int nparts; + int cnt_parts; + List *child_plans = NIL; + List *tlist = build_path_tlist(root, &best_path->path); + Plan *plan; + MemoryContext child_context; + MemoryContext old_context; + List *pathkeys = best_path->path.pathkeys; + StringInfoData mem_context_name; + + /* The relation should be a partitioned join relation. */ + Assert(IS_JOIN_REL(joinrel) && joinrel->part_scheme && + joinrel->partitioned_joins); + + nparts = joinrel->part_scheme->nparts; + + /* Create MergeAppend plan when result is expected to be ordered. */ + if (pathkeys) + { + MergeAppend *node = makeNode(MergeAppend); + plan = &node->plan; + + plan->targetlist = tlist; + + /* Compute sorting info, and adjust MergeAppend's tlist as needed. */ + (void) prepare_sort_from_pathkeys(plan, pathkeys, + best_path->path.parent->relids, + NULL, + true, + &node->numCols, + &node->sortColIdx, + &node->sortOperators, + &node->collations, + &node->nullsFirst); + } + else + { + Append *node = makeNode(Append); + plan = &node->plan; + plan->targetlist = tlist; + } + + /* Fill costs, so that we can cost Sort node, if required. */ + copy_generic_path_info(plan, (Path *) best_path); + + /* + * Create a new memory context for planning child joins. Since this routine + * may be called recursively for tables with subpartitions, we use + * a unique context name for every level of partition by using the lowest + * relid amongst the base relations being joined. + */ + initStringInfo(&mem_context_name); + appendStringInfo(&mem_context_name, "%s_%d", "ChildJoinContext", + bms_next_member(joinrel->relids, -1)); + child_context = AllocSetContextCreate(CurrentMemoryContext, + pstrdup(mem_context_name.data), + ALLOCSET_DEFAULT_SIZES); + pfree(mem_context_name.data); + resetStringInfo(&mem_context_name); + + /* + * Create a paths for all child joins, one child join at a time. The paths + * for every child join are independent i.e. one child does not require + * paths created for the other. In order to avoid accumulating memory + * consumed while creating paths for every child join, we use a fresh + * memory context for every child join. + */ + for (cnt_parts = 0; cnt_parts < nparts; cnt_parts++) + { + RelOptInfo *child_join; + Path *child_path = NULL; + Plan *child_plan; + int numsortkeys; + AttrNumber *sortColIdx; + Oid *sortOperators; + Oid *collations; + bool *nullsFirst; + + /* + * Create paths for the child join in a separate context, so that we + * can reuse the memory used by those paths. + */ + old_context = MemoryContextSwitchTo(child_context); + + add_paths_to_child_joinrel(root, joinrel, cnt_parts); + + child_join = joinrel->part_rels[cnt_parts]; + + + /* Skip empty child. */ + if (IS_DUMMY_REL(child_join)) + { + MemoryContextSwitchTo(old_context); + continue; + } + +#ifdef OPTIMIZER_DEBUG + debug_print_rel(root, rel); +#endif + + /* + * Search for a child path with pathkeys or parameterization + * matching that of the given path. + */ + child_path = get_cheapest_path_for_pathkeys(child_join->pathlist, + best_path->path.pathkeys, + PATH_REQ_OUTER(&best_path->path), + TOTAL_COST); + + if (!child_path) + elog(ERROR, "Could not find a path with required pathkeys."); + + MemoryContextSwitchTo(old_context); + + /* Create plan for the current child. */ + child_plan = create_plan_recurse(root, child_path, CP_EXACT_TLIST); + + if (pathkeys) + { + MergeAppend *node = (MergeAppend *) plan; + + Assert(IsA(node, MergeAppend)); + + /* Compute sorting info, and adjust subplan's tlist as needed */ + child_plan = prepare_sort_from_pathkeys(child_plan, pathkeys, + child_path->parent->relids, + node->sortColIdx, + false, + &numsortkeys, + &sortColIdx, + &sortOperators, + &collations, + &nullsFirst); + + /* + * Check that we got the same sort key information. We just Assert + * that the sortops match, since those depend only on the pathkeys; + * but it seems like a good idea to check the sort column numbers + * explicitly, to ensure the tlists really do match up. + */ + Assert(numsortkeys == node->numCols); + if (memcmp(sortColIdx, node->sortColIdx, + numsortkeys * sizeof(AttrNumber)) != 0) + elog(ERROR, "MergeAppend child's targetlist doesn't match MergeAppend"); + Assert(memcmp(sortOperators, node->sortOperators, + numsortkeys * sizeof(Oid)) == 0); + Assert(memcmp(collations, node->collations, + numsortkeys * sizeof(Oid)) == 0); + Assert(memcmp(nullsFirst, node->nullsFirst, + numsortkeys * sizeof(bool)) == 0); + + /* Now, insert a Sort node if subplan isn't sufficiently ordered */ + if (!pathkeys_contained_in(pathkeys, child_path->pathkeys)) + { + Sort *sort = make_sort(child_plan, numsortkeys, + sortColIdx, sortOperators, + collations, nullsFirst); + label_sort_with_costsize(root, sort, -1.0); + child_plan = (Plan *) sort; + } + } + + child_plans = lappend(child_plans, child_plan); + + /* + * Reset the child_join memory context to reclaim the memory consumed + * while creating paths. + */ + MemoryContextResetAndDeleteChildren(child_context); + } + + /* Destroy the child context as we do not need it anymore. */ + Assert(CurrentMemoryContext == old_context); + MemoryContextDelete(child_context); + + /* Partitioned relation with all empty children gets a dummy path. */ + Assert(child_plans != NIL); + + if (IsA(plan, MergeAppend)) + { + MergeAppend *node = (MergeAppend *)plan; + + node->mergeplans = child_plans; + } + else + { + Append *node = (Append *)plan; + + Assert(IsA(plan, Append)); + node->appendplans = child_plans; + } + + /* Complete rest of the plan. */ + plan->qual = NIL; + plan->lefttree = NULL; + plan->righttree = NULL; + return plan; +} /***************************************************************************** * diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 6f41979..676204f 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -2179,3 +2179,163 @@ adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, /* Now translate for this child */ return adjust_appendrel_attrs(root, node, list_make1(appinfo)); } + +/* + * build_child_restrictinfo + * Returns a RestrictInfo which is derived from the given RestrictInfo by + * applying the parent-child translation specified by the list of + * AppendRelInfos. + * + * The topmost parent's RestrictInfo maintains a list of child RestrictInfos + * derived from it. If a suitable RestrictInfo is found in that list, it is + * returned as is. If there is no such child RestrictInfo, we translate the given + * RestrictInfo using the given list of AppendRelInfos and stick it in the + * topmost parent's list before returning it to the caller. + */ +RestrictInfo * +build_child_restrictinfo(PlannerInfo *root, RestrictInfo *rinfo, + List *append_rel_infos) +{ + Relids child_required_relids; + ListCell *lc; + RestrictInfo *parent_rinfo; + RestrictInfo *child_rinfo; + MemoryContext old_context; + + child_required_relids = adjust_relid_set(rinfo->required_relids, + append_rel_infos); + + + /* Nothing to do, if the clause does not need any translation. */ + if (bms_equal(child_required_relids, rinfo->required_relids)) + { + bms_free(child_required_relids); + return rinfo; + } + + /* + * Check if we already have the RestrictInfo for the given child in the + * topmost parent's RestrictInfo. + */ + parent_rinfo = rinfo->parent_rinfo ? rinfo->parent_rinfo : rinfo; + foreach (lc, parent_rinfo->child_rinfos) + { + child_rinfo = lfirst(lc); + + if (bms_equal(child_rinfo->required_relids, child_required_relids)) + { + bms_free(child_required_relids); + return child_rinfo; + } + } + + /* + * We didn't find any child restrictinfo for the given child, translate the + * given RestrictInfo and stick it into the parent's list. The clause + * expression may get used in plan, so create the child RestrictInfo in the + * planner's context. + */ + old_context = MemoryContextSwitchTo(root->planner_cxt); + child_rinfo = (RestrictInfo *) adjust_appendrel_attrs(root, (Node *) rinfo, + append_rel_infos); + bms_free(child_required_relids); + parent_rinfo->child_rinfos = lappend(parent_rinfo->child_rinfos, + child_rinfo); + child_rinfo->parent_rinfo = parent_rinfo; + + MemoryContextSwitchTo(old_context); + + return child_rinfo; +} + +/* + * build_child_clauses + * Convenience routine to call build_child_restrictinfo on a list of + * clauses. + */ +List * +build_child_clauses(PlannerInfo *root, List *clauses, List *append_rel_infos) +{ + List *child_clauses = NIL; + ListCell *lc; + + foreach (lc, clauses) + { + RestrictInfo *parent_rinfo = lfirst(lc); + RestrictInfo *child_rinfo; + + Assert(IsA(parent_rinfo, RestrictInfo)); + + child_rinfo = build_child_restrictinfo(root, parent_rinfo, + append_rel_infos); + + child_clauses = lappend(child_clauses, child_rinfo); + } + + return child_clauses; +} + +/* + * find_appinfos_by_relids + * Find AppendRelInfo structures for all relations specified by relids. + */ +List * +find_appinfos_by_relids(PlannerInfo *root, Relids relids) +{ + ListCell *lc; + List *appinfo_list = NIL; + + foreach (lc, root->append_rel_list) + { + AppendRelInfo *appinfo = lfirst(lc); + + if (bms_is_member(appinfo->child_relid, relids)) + appinfo_list = lappend(appinfo_list, appinfo); + } + + Assert(list_length(appinfo_list) == bms_num_members(relids)); + return appinfo_list; +} + +/* + * Construct the SpecialJoinInfo for a child-join by translating + * SpecialJoinInfo for the join between parents. left_relids and right_relids + * are the relids of left and right side of the join respectively. + */ +SpecialJoinInfo * +build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo, + Relids left_relids, Relids right_relids) +{ + SpecialJoinInfo *sjinfo = makeNode(SpecialJoinInfo); + MemoryContext old_context; + List *left_appinfos = find_appinfos_by_relids(root, left_relids); + List *right_appinfos = find_appinfos_by_relids(root, right_relids); + + memcpy(sjinfo, parent_sjinfo, sizeof(SpecialJoinInfo)); + + sjinfo->min_lefthand = adjust_relid_set(sjinfo->min_lefthand, + left_appinfos); + sjinfo->min_righthand = adjust_relid_set(sjinfo->min_righthand, + right_appinfos); + sjinfo->syn_lefthand = adjust_relid_set(sjinfo->syn_lefthand, + left_appinfos); + sjinfo->syn_righthand = adjust_relid_set(sjinfo->syn_righthand, + right_appinfos); + + /* + * Replace the Var nodes of parent with those of children in expressions. + * This function may be called within a temporary context, but the + * expressions will be shallow-copied into the plan. Hence copy those in + * the planner's context. + */ + old_context = MemoryContextSwitchTo(root->planner_cxt); + sjinfo->semi_rhs_exprs = (List *) adjust_appendrel_attrs(root, + (Node *) sjinfo->semi_rhs_exprs, + right_appinfos); + MemoryContextSwitchTo(old_context); + + list_free(left_appinfos); + list_free(right_appinfos); + + return sjinfo; +} diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index f440875..d861a49 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -23,7 +23,9 @@ #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "optimizer/planmain.h" +#include "optimizer/prep.h" #include "optimizer/restrictinfo.h" +#include "optimizer/tlist.h" #include "optimizer/var.h" #include "parser/parsetree.h" #include "utils/lsyscache.h" @@ -2154,6 +2156,117 @@ create_hashjoin_path(PlannerInfo *root, } /* + * create_partition_join_path + * Creates a pathnode that represents partition-wise join for given + * partitioned join relation. + * + * This function is called when we haven't created paths for all the child + * joins. It estimates the number of rows and cost of the PartitionJoinPath + * based upon the number of rows and the cost of representative child-joins + * paths. + */ +PartitionJoinPath * +create_partition_join_path(PlannerInfo *root, RelOptInfo *rel, List *subpaths, + Bitmapset *required_outer, List *pathkeys) +{ + PartitionJoinPath *pathnode = makeNode(PartitionJoinPath); + double subpath_rows = 0; + double subpath_startup_cost = 0; + double subpath_total_cost = 0; + double child_rel_rows = 0; + ListCell *lc; + + Assert(rel->part_scheme); + + pathnode->path.pathtype = pathkeys ? T_MergeAppend : T_Append; + pathnode->path.parent = rel; + pathnode->path.pathtarget = rel->reltarget; + pathnode->path.param_info = get_appendrel_parampathinfo(rel, + required_outer); + pathnode->path.pathkeys = pathkeys; + + /* No parallel paths here. See more details in add_paths_to_append_rel() */ + pathnode->path.parallel_aware = false; + pathnode->path.parallel_safe = false; + pathnode->path.parallel_workers = 0; + + /* Accumulate the number of rows and costs from the given subpaths. */ + foreach (lc, subpaths) + { + Path *subpath = lfirst(lc); + + if (!pathkeys) + { + /* + * Startup cost of an append relation is the startup cost of the + * first subpath. Assume that the given first child will be the + * first child in the final plan as well. + */ + if (lc == list_head(subpaths)) + subpath_startup_cost = subpath->startup_cost; + subpath_total_cost += subpath->total_cost; + } + else if (pathkeys_contained_in(pathkeys, subpath->pathkeys)) + { + /* + * Subpath is adequately ordered, we won't need to sort it. We need + * all the subplans to return their respective first rows, before + * returning a row. So add the startup costs. + */ + subpath_startup_cost += subpath->startup_cost; + subpath_total_cost += subpath->total_cost; + } + else + { + /* We'll need to insert a Sort node, so include cost for that */ + Path sort_path; /* dummy for result of cost_sort */ + + cost_sort(&sort_path, + root, + pathkeys, + subpath->total_cost, + subpath->parent->tuples, + subpath->pathtarget->width, + 0.0, + work_mem, + -1); + subpath_startup_cost += sort_path.startup_cost; + subpath_total_cost += sort_path.total_cost; + } + + subpath_rows += subpath->rows; + child_rel_rows += subpath->parent->rows; + + } + + /* + * For a parameterized path, extrapolate the number of rows for the append + * relation by considering the average selectivity of the parameterization + * across the given children. + */ + if (bms_is_empty(required_outer)) + pathnode->path.rows = rel->rows; + else + pathnode->path.rows = rel->rows * (subpath_rows / child_rel_rows); + + /* Extrapolate the total cost to account for yet-to-be planned children. */ + if (!pathkeys) + pathnode->path.startup_cost = subpath_startup_cost; + else + pathnode->path.startup_cost = (subpath_startup_cost * pathnode->path.rows) / subpath_rows; + pathnode->path.total_cost = (subpath_total_cost * pathnode->path.rows) / subpath_rows; + + /* + * Multiply the costs with scaling factor as specified. Used to encourage + * or discourage use of partition-wise join plans. + */ + pathnode->path.startup_cost *= partition_wise_plan_weight; + pathnode->path.total_cost *= partition_wise_plan_weight; + + return pathnode; +} + +/* * create_projection_path * Creates a pathnode that represents performing a projection. * diff --git a/src/backend/optimizer/util/placeholder.c b/src/backend/optimizer/util/placeholder.c index 698a387..e06bccc 100644 --- a/src/backend/optimizer/util/placeholder.c +++ b/src/backend/optimizer/util/placeholder.c @@ -20,6 +20,7 @@ #include "optimizer/pathnode.h" #include "optimizer/placeholder.h" #include "optimizer/planmain.h" +#include "optimizer/prep.h" #include "optimizer/var.h" #include "utils/lsyscache.h" @@ -414,6 +415,10 @@ add_placeholders_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel, Relids relids = joinrel->relids; ListCell *lc; + /* This function is called only on the parent relations. */ + Assert(!IS_OTHER_REL(joinrel) && !IS_OTHER_REL(outer_rel) && + !IS_OTHER_REL(inner_rel)); + foreach(lc, root->placeholder_list) { PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(lc); @@ -459,3 +464,53 @@ add_placeholders_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel, } } } + +/* + * add_placeholders_to_child_joinrel + * Translate the PHVs in parent's targetlist and add them to the child's + * targetlist. Also adjust the cost + */ +void +add_placeholders_to_child_joinrel(PlannerInfo *root, RelOptInfo *childrel, + RelOptInfo *parentrel) +{ + ListCell *lc; + + /* This function is called only for join relations. */ + Assert(IS_JOIN_REL(childrel) && IS_JOIN_REL(parentrel)); + + /* Ensure child relations is really what it claims to be. */ + Assert(IS_OTHER_REL(childrel)); + + foreach (lc, parentrel->reltarget->exprs) + { + PlaceHolderVar *phv = lfirst(lc); + + if (IsA(phv, PlaceHolderVar)) + { + /* + * In case the placeholder Var refers to any of the parent + * relations, translate it to refer to the corresponding child. + */ + if (bms_overlap(phv->phrels, parentrel->relids) && + childrel->reloptkind == RELOPT_OTHER_JOINREL) + { + List *append_rel_infos; + + append_rel_infos = find_appinfos_by_relids(root, + childrel->relids); + phv = (PlaceHolderVar *) adjust_appendrel_attrs(root, + (Node *) phv, + append_rel_infos); + } + + childrel->reltarget->exprs = lappend(childrel->reltarget->exprs, + phv); + } + } + + /* Adjust the cost and width of child targetlist. */ + childrel->reltarget->cost.startup = parentrel->reltarget->cost.startup; + childrel->reltarget->cost.per_tuple = parentrel->reltarget->cost.per_tuple; + childrel->reltarget->width = parentrel->reltarget->width; +} diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 19982dc..1eed987 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -23,6 +23,7 @@ #include "optimizer/paths.h" #include "optimizer/placeholder.h" #include "optimizer/plancat.h" +#include "optimizer/prep.h" #include "optimizer/restrictinfo.h" #include "optimizer/tlist.h" #include "utils/hsearch.h" @@ -54,6 +55,9 @@ static void set_foreign_rel_properties(RelOptInfo *joinrel, static void add_join_rel(PlannerInfo *root, RelOptInfo *joinrel); extern ParamPathInfo *find_param_path_info(RelOptInfo *rel, Relids required_outer); +static void build_joinrel_partition_info(RelOptInfo *joinrel, + RelOptInfo *outer_rel, RelOptInfo *inner_rel, + JoinType jointype); /* @@ -434,6 +438,9 @@ build_join_rel(PlannerInfo *root, RelOptInfo *joinrel; List *restrictlist; + /* This function should be used only for join between parents. */ + Assert(!IS_OTHER_REL(outer_rel) && !IS_OTHER_REL(inner_rel)); + /* * See if we already have a joinrel for this set of base rels. */ @@ -532,6 +539,10 @@ build_join_rel(PlannerInfo *root, if (bms_is_empty(joinrel->direct_lateral_relids)) joinrel->direct_lateral_relids = NULL; + /* Store the partition information. */ + build_joinrel_partition_info(joinrel, outer_rel, inner_rel, + sjinfo->jointype); + /* * Construct restrict and join clause lists for the new joinrel. (The * caller might or might not need the restrictlist, but I need it anyway @@ -594,6 +605,126 @@ build_join_rel(PlannerInfo *root, return joinrel; } + /* + * build_child_join_rel + * Builds RelOptInfo for joining given two child relations from RelOptInfo + * representing the join between their parents. + * + * 'outer_rel' and 'inner_rel' are the RelOptInfos of child relations being + * joined. + * 'parent_joinrel' is the RelOptInfo representing the join between parent + * relations. Most of the members of new RelOptInfo are produced by + * translating corresponding members of this RelOptInfo. + * 'sjinfo': context info for child join + * 'restrictlist': list of RestrictInfo nodes that apply to this particular + * pair of joinable relations. + * 'join_appinfos': list of AppendRelInfo nodes for base child relations involved + * in this join. + */ +RelOptInfo * +build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, + RelOptInfo *inner_rel, RelOptInfo *parent_joinrel, + JoinType jointype) +{ + RelOptInfo *joinrel = makeNode(RelOptInfo); + + /* Only joins between other relations land here. */ + Assert(IS_OTHER_REL(outer_rel) && IS_OTHER_REL(inner_rel)); + + joinrel->reloptkind = RELOPT_OTHER_JOINREL; + joinrel->relids = bms_union(outer_rel->relids, inner_rel->relids); + joinrel->rows = 0; + /* cheap startup cost is interesting iff not all tuples to be retrieved */ + joinrel->consider_startup = (root->tuple_fraction > 0); + joinrel->consider_param_startup = false; + joinrel->consider_parallel = false; + joinrel->reltarget = create_empty_pathtarget(); + joinrel->pathlist = NIL; + joinrel->ppilist = NIL; + joinrel->partial_pathlist = NIL; + joinrel->cheapest_startup_path = NULL; + joinrel->cheapest_total_path = NULL; + joinrel->cheapest_unique_path = NULL; + joinrel->cheapest_parameterized_paths = NIL; + joinrel->direct_lateral_relids = NULL; + joinrel->lateral_relids = NULL; + joinrel->relid = 0; /* indicates not a baserel */ + joinrel->rtekind = RTE_JOIN; + joinrel->min_attr = 0; + joinrel->max_attr = 0; + joinrel->attr_needed = NULL; + joinrel->attr_widths = NULL; + joinrel->lateral_vars = NIL; + joinrel->lateral_referencers = NULL; + joinrel->indexlist = NIL; + joinrel->pages = 0; + joinrel->tuples = 0; + joinrel->allvisfrac = 0; + joinrel->subroot = NULL; + joinrel->subplan_params = NIL; + joinrel->serverid = InvalidOid; + joinrel->userid = InvalidOid; + joinrel->useridiscurrent = false; + joinrel->fdwroutine = NULL; + joinrel->fdw_private = NULL; + joinrel->baserestrictinfo = NIL; + joinrel->baserestrictcost.startup = 0; + joinrel->baserestrictcost.per_tuple = 0; + joinrel->joininfo = NIL; + joinrel->has_eclass_joins = false; + joinrel->part_scheme = NULL; + joinrel->partexprs = NULL; + joinrel->top_parent_relids = NULL; + joinrel->part_rels = NULL; + + joinrel->top_parent_relids = bms_union(outer_rel->top_parent_relids, + inner_rel->top_parent_relids); + + /* Compute information relevant to foreign relations. */ + set_foreign_rel_properties(joinrel, outer_rel, inner_rel); + + /* Build targetlist */ + build_joinrel_tlist(root, joinrel, outer_rel); + build_joinrel_tlist(root, joinrel, inner_rel); + /* Add placeholder variables. */ + add_placeholders_to_child_joinrel(root, joinrel, parent_joinrel); + + /* Construct joininfo list. */ + joinrel->joininfo = build_child_clauses(root, parent_joinrel->joininfo, + find_appinfos_by_relids(root, + joinrel->relids)); + + /* + * Lateral relids referred in child join will be same as that referred in + * the parent relation. Throw any partial result computed while building + * the targetlist. + */ + bms_free(joinrel->direct_lateral_relids); + bms_free(joinrel->lateral_relids); + joinrel->direct_lateral_relids = (Relids) bms_copy(parent_joinrel->direct_lateral_relids); + joinrel->lateral_relids = (Relids) bms_copy(parent_joinrel->lateral_relids); + + /* + * If the parent joinrel has pending equivalence classes, so does the + * child. + */ + joinrel->has_eclass_joins = parent_joinrel->has_eclass_joins; + + /* Is the join between partitions itself partitioned? */ + build_joinrel_partition_info(joinrel, outer_rel, inner_rel, jointype); + + /* Child joinrel is parallel safe if parent is parallel safe. */ + joinrel->consider_parallel = parent_joinrel->consider_parallel; + + /* We build the join only once. */ + Assert(!find_join_rel(root, joinrel->relids)); + + /* Add the relation to the PlannerInfo. */ + add_join_rel(root, joinrel); + + return joinrel; +} + /* * min_join_parameterization * @@ -649,9 +780,15 @@ static void build_joinrel_tlist(PlannerInfo *root, RelOptInfo *joinrel, RelOptInfo *input_rel) { - Relids relids = joinrel->relids; + Relids relids; ListCell *vars; + /* attrs_needed refers to parent relids and not those of a child. */ + if (joinrel->top_parent_relids) + relids = joinrel->top_parent_relids; + else + relids = joinrel->relids; + foreach(vars, input_rel->reltarget->exprs) { Var *var = (Var *) lfirst(vars); @@ -667,23 +804,47 @@ build_joinrel_tlist(PlannerInfo *root, RelOptInfo *joinrel, /* * Otherwise, anything in a baserel or joinrel targetlist ought to be - * a Var. (More general cases can only appear in appendrel child - * rels, which will never be seen here.) + * a Var or ConvertRowtypeExpr introduced while translating parent + * targetlist to that of the child. */ - if (!IsA(var, Var)) + if (IsA(var, Var)) + { + /* Get the Var's original base rel */ + baserel = find_base_rel(root, var->varno); + + /* Is it still needed above this joinrel? */ + ndx = var->varattno - baserel->min_attr; + } + else if (IsA(var, ConvertRowtypeExpr)) + { + ConvertRowtypeExpr *child_expr = (ConvertRowtypeExpr *) var; + Var *childvar = (Var *) child_expr->arg; + + /* + * Child's whole-row references are converted to that of parent + * using ConvertRowtypeExpr. In this case, the argument to + * ConvertRowtypeExpr is expected to be a whole-row reference of + * the child. + */ + Assert(IsA(childvar, Var) && childvar->varattno == 0); + + baserel = find_base_rel(root, childvar->varno); + ndx = 0 - baserel->min_attr; + } + else elog(ERROR, "unexpected node type in rel targetlist: %d", (int) nodeTag(var)); - /* Get the Var's original base rel */ - baserel = find_base_rel(root, var->varno); - - /* Is it still needed above this joinrel? */ - ndx = var->varattno - baserel->min_attr; if (bms_nonempty_difference(baserel->attr_needed[ndx], relids)) { /* Yup, add it to the output */ joinrel->reltarget->exprs = lappend(joinrel->reltarget->exprs, var); - /* Vars have cost zero, so no need to adjust reltarget->cost */ + + /* + * Vars have cost zero, so no need to adjust reltarget->cost. Even + * if, it's a ConvertRowtypeExpr, it will be computed only for the + * base relation, costing nothing for a join. + */ joinrel->reltarget->width += baserel->attr_widths[ndx]; } } @@ -820,6 +981,9 @@ subbuild_joinrel_joinlist(RelOptInfo *joinrel, { ListCell *l; + /* Expected to be called only for join between parent relations. */ + Assert(joinrel->reloptkind == RELOPT_JOINREL); + foreach(l, joininfo_list) { RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); @@ -1366,3 +1530,85 @@ find_param_path_info(RelOptInfo *rel, Relids required_outer) return NULL; } + +/* + * build_joinrel_partition_info + * If the join between given partitioned relations is possibly partitioned + * set the partitioning scheme and partition keys expressions for the + * join. + * + * If the two relations have same partitioning scheme, their join may be + * partitioned and will follow the same partitioning scheme as the joining + * relations. + */ +static void +build_joinrel_partition_info(RelOptInfo *joinrel, RelOptInfo *outer_rel, + RelOptInfo *inner_rel, JoinType jointype) +{ + int num_pks; + int cnt; + + /* Nothing to do if partition-wise join technique is disabled. */ + if (!enable_partition_wise_join) + { + joinrel->part_scheme = NULL; + return; + } + + /* + * The join is not partitioned, if any of the relations being joined are + * not partitioned or they do not have same partitioning scheme. + */ + if (!outer_rel->part_scheme || !inner_rel->part_scheme || + outer_rel->part_scheme != inner_rel->part_scheme) + { + joinrel->part_scheme = NULL; + return; + } + + /* + * This function will be called only once for each joinrel, hence it should + * not have partition scheme, partition key expressions and array for + * storing child relations set. + */ + Assert(!joinrel->part_scheme && !joinrel->partexprs && + !joinrel->part_rels); + + /* + * Join relation is partitioned using same partitioning scheme as the + * joining relations. + */ + joinrel->part_scheme = outer_rel->part_scheme; + num_pks = joinrel->part_scheme->partnatts; + + /* + * Construct partition keys for the join. + * + * An INNER join between two partitioned relations is partition by key + * expressions from both the relations. For tables A and B partitioned by a and b + * respectively, (A INNER JOIN B ON A.a = B.b) is partitioned by both A.a + * and B.b. + * + * An OUTER join like (A LEFT JOIN B ON A.a = B.b) may produce rows with + * B.b NULL. These rows may not fit the partitioning conditions imposed on + * B.b. Hence, strictly speaking, the join is not partitioned by B.b. + * Strictly speaking, partition keys of an OUTER join should include + * partition key expressions from the OUTER side only. Consider a join like + * (A LEFT JOIN B on (A.a = B.b) LEFT JOIN C ON B.b = C.c. If we do not + * include B.b as partition key expression for (AB), it prohibits us from + * using partition-wise join when joining (AB) with C as there is no + * equi-join between partition keys of joining relations. But two NULL + * values are never equal and no two rows from mis-matching partitions can + * join. Hence it's safe to include B.b as partition key expression for + * (AB), even though rows in (AB) are not strictly partitioned by B.b. + */ + joinrel->partexprs = (List **) palloc0(sizeof(List *) * num_pks); + for (cnt = 0; cnt < num_pks; cnt++) + { + List *pkexpr = list_copy(outer_rel->partexprs[cnt]); + + pkexpr = list_concat(pkexpr, + list_copy(inner_rel->partexprs[cnt])); + joinrel->partexprs[cnt] = pkexpr; + } +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index de85eca..afd0c23 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -901,6 +901,15 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_partition_wise_join", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables partition-wise join."), + NULL + }, + &enable_partition_wise_join, + true, + NULL, NULL, NULL + }, { {"geqo", PGC_USERSET, QUERY_TUNING_GEQO, @@ -2947,6 +2956,25 @@ static struct config_real ConfigureNamesReal[] = }, { + {"partition_wise_plan_weight", PGC_USERSET, QUERY_TUNING_OTHER, + gettext_noop("Multiplication factor for partition-wise plan costs."), + NULL + }, + &partition_wise_plan_weight, + DEFAULT_PARTITION_WISE_PLAN_WEIGHT, 0, DBL_MAX, + NULL, NULL, NULL + }, + { + {"sample_partition_fraction", PGC_USERSET, QUERY_TUNING_OTHER, + gettext_noop("Fraction of partitions to be used as sample for calculating total cost of partition-wise plans."), + NULL + }, + &sample_partition_fraction, + DEFAULT_SAMPLE_PARTITION_FRACTION, 0, 1, + NULL, NULL, NULL + }, + + { {"bgwriter_lru_multiplier", PGC_SIGHUP, RESOURCES_BGWRITER, gettext_noop("Multiple of the average buffer usage to free per round."), NULL diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 95dd8ba..292d9a6 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -240,6 +240,7 @@ typedef enum NodeTag T_NestPath, T_MergePath, T_HashPath, + T_PartitionJoinPath, T_AppendPath, T_MergeAppendPath, T_ResultPath, diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 4f99184..146d53b 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -391,6 +391,11 @@ typedef struct PartitionSchemeData *PartitionScheme; * handling join alias Vars. Currently this is not needed because all join * alias Vars are expanded to non-aliased form during preprocess_expression. * + * We also have relations representing joins between child relations of + * different partitioned tables. These relations are not added to + * join_rel_level lists as they are not joined directly by the dynamic + * programming algorithm. + * * There is also a RelOptKind for "upper" relations, which are RelOptInfos * that describe post-scan/join processing steps, such as aggregation. * Many of the fields in these RelOptInfos are meaningless, but their Path @@ -512,10 +517,19 @@ typedef enum RelOptKind RELOPT_BASEREL, RELOPT_JOINREL, RELOPT_OTHER_MEMBER_REL, + RELOPT_OTHER_JOINREL, RELOPT_UPPER_REL, RELOPT_DEADREL } RelOptKind; +#define IS_OTHER_REL(rel) \ + ((rel)->reloptkind == RELOPT_OTHER_MEMBER_REL || \ + (rel)->reloptkind == RELOPT_OTHER_JOINREL) + +#define IS_JOIN_REL(rel) \ + ((rel)->reloptkind == RELOPT_JOINREL || \ + (rel)->reloptkind == RELOPT_OTHER_JOINREL) + typedef struct RelOptInfo { NodeTag type; @@ -600,6 +614,14 @@ typedef struct RelOptInfo * as the number of joining * relations. */ + + /* For joins between partitioned tables. */ + List *partitioned_joins; /* List of join orders which yield + * relations partitioned by above + * partition scheme. + */ + /* Set only for "other" base or "other" join relations. */ + Relids top_parent_relids; /* Relids of topmost parents. */ } RelOptInfo; /* @@ -1540,6 +1562,14 @@ typedef struct LimitPath Node *limitCount; /* COUNT parameter, or NULL if none */ } LimitPath; +/* + * PartitionJoinPath represents partition-wise join between two partitioned + * tables. + */ +typedef struct PartitionJoinPath +{ + Path path; +} PartitionJoinPath; /* * Restriction clause info. @@ -1747,6 +1777,18 @@ typedef struct RestrictInfo /* cache space for hashclause processing; -1 if not yet set */ Selectivity left_bucketsize; /* avg bucketsize of left side */ Selectivity right_bucketsize; /* avg bucketsize of right side */ + + /* + * Repository to locate child RestrictInfos derived from parent + * RestrictInfo. Every derived child RestrictInfo points to the parent + * RestrictInfo from which it is derived. Parent RestrictInfo maintains a + * list of all derived child RestrictInfos. So only one of the following + * should be set. + */ + List *child_rinfos; /* RestrictInfos derived for children. */ + struct RestrictInfo *parent_rinfo; /* Parent restrictinfo this + * RestrictInf is derived from. + */ } RestrictInfo; /* @@ -1869,6 +1911,21 @@ typedef struct SpecialJoinInfo } SpecialJoinInfo; /* + * Partitioned join information + * + * Saves information about relations which can be joined partition-wise and + * thus produce result which is partitioned by the partition scheme of the + * relation. + */ +typedef struct PartitionedJoin +{ + RelOptInfo *rel1; + RelOptInfo *rel2; + SpecialJoinInfo *sjinfo; /* SpecialJoinInfo applicable. */ + List *restrictlist; /* applicable join clauses. */ +} PartitionedJoin; + +/* * Append-relation info. * * When we expand an inheritable table or a UNION-ALL subselect into an diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 0e68264..a13eff1 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -30,6 +30,8 @@ #define DEFAULT_PARALLEL_SETUP_COST 1000.0 #define DEFAULT_EFFECTIVE_CACHE_SIZE 524288 /* measured in pages */ +#define DEFAULT_PARTITION_WISE_PLAN_WEIGHT 1 +#define DEFAULT_SAMPLE_PARTITION_FRACTION 0.01 typedef enum { @@ -66,7 +68,10 @@ extern bool enable_nestloop; extern bool enable_material; extern bool enable_mergejoin; extern bool enable_hashjoin; +extern bool enable_partition_wise_join; extern int constraint_exclusion; +extern double partition_wise_plan_weight; +extern double sample_partition_fraction; extern double clamp_row_est(double nrows); extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 7b41317..81d637a 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -229,6 +229,9 @@ extern LimitPath *create_limit_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, Node *limitOffset, Node *limitCount, int64 offset_est, int64 count_est); +extern PartitionJoinPath *create_partition_join_path(PlannerInfo *root, + RelOptInfo *rel, List *subpaths, + Bitmapset *required_outer, List *pathkeys); extern Path *reparameterize_path(PlannerInfo *root, Path *path, Relids required_outer, @@ -271,5 +274,8 @@ extern ParamPathInfo *get_joinrel_parampathinfo(PlannerInfo *root, List **restrict_clauses); extern ParamPathInfo *get_appendrel_parampathinfo(RelOptInfo *appendrel, Relids required_outer); +extern RelOptInfo *build_child_join_rel(PlannerInfo *root, + RelOptInfo *outer_rel, RelOptInfo *inner_rel, + RelOptInfo *parent_joinrel, JoinType jointype); #endif /* PATHNODE_H */ diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 81a9be7..7ad19be 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -53,6 +53,8 @@ extern RelOptInfo *standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels); extern void generate_gather_paths(PlannerInfo *root, RelOptInfo *rel); +extern void generate_partition_wise_join_paths(PlannerInfo *root, + RelOptInfo *rel); #ifdef OPTIMIZER_DEBUG extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel); @@ -106,6 +108,9 @@ extern bool have_join_order_restriction(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2); extern bool have_dangerous_phv(PlannerInfo *root, Relids outer_relids, Relids inner_params); +extern void add_paths_to_child_joinrel(PlannerInfo *root, + RelOptInfo *parent_joinrel, + int child_id); /* * equivclass.c diff --git a/src/include/optimizer/placeholder.h b/src/include/optimizer/placeholder.h index 11e6403..8598268 100644 --- a/src/include/optimizer/placeholder.h +++ b/src/include/optimizer/placeholder.h @@ -28,5 +28,7 @@ extern void fix_placeholder_input_needed_levels(PlannerInfo *root); extern void add_placeholders_to_base_rels(PlannerInfo *root); extern void add_placeholders_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel, RelOptInfo *outer_rel, RelOptInfo *inner_rel); +extern void add_placeholders_to_child_joinrel(PlannerInfo *root, + RelOptInfo *childrel, RelOptInfo *parentrel); #endif /* PLACEHOLDER_H */ diff --git a/src/include/optimizer/prep.h b/src/include/optimizer/prep.h index a02e06a..5832130 100644 --- a/src/include/optimizer/prep.h +++ b/src/include/optimizer/prep.h @@ -57,5 +57,13 @@ extern Node *adjust_appendrel_attrs(PlannerInfo *root, Node *node, extern Node *adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, RelOptInfo *child_rel); +extern RestrictInfo *build_child_restrictinfo(PlannerInfo *root, + RestrictInfo *rinfo, List *append_rel_infos); +extern List *build_child_clauses(PlannerInfo *root, List *clauses, + List *append_rel_infos); +extern List *find_appinfos_by_relids(PlannerInfo *root, Relids relids); +extern SpecialJoinInfo *build_child_join_sjinfo(PlannerInfo *root, + SpecialJoinInfo *parent_sjinfo, + Relids left_relids, Relids right_relids); #endif /* PREP_H */ diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out index 18238fa..79779d6 100644 --- a/src/test/regress/expected/partition_join.out +++ b/src/test/regress/expected/partition_join.out @@ -2,6 +2,10 @@ -- PARTITION_JOIN -- Test partition-wise join between partitioned tables -- +-- Usually partition-wise join paths are chosen when data is large, which would +-- take regression tests to run longer. So, weigh partition-wise joins cheaper +-- to force those even for smaller data. +SET partition_wise_plan_weight to 0.2; -- -- partitioned by a single column -- diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index d48abd7..c6c1405 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -70,20 +70,21 @@ select count(*) >= 0 as ok from pg_prepared_xacts; -- This is to record the prevailing planner enable_foo settings during -- a regression test run. select name, setting from pg_settings where name like 'enable%'; - name | setting -----------------------+--------- - enable_bitmapscan | on - enable_hashagg | on - enable_hashjoin | on - enable_indexonlyscan | on - enable_indexscan | on - enable_material | on - enable_mergejoin | on - enable_nestloop | on - enable_seqscan | on - enable_sort | on - enable_tidscan | on -(11 rows) + name | setting +----------------------------+--------- + enable_bitmapscan | on + enable_hashagg | on + enable_hashjoin | on + enable_indexonlyscan | on + enable_indexscan | on + enable_material | on + enable_mergejoin | on + enable_nestloop | on + enable_partition_wise_join | on + enable_seqscan | on + enable_sort | on + enable_tidscan | on +(12 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql index 0322f1e..9b2baeb 100644 --- a/src/test/regress/sql/partition_join.sql +++ b/src/test/regress/sql/partition_join.sql @@ -3,6 +3,11 @@ -- Test partition-wise join between partitioned tables -- +-- Usually partition-wise join paths are chosen when data is large, which would +-- take regression tests to run longer. So, weigh partition-wise joins cheaper +-- to force those even for smaller data. +SET partition_wise_plan_weight to 0.2; + -- -- partitioned by a single column -- -- 1.7.9.5