>From f2d3149b370a27d812ad7f830fdd1e1e9a5088d3 Mon Sep 17 00:00:00 2001 From: amit Date: Fri, 18 Mar 2016 19:48:49 +0900 Subject: [PATCH 6/6] Introduce tuple routing for partitioned tables. A tuple inserted into a partitioned table using insert or copy is now routed to the appropriate leaf partition, recursively. State required for the same is built in advance by BeginCopy() and ExecInitModifyTable(). It consists of N ResultRelInfos and pointers to TupleConversionMap structs, where N is the number of leaf partitions in the multi-level partition tree. One needs only to get a pointer to the root PartitionDescNode node for the tree. A tuple conversion map is only allocated if the tuple descriptor of a leaf partition differs from that of the root table. Partition check expression is not initialized for leaf partition result relations, as they are redundant in this case. Whether or not to build one is now determined by new parameter load_partition_check of InitResultRelInfo(). CopyFrom() or ExecInsert() passes the root node (set up initially, as just mentioned) along with the tuple slot to get_partition_for_tuple() which traverses the tree to determine the appropriate leaf partition for the tuple. It returns the index (0..N) of the leaf partition. Caller then switches the result relation (ResultRelInfo) appropriately and performs the tuple conversion, if required, before following up with the row triggers, constraints, actual heap and index insertion. * Notes about foreign partitions: Foreign partitions require special consideration as a suitable insert plan needs to be created in advance. So, make_modifytable() invokes PlanForeignModify() for every leaf partition that is a foreign table. Resulting plans are stored in ModifyTable.fdwPrivLists. Subsequently, ExecInitModifyTable() initializes ForeignModifys for each FDW partition using the information in fdwPrivLists. Similarly, BeginCopy() sets up CopyState.partition_fdw_priv_lists (process in this case is more dramatic than in ModifyTable's) and later CopyFrom() initializes ForeignModifys. * Following requires further consideration: Both CopyFrom() and ExecInsert() switch to the partition before invoking BR triggers and back after AR trigger are processed. That ignores parent's triggers completely. Note that those triggers are not required to be present in each partition. OTOH, constraints are fine (both check and not null), since parent's constraints are present in partitions. --- src/backend/catalog/partition.c | 352 +++++++++++++++++++++++++++++++ src/backend/commands/copy.c | 198 +++++++++++++++++- src/backend/commands/tablecmds.c | 1 + src/backend/executor/execMain.c | 46 ++++- src/backend/executor/nodeModifyTable.c | 116 ++++++++++ src/backend/optimizer/plan/createplan.c | 59 +++++ src/backend/optimizer/util/plancat.c | 13 ++ src/backend/parser/analyze.c | 8 + src/include/catalog/partition.h | 25 +++ src/include/executor/executor.h | 6 + src/include/nodes/execnodes.h | 10 + src/include/optimizer/plancat.h | 1 + 12 files changed, 828 insertions(+), 7 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 3404823..69fc362 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -96,6 +96,21 @@ static List *generate_partition_check_qual(Relation rel); static PartitionDescNode GetPartitionDescNodeRecurse(Relation rel, int offset); static int get_leaf_partition_count(PartitionDescNode pdnode); +static PartitionKeyExecInfo *BuildPartitionKeyExecInfo(Relation rel); +static void FormPartitionKeyDatum(PartitionKeyExecInfo *pkinfo, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull); +static int list_partition_for_tuple(PartitionKey key, PartitionDesc pdesc, + Datum value, bool isnull); +static int range_partition_for_tuple(PartitionKey key, PartitionDesc pdesc, + Datum *values); +static int range_partition_bsearch(PartitionKey key, PartitionDesc pdesc, + Datum *values); +static bool rightof(PartitionKey key, Datum *values, PartitionRangeBound *bound); +static bool leftof(PartitionKey key, Datum *values, PartitionRangeBound *bound); + /* * StorePartitionKey * Store the partition keys of rel into pg_partitioned catalog @@ -978,6 +993,34 @@ get_partition_parent(Oid relid) } /* + * get_leaf_partitions + * Returns a list of all leaf-level partitions of relation with OID + * 'relid'. + */ +List * +get_leaf_partitions(Oid relid, int lockmode) +{ + List *partitions, + *result = NIL; + ListCell *lc; + + partitions = get_partitions(relid, lockmode); + + foreach(lc, partitions) + { + Oid myoid = lfirst_oid(lc); + + if (relid_is_partitioned(myoid)) + result = list_concat(result, + get_leaf_partitions(myoid, lockmode)); + + result = lappend_oid(result, myoid); + } + + return result; +} + +/* * RelationDropPartitions * Find and drop all partitions of relation with OID 'relid' */ @@ -1699,6 +1742,7 @@ GetPartitionDescNodeRecurse(Relation rel, int offset) /* First build our own node */ root = (PartitionDescNode) palloc0(sizeof(PartitionDescNodeData)); + root->pkinfo = BuildPartitionKeyExecInfo(rel); root->pdesc = pdesc; root->relid = RelationGetRelid(rel); root->offset = offset; @@ -1816,3 +1860,311 @@ get_leaf_partition_oids(PartitionDescNode pdnode) return result; } + +/* ---------------- + * BuildPartitionKeyExecInfo + * Construct a list of PartitionKeyExecInfo records for an open + * relation + * + * PartitionKeyExecInfo stores the information about the partition key + * that's needed when inserting tuples into a partitioned table; especially, + * partition key expression state if there are any expression columns in + * the partition key. Normally we build a PartitionKeyExecInfo for a + * partitioned table just once per command, and then use it for (potentially) + * many tuples. + * ---------------- + */ +static PartitionKeyExecInfo * +BuildPartitionKeyExecInfo(Relation rel) +{ + PartitionKeyExecInfo *pkinfo; + + pkinfo = (PartitionKeyExecInfo *) palloc0(sizeof(PartitionKeyExecInfo)); + pkinfo->pi_Key = CopyPartitionKey(rel->rd_partkey); + pkinfo->pi_ExpressionState = NIL; + + return pkinfo; +} + +/* + * get_partition_for_tuple + * Recursively finds the "leaf" partition for tuple (slot) + * + * Returns -1 if no partition is found and sets *failed_at to the OID of + * the partitioned table whose partition was not found. + */ +int +get_partition_for_tuple(PartitionDescNode pdnode, + TupleTableSlot *slot, + EState *estate, + Oid *failed_at) +{ + PartitionKeyExecInfo *pkinfo = pdnode->pkinfo; + PartitionDescNode node; + Datum values[PARTITION_MAX_KEYS]; + bool isnull[PARTITION_MAX_KEYS]; + int i; + int index; + + /* Guard against stack overflow due to overly deep partition tree */ + check_stack_depth(); + + if (pdnode->pdesc->nparts == 0) + { + *failed_at = pdnode->relid; + return -1; + } + + /* Extract partition key from tuple */ + Assert(GetPerTupleExprContext(estate)->ecxt_scantuple == slot); + FormPartitionKeyDatum(pkinfo, slot, estate, values, isnull); + + /* Disallow nulls, if range partition key */ + for (i = 0; i < pkinfo->pi_Key->partnatts; i++) + if (isnull[i] && pkinfo->pi_Key->strategy == PARTITION_STRAT_RANGE) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("range partition key contains null"))); + + switch (pkinfo->pi_Key->strategy) + { + case PARTITION_STRAT_LIST: + index = list_partition_for_tuple(pkinfo->pi_Key, pdnode->pdesc, + values[0], isnull[0]); + break; + + case PARTITION_STRAT_RANGE: + index = range_partition_for_tuple(pkinfo->pi_Key, pdnode->pdesc, + values); + break; + } + + /* Don't recurse, if no partition exists for the key at this level... */ + if (index < 0) + { + *failed_at = pdnode->relid; + return index; + } + + /* ... or, if the index'th partition is a leaf partition. */ + if (!relid_is_partitioned(pdnode->pdesc->oids[index])) + { + PartitionDescNode prev; + + /* Patiently determine own leaf partition index */ + prev = node = pdnode->downlink; + if (node && node->index < index) + { + while (node) + { + if (node->index > index) + { + node = prev; + break; + } + + prev = node; + node = node->next; + } + + if (!node) + node = prev; + + /* + * Account for siblings on left that are partition trees and those + * that are leaf partitions themsleves. + */ + return node->offset + node->num_leaf_partitions + + (index - node->index - 1); + } + else + /* Only leaf partitions in pdnode->offset..index */ + return pdnode->offset + index; + + } + + /* Recurse, by locating the index'th partition's node */ + node = pdnode->downlink; + while (node->next != NULL && node->index != index) + node = node->next; + Assert (node != NULL); + + return get_partition_for_tuple(node, slot, estate, failed_at); +} + +/* + * FormPartitionKeyDatum + * Construct values[] and isnull[] arrays for partition key columns + */ +static void +FormPartitionKeyDatum(PartitionKeyExecInfo *pkinfo, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull) +{ + ListCell *partexpr_item; + int i; + + if (pkinfo->pi_Key->partexprs != NIL && pkinfo->pi_ExpressionState == NIL) + { + /* First time through, set up expression evaluation state */ + pkinfo->pi_ExpressionState = (List *) + ExecPrepareExpr((Expr *) pkinfo->pi_Key->partexprs, + estate); + /* Check caller has set up context correctly */ + Assert(GetPerTupleExprContext(estate)->ecxt_scantuple == slot); + } + + partexpr_item = list_head(pkinfo->pi_ExpressionState); + for (i = 0; i < pkinfo->pi_Key->partnatts; i++) + { + AttrNumber keycol = pkinfo->pi_Key->partattrs[i]; + Datum pkDatum; + bool isNull; + + if (keycol != 0) + { + /* Plain column; get the value directly from the heap tuple */ + pkDatum = slot_getattr(slot, keycol, &isNull); + } + else + { + /* Expression; need to evaluate it */ + if (partexpr_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + pkDatum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), + GetPerTupleExprContext(estate), + &isNull, + NULL); + partexpr_item = lnext(partexpr_item); + } + values[i] = pkDatum; + isnull[i] = isNull; + } + + if (partexpr_item != NULL) + elog(ERROR, "wrong number of partition key expressions"); +} + +/* + * list_partition_for_tuple + * Find the list partition for a tuple + * + * Returns -1 if none found. + */ +static int +list_partition_for_tuple(PartitionKey key, PartitionDesc pdesc, + Datum value, bool isnull) +{ + int i; + + Assert(pdesc->nparts > 0); + + for (i = 0; i < pdesc->nparts; i++) + { + int j; + + if (isnull) + { + if (pdesc->lists[i]->contains_null) + return i; + + continue; + } + + for (j = 0; j < pdesc->lists[i]->nvalues; j++) + { + int32 cmpval; + + cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], + key->tcinfo->typcoll[0], + pdesc->lists[i]->values[j], + value)); + if (!cmpval) + return i; + } + } + + return -1; +} + +/* + * range_partition_for_tuple + * Search the range partition for a range key ('values') + * + * Returns -1 if none found. + */ +static int +range_partition_for_tuple(PartitionKey key, PartitionDesc pdesc, Datum *values) +{ + Assert(pdesc->nparts > 0); + + return range_partition_bsearch(key, pdesc, values); +} + +/* + * range_partition_bsearch + * Workhorse of range_partition_for_tuple + */ +static int +range_partition_bsearch(PartitionKey key, PartitionDesc pdesc, + Datum *values) +{ + int low, high; + + /* Good ol' bsearch */ + low = 0; + high = pdesc->nparts - 1; + while (low <= high) + { + int idx = (low + high) / 2; + + if (pdesc->rangeuppers[idx]->infinite) + { + if (rightof(key, values, pdesc->rangelowers[idx])) + return idx; + + break; + } + else if (leftof(key, values, pdesc->rangeuppers[idx])) + { + if (pdesc->rangelowers[idx]->infinite) + return idx; + + if (rightof(key, values, pdesc->rangelowers[idx])) + return idx; + + high = idx - 1; + continue; + } + + low = idx + 1; + } + + return -1; +} + +/* Does range key lie to the right of partition bound */ +static bool +rightof(PartitionKey key, Datum *values, PartitionRangeBound *bound) +{ + int32 cmpval = compare_range_keys(key, values, bound->val); + + if (!cmpval) + return bound->lower ? bound->inclusive : !bound->inclusive; + + return cmpval > 0; +} + +/* Does range key lie to the left of partition bound */ +static bool +leftof(PartitionKey key, Datum *values, PartitionRangeBound *bound) +{ + int32 cmpval = compare_range_keys(key, values, bound->val); + + if (!cmpval) + return !bound->lower ? bound->inclusive : !bound->inclusive; + + return cmpval < 0; +} diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 76803a5..0e2c61a 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -30,6 +30,7 @@ #include "commands/defrem.h" #include "commands/trigger.h" #include "executor/executor.h" +#include "foreign/fdwapi.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" #include "mb/pg_wchar.h" @@ -161,6 +162,11 @@ typedef struct CopyStateData ExprState **defexprs; /* array of default att expressions */ bool volatile_defexprs; /* is any of defexprs volatile? */ List *range_table; + PartitionDescNode pdnode; /* partition descriptor node tree */ + ResultRelInfo *partitions; + TupleConversionMap **partition_tupconv_maps; + List *partition_fdw_priv_lists; + int num_partitions; /* * These variables are used to reduce overhead in textual COPY FROM. @@ -1364,6 +1370,87 @@ BeginCopy(bool is_from, (errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("table \"%s\" does not have OIDs", RelationGetRelationName(cstate->rel)))); + + /* + * Initialize state for CopyFrom tuple routing. Watch out for + * any foreign partitions. + */ + if (is_from && rel->rd_rel->relkind == RELKIND_PARTITIONED_REL) + { + List *leaf_part_oids; + ListCell *cell; + int i; + int num_leaf_parts; + ResultRelInfo *leaf_rel_rri; + PlannerInfo *root = makeNode(PlannerInfo); /* mostly dummy */ + Query *parse = makeNode(Query); /* ditto */ + ModifyTable *plan = makeNode(ModifyTable); /* ditto */ + RangeTblEntry *fdw_rte = makeNode(RangeTblEntry); /* ditto */ + List *fdw_private_lists = NIL; + + cstate->pdnode = RelationGetPartitionDescNode(rel); + leaf_part_oids = get_leaf_partition_oids(cstate->pdnode); + num_leaf_parts = list_length(leaf_part_oids); + + cstate->num_partitions = num_leaf_parts; + cstate->partitions = (ResultRelInfo *) + palloc0(num_leaf_parts * sizeof(ResultRelInfo)); + cstate->partition_tupconv_maps = (TupleConversionMap **) + palloc0(num_leaf_parts * sizeof(TupleConversionMap *)); + + /* For use below, iff a partition found to be a foreign table */ + plan->operation = CMD_INSERT; + plan->plans = list_make1(makeNode(Result)); + fdw_rte->rtekind = RTE_RELATION; + fdw_rte->relkind = RELKIND_FOREIGN_TABLE; + parse->rtable = list_make1(fdw_rte); + root->parse = parse; + + leaf_rel_rri = cstate->partitions; + i = 0; + foreach(cell, leaf_part_oids) + { + Relation leaf_rel; + + leaf_rel = heap_open(lfirst_oid(cell), RowExclusiveLock); + InitResultRelInfo(leaf_rel_rri, + leaf_rel, + 1, /* dummy */ + false, /* no need for partition check */ + 0); + + /* Open partition indices */ + ExecOpenIndices(leaf_rel_rri, false); + + /* Special dance for foreign tables */ + if (leaf_rel_rri->ri_FdwRoutine) + { + List *fdw_private; + + fdw_rte->relid = RelationGetRelid(leaf_rel); + fdw_private = leaf_rel_rri->ri_FdwRoutine->PlanForeignModify(root, + plan, + 1, + 0); + fdw_private_lists = lappend(fdw_private_lists, fdw_private); + } + + if (!equalTupleDescs(tupDesc, RelationGetDescr(leaf_rel))) + cstate->partition_tupconv_maps[i] = + convert_tuples_by_name(tupDesc, + RelationGetDescr(leaf_rel), + gettext_noop("could not convert row type")); + + leaf_rel_rri++; + i++; + } + + cstate->partition_fdw_priv_lists = fdw_private_lists; + pfree(fdw_rte); + pfree(plan); + pfree(parse); + pfree(root); + } } else { @@ -1659,6 +1746,8 @@ ClosePipeToProgram(CopyState cstate) static void EndCopy(CopyState cstate) { + int i; + if (cstate->is_program) { ClosePipeToProgram(cstate); @@ -1672,6 +1761,23 @@ EndCopy(CopyState cstate) cstate->filename))); } + /* Close all partitions and indices thereof */ + for (i = 0; i < cstate->num_partitions; i++) + { + ResultRelInfo *resultRelInfo = cstate->partitions + i; + + ExecCloseIndices(resultRelInfo); + heap_close(resultRelInfo->ri_RelationDesc, NoLock); + + /* XXX - EState not handy here to pass to EndForeignModify() */ + if (resultRelInfo->ri_FdwRoutine && + resultRelInfo->ri_FdwRoutine->EndForeignModify != NULL) + resultRelInfo->ri_FdwRoutine->EndForeignModify(NULL, resultRelInfo); + + if (cstate->partition_tupconv_maps[i]) + pfree(cstate->partition_tupconv_maps[i]); + } + MemoryContextDelete(cstate->copycontext); pfree(cstate); } @@ -2216,6 +2322,7 @@ CopyFrom(CopyState cstate) Datum *values; bool *nulls; ResultRelInfo *resultRelInfo; + ResultRelInfo *saved_resultRelInfo = NULL; EState *estate = CreateExecutorState(); /* for ExecConstraints() */ ExprContext *econtext; TupleTableSlot *myslot; @@ -2236,7 +2343,8 @@ CopyFrom(CopyState cstate) Assert(cstate->rel); - if (cstate->rel->rd_rel->relkind != RELKIND_RELATION) + if (cstate->rel->rd_rel->relkind != RELKIND_RELATION && + cstate->rel->rd_rel->relkind != RELKIND_PARTITIONED_REL) { if (cstate->rel->rd_rel->relkind == RELKIND_VIEW) ereport(ERROR, @@ -2344,6 +2452,7 @@ CopyFrom(CopyState cstate) InitResultRelInfo(resultRelInfo, cstate->rel, 1, /* dummy rangetable index */ + true, /* do load partition check expression */ 0); ExecOpenIndices(resultRelInfo, false); @@ -2371,6 +2480,7 @@ CopyFrom(CopyState cstate) if ((resultRelInfo->ri_TrigDesc != NULL && (resultRelInfo->ri_TrigDesc->trig_insert_before_row || resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) || + cstate->pdnode != NULL || cstate->volatile_defexprs) { useHeapMultiInsert = false; @@ -2392,10 +2502,46 @@ CopyFrom(CopyState cstate) */ ExecBSInsertTriggers(estate, resultRelInfo); + /* Initialize FDW partition insert plans */ + if (cstate->pdnode) + { + int i, + j; + List *fdw_private_lists = cstate->partition_fdw_priv_lists; + ModifyTableState *mtstate = makeNode(ModifyTableState); + ResultRelInfo *leaf_part_rri; + + /* Mostly dummy containing enough state for BeginForeignModify */ + mtstate->ps.state = estate; + mtstate->operation = CMD_INSERT; + + j = 0; + leaf_part_rri = cstate->partitions; + for (i = 0; i < cstate->num_partitions; i++) + { + if (leaf_part_rri->ri_FdwRoutine) + { + List *fdw_private; + + Assert(fdw_private_lists); + fdw_private = list_nth(fdw_private_lists, j++); + leaf_part_rri->ri_FdwRoutine->BeginForeignModify(mtstate, + leaf_part_rri, + fdw_private, + 0, 0); + } + leaf_part_rri++; + } + } + values = (Datum *) palloc(tupDesc->natts * sizeof(Datum)); nulls = (bool *) palloc(tupDesc->natts * sizeof(bool)); - bistate = GetBulkInsertState(); + if (useHeapMultiInsert) + bistate = GetBulkInsertState(); + else + bistate = NULL; + econtext = GetPerTupleExprContext(estate); /* Set up callback to identify error line number */ @@ -2447,6 +2593,31 @@ CopyFrom(CopyState cstate) slot = myslot; ExecStoreTuple(tuple, slot, InvalidBuffer, false); + /* Determine the partition */ + saved_resultRelInfo = resultRelInfo; + if (cstate->pdnode) + { + int i_leaf_partition; + TupleConversionMap *map; + + econtext->ecxt_scantuple = slot; + i_leaf_partition = ExecFindPartition(resultRelInfo, + cstate->pdnode, + slot, + estate); + Assert(i_leaf_partition >= 0 && + i_leaf_partition < cstate->num_partitions); + + resultRelInfo = cstate->partitions + i_leaf_partition; + estate->es_result_relation_info = resultRelInfo; + + map = cstate->partition_tupconv_maps[i_leaf_partition]; + if (map) + tuple = do_convert_tuple(tuple, map); + + tuple->t_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + } + skip_tuple = false; /* BEFORE ROW INSERT Triggers */ @@ -2467,7 +2638,16 @@ CopyFrom(CopyState cstate) if (cstate->rel->rd_att->constr || resultRelInfo->ri_PartitionCheck) ExecConstraints(resultRelInfo, slot, estate); - if (useHeapMultiInsert) + if (resultRelInfo->ri_FdwRoutine) + { + resultRelInfo->ri_FdwRoutine->ExecForeignInsert(estate, + resultRelInfo, + slot, + NULL); + /* AFTER ROW INSERT Triggers */ + ExecARInsertTriggers(estate, resultRelInfo, tuple, NIL); + } + else if (useHeapMultiInsert) { /* Add this tuple to the tuple buffer */ if (nBufferedTuples == 0) @@ -2497,7 +2677,8 @@ CopyFrom(CopyState cstate) List *recheckIndexes = NIL; /* OK, store the tuple and create index entries for it */ - heap_insert(cstate->rel, tuple, mycid, hi_options, bistate); + heap_insert(resultRelInfo->ri_RelationDesc, + tuple, mycid, hi_options, bistate); if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), @@ -2517,6 +2698,12 @@ CopyFrom(CopyState cstate) * tuples inserted by an INSERT command. */ processed++; + + if (saved_resultRelInfo) + { + resultRelInfo = saved_resultRelInfo; + estate->es_result_relation_info = resultRelInfo; + } } } @@ -2530,7 +2717,8 @@ CopyFrom(CopyState cstate) /* Done, clean up */ error_context_stack = errcallback.previous; - FreeBulkInsertState(bistate); + if (bistate) + FreeBulkInsertState(bistate); MemoryContextSwitchTo(oldcontext); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 5b83b8d..5911659 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -1237,6 +1237,7 @@ ExecuteTruncate(TruncateStmt *stmt) InitResultRelInfo(resultRelInfo, rel, 0, /* dummy rangetable index */ + false, 0); resultRelInfo++; } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index bbd2187..0d86e14 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -826,6 +826,7 @@ InitPlan(QueryDesc *queryDesc, int eflags) InitResultRelInfo(resultRelInfo, resultRelation, resultRelationIndex, + true, estate->es_instrument); resultRelInfo++; } @@ -1215,6 +1216,7 @@ void InitResultRelInfo(ResultRelInfo *resultRelInfo, Relation resultRelationDesc, Index resultRelationIndex, + bool load_partition_check, int instrument_options) { MemSet(resultRelInfo, 0, sizeof(ResultRelInfo)); @@ -1252,8 +1254,9 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_ConstraintExprs = NULL; resultRelInfo->ri_junkFilter = NULL; resultRelInfo->ri_projectReturning = NULL; - resultRelInfo->ri_PartitionCheck = (Expr *) - RelationGetPartitionCheckQual(resultRelationDesc); + if (load_partition_check) + resultRelInfo->ri_PartitionCheck = (Expr *) + RelationGetPartitionCheckQual(resultRelationDesc); } /* @@ -1316,6 +1319,7 @@ ExecGetTriggerResultRel(EState *estate, Oid relid) InitResultRelInfo(rInfo, rel, 0, /* dummy rangetable index */ + true, estate->es_instrument); estate->es_trig_target_relations = lappend(estate->es_trig_target_relations, rInfo); @@ -2996,3 +3000,41 @@ EvalPlanQualEnd(EPQState *epqstate) epqstate->planstate = NULL; epqstate->origslot = NULL; } + +int +ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDescNode pdnode, + TupleTableSlot *slot, EState *estate) +{ + int i_leaf_partition; + Oid failed_at; + + i_leaf_partition = get_partition_for_tuple(pdnode, slot, estate, + &failed_at); + + if (i_leaf_partition < 0) + { + Relation rel = resultRelInfo->ri_RelationDesc; + char *val_desc; + Bitmapset *insertedCols, + *updatedCols, + *modifiedCols; + TupleDesc tupDesc = RelationGetDescr(rel); + + insertedCols = GetInsertedColumns(resultRelInfo, estate); + updatedCols = GetUpdatedColumns(resultRelInfo, estate); + modifiedCols = bms_union(insertedCols, updatedCols); + val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel), + slot, + tupDesc, + modifiedCols, + 64); + Assert(OidIsValid(failed_at)); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("no partition of relation \"%s\" found for row", + get_rel_name(failed_at)), + val_desc ? errdetail("Failing row contains %s.", val_desc) : 0)); + } + + return i_leaf_partition; +} diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 3ed321e..7fc2c19 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -243,6 +243,7 @@ ExecInsert(ModifyTableState *mtstate, { HeapTuple tuple; ResultRelInfo *resultRelInfo; + ResultRelInfo *saved_resultRelInfo = NULL; Relation resultRelationDesc; Oid newId; List *recheckIndexes = NIL; @@ -257,6 +258,31 @@ ExecInsert(ModifyTableState *mtstate, * get information on the (current) result relation */ resultRelInfo = estate->es_result_relation_info; + + saved_resultRelInfo = resultRelInfo; + + if (mtstate->mt_partition_node) + { + int i_leaf_partition; + ExprContext *econtext = GetPerTupleExprContext(estate); + TupleConversionMap *map; + + econtext->ecxt_scantuple = slot; + i_leaf_partition = ExecFindPartition(resultRelInfo, + mtstate->mt_partition_node, + slot, + estate); + Assert(i_leaf_partition >= 0 && + i_leaf_partition < mtstate->mt_num_partitions); + + resultRelInfo = mtstate->mt_partitions + i_leaf_partition; + estate->es_result_relation_info = resultRelInfo; + + map = mtstate->mt_partition_tupconv_maps[i_leaf_partition]; + if (map) + tuple = do_convert_tuple(tuple, map); + } + resultRelationDesc = resultRelInfo->ri_RelationDesc; /* @@ -496,6 +522,12 @@ ExecInsert(ModifyTableState *mtstate, list_free(recheckIndexes); + if (saved_resultRelInfo) + { + resultRelInfo = saved_resultRelInfo; + estate->es_result_relation_info = resultRelInfo; + } + /* * Check any WITH CHECK OPTION constraints from parent views. We are * required to do this after testing all constraints and uniqueness @@ -1550,6 +1582,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) Plan *subplan; ListCell *l; int i; + Relation rel; /* check for unsupported flags */ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); @@ -1640,6 +1673,72 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) estate->es_result_relation_info = saved_resultRelInfo; + /* Build state for INSERT tuple routing */ + rel = mtstate->resultRelInfo->ri_RelationDesc; + if (operation == CMD_INSERT && + rel->rd_rel->relkind == RELKIND_PARTITIONED_REL) + { + int i, + j, + num_leaf_parts; + List *leaf_part_oids; + ListCell *cell; + ResultRelInfo *leaf_rel_rri; + + mtstate->mt_partition_node = RelationGetPartitionDescNode(rel); + leaf_part_oids = get_leaf_partition_oids(mtstate->mt_partition_node); + num_leaf_parts = list_length(leaf_part_oids); + + mtstate->mt_num_partitions = num_leaf_parts; + mtstate->mt_partitions = (ResultRelInfo *) + palloc0(num_leaf_parts * sizeof(ResultRelInfo)); + mtstate->mt_partition_tupconv_maps = (TupleConversionMap **) + palloc0(num_leaf_parts * sizeof(TupleConversionMap *)); + + leaf_rel_rri = mtstate->mt_partitions; + i = j = 0; + foreach(cell, leaf_part_oids) + { + Relation leaf_rel; + + leaf_rel = heap_open(lfirst_oid(cell), RowExclusiveLock); + InitResultRelInfo(leaf_rel_rri, + leaf_rel, + 1, /* dummy */ + false, /* no need for partition checks */ + eflags); + + /* Open partition indices (note: ON CONFLICT unsupported)*/ + if (leaf_rel_rri->ri_RelationDesc->rd_rel->relhasindex && + operation != CMD_DELETE && + leaf_rel_rri->ri_IndexRelationDescs == NULL) + ExecOpenIndices(leaf_rel_rri, false); + + if (leaf_rel_rri->ri_FdwRoutine) + { + /* As many fdw_private's in fdwPrivLists as FDW partitions */ + List *fdw_private = (List *) list_nth(node->fdwPrivLists, j); + + leaf_rel_rri->ri_FdwRoutine->BeginForeignModify(mtstate, + leaf_rel_rri, + fdw_private, + 0, + eflags); + j++; + } + + if (!equalTupleDescs(RelationGetDescr(rel), + RelationGetDescr(leaf_rel))) + mtstate->mt_partition_tupconv_maps[i] = + convert_tuples_by_name(RelationGetDescr(rel), + RelationGetDescr(leaf_rel), + gettext_noop("could not convert row type")); + + leaf_rel_rri++; + i++; + } + } + /* * Initialize any WITH CHECK OPTION constraints if needed. */ @@ -1957,6 +2056,23 @@ ExecEndModifyTable(ModifyTableState *node) resultRelInfo); } + /* Close all partitions and indices thereof */ + for (i = 0; i < node->mt_num_partitions; i++) + { + ResultRelInfo *resultRelInfo = node->mt_partitions + i; + + ExecCloseIndices(resultRelInfo); + heap_close(resultRelInfo->ri_RelationDesc, NoLock); + + if (resultRelInfo->ri_FdwRoutine && + resultRelInfo->ri_FdwRoutine->EndForeignModify != NULL) + resultRelInfo->ri_FdwRoutine->EndForeignModify(node->ps.state, + resultRelInfo); + + if (node->mt_partition_tupconv_maps[i]) + pfree(node->mt_partition_tupconv_maps[i]); + } + /* * Free the exprcontext */ diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 58bfd49..9e5b60e 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -6149,6 +6149,65 @@ make_modifytable(PlannerInfo *root, node->fdwPrivLists = fdw_private_list; node->fdwDirectModifyPlans = direct_modify_plans; + /* Collect insert plans for all FDW-managed partitions */ + if (node->operation == CMD_INSERT) + { + RangeTblEntry *rte, + **saved_simple_rte_array; + List *partition_oids; + + Assert(list_length(resultRelations) == 1); + rte = rt_fetch(linitial_int(resultRelations), root->parse->rtable); + Assert(rte->rtekind == RTE_RELATION); + + if (!relid_is_partitioned(rte->relid)) + return node; + + partition_oids = get_leaf_partitions(rte->relid, NoLock); + + /* Discard any previous content which is useless anyway */ + fdw_private_list = NIL; + + /* To force FDW driver fetch the intended RTE */ + saved_simple_rte_array = root->simple_rte_array; + root->simple_rte_array = (RangeTblEntry **) + palloc0(2 * sizeof(RangeTblEntry *)); + foreach(lc, partition_oids) + { + Oid myoid = lfirst_oid(lc); + FdwRoutine *fdwroutine; + List *fdw_private; + + if (!oid_is_foreign_table(myoid)) + continue; + + fdwroutine = GetFdwRoutineByRelId(myoid); + if (fdwroutine && fdwroutine->PlanForeignModify) + { + RangeTblEntry *fdw_rte; + + fdw_rte = copyObject(rte); + fdw_rte->relid = myoid; + fdw_rte->relkind = RELKIND_FOREIGN_TABLE; + + /* Assumes PlanForeignModify() uses planner_rt_fetch(). */ + root->simple_rte_array[1] = fdw_rte; + + fdw_private = fdwroutine->PlanForeignModify(root, node, 1, 0); + pfree(fdw_rte); + } + else + fdw_private = NIL; + + fdw_private_list = lappend(fdw_private_list, fdw_private); + } + + pfree(root->simple_rte_array); + root->simple_rte_array = saved_simple_rte_array; + + node->fdwPrivLists = fdw_private_list; + } + return node; } diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 9314074..95aca08 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1690,3 +1690,16 @@ has_row_triggers(PlannerInfo *root, Index rti, CmdType event) heap_close(relation, NoLock); return result; } + +bool +oid_is_foreign_table(Oid relid) +{ + Relation rel; + char relkind; + + rel = heap_open(relid, NoLock); + relkind = rel->rd_rel->relkind; + heap_close(rel, NoLock); + + return relkind == RELKIND_FOREIGN_TABLE; +} diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index b4fca37..6996666 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -777,8 +777,16 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) /* Process ON CONFLICT, if any. */ if (stmt->onConflictClause) + { + /* Bail out if target relation is partitioned table */ + if (relid_is_partitioned(pstate->p_target_rangetblentry->relid)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ON CONFLICT clause is not supported with partitioned tables"))); + qry->onConflict = transformOnConflictClause(pstate, stmt->onConflictClause); + } /* * If we have a RETURNING clause, we need to add the target relation to diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index c1120e3..98e5dc0 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -14,6 +14,8 @@ #define PARTITION_H #include "fmgr.h" +#include "executor/tuptable.h" +#include "nodes/execnodes.h" #include "parser/parse_node.h" #include "utils/relcache.h" @@ -86,6 +88,22 @@ typedef struct PartitionDescData typedef struct PartitionDescData *PartitionDesc; /* + * PartitionKeyExecInfo + * + * This struct holds the information needed to extract partition + * column values from a heap tuple. + * + * Key copy of the rd_partkey of rel + * ExpressionState exec state for expressions, or NIL if none + */ +typedef struct PartitionKeyExecInfo +{ + NodeTag type; + PartitionKey pi_Key; + List *pi_ExpressionState; /* list of ExprState */ +} PartitionKeyExecInfo; + +/* * Partition tree node * * pkinfo PartitionKey executor state @@ -101,6 +119,7 @@ typedef struct PartitionDescData *PartitionDesc; */ typedef struct PartitionDescNodeData { + PartitionKeyExecInfo *pkinfo; PartitionDesc pdesc; Oid relid; int index; @@ -128,6 +147,7 @@ extern void StorePartitionBound(Oid relid, Oid parentId, Node *bound); extern void RemovePartitionEntryByRelId(Oid relid); extern bool relid_is_partition(Oid relid); extern Oid get_partition_parent(Oid relid); +extern List *get_leaf_partitions(Oid relid, int lockmode); extern void RelationDropPartitions(Oid relid); extern PartitionInfo **get_partition_info(Relation rel, int *nparts); extern void free_partition_info(PartitionInfo **p, int num); @@ -139,4 +159,9 @@ extern List *get_check_qual_from_partbound(Relation rel, Relation parent, Node *bound); extern List *get_leaf_partition_oids(PartitionDescNode pdnode); extern PartitionDescNode RelationGetPartitionDescNode(Relation rel); + +extern int get_partition_for_tuple(PartitionDescNode pdnode, + TupleTableSlot *slot, + EState *estate, + Oid *failed_at); #endif /* PARTITION_H */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 39521ed..c3b0f6d 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -14,6 +14,7 @@ #ifndef EXECUTOR_H #define EXECUTOR_H +#include "catalog/partition.h" #include "executor/execdesc.h" #include "nodes/parsenodes.h" @@ -188,6 +189,7 @@ extern void CheckValidResultRel(Relation resultRel, CmdType operation); extern void InitResultRelInfo(ResultRelInfo *resultRelInfo, Relation resultRelationDesc, Index resultRelationIndex, + bool load_partition_check, int instrument_options); extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid); extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids); @@ -211,6 +213,10 @@ extern void EvalPlanQualSetPlan(EPQState *epqstate, extern void EvalPlanQualSetTuple(EPQState *epqstate, Index rti, HeapTuple tuple); extern HeapTuple EvalPlanQualGetTuple(EPQState *epqstate, Index rti); +extern int ExecFindPartition(ResultRelInfo *resultRelInfo, + PartitionDescNode pdnode, + TupleTableSlot *slot, + EState *estate); #define EvalPlanQualSetSlot(epqstate, slot) ((epqstate)->origslot = (slot)) extern void EvalPlanQualFetchRowMarks(EPQState *epqstate); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 64af12c..e360616 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -16,6 +16,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/tupconvert.h" #include "executor/instrument.h" #include "lib/pairingheap.h" #include "nodes/params.h" @@ -1140,6 +1141,15 @@ typedef struct ModifyTableState * tlist */ TupleTableSlot *mt_conflproj; /* CONFLICT ... SET ... projection * target */ + struct PartitionDescNodeData *mt_partition_node; + /* Partition descriptor node tree */ + ResultRelInfo *mt_partitions; /* Per leaf partition target + * relations */ + TupleConversionMap **mt_partition_tupconv_maps; + /* Per leaf partition + * tuple conversion map */ + int mt_num_partitions; /* Number of leaf partition target + * relations in the above array */ } ModifyTableState; /* ---------------- diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index 125274e..fac606c 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -56,5 +56,6 @@ extern Selectivity join_selectivity(PlannerInfo *root, SpecialJoinInfo *sjinfo); extern bool has_row_triggers(PlannerInfo *root, Index rti, CmdType event); +extern bool oid_is_foreign_table(Oid relid); #endif /* PLANCAT_H */ -- 1.7.1