From e8ccff62337f109a0df98df6f2b03e97951f5fe2 Mon Sep 17 00:00:00 2001 From: amitlan Date: Thu, 3 Sep 2020 15:53:04 +0900 Subject: [PATCH v2] Check default partition's constraint even after tuple routing A partition's constraint is not checked if a row has been inserted into it via tuple routing, because the routing process ensures that only the tuples satisfying the constraint make it to a given partition. For a default partition however, whose constraint can change on-the-fly due to concurrent addition of partitions, this assumption does not always hold. So, check the constraint even when inserting into a default partition via tuple routing. This also adds an isolation test based on the reproduction steps described by Hao Wu. Reported by: Hao Wu --- src/backend/executor/execPartition.c | 125 +++++++++++++++++---- .../expected/partition-concurrent-attach.out | 49 ++++++++ src/test/isolation/isolation_schedule | 1 + .../specs/partition-concurrent-attach.spec | 43 +++++++ 4 files changed, 195 insertions(+), 23 deletions(-) create mode 100644 src/test/isolation/expected/partition-concurrent-attach.out create mode 100644 src/test/isolation/specs/partition-concurrent-attach.spec diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 79fcbd6..5910d0e 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -51,6 +51,11 @@ * PartitionDispatchData->indexes for details on how this array is * indexed. * + * nonleaf_partitions + * Array of 'max_dispatch' elements containing pointers to fake + * ResultRelInfo objects for nonleaf partitions, useful for checking + * the partition constraint. + * * num_dispatch * The current number of items stored in the 'partition_dispatch_info' * array. 
Also serves as the index of the next free array element for @@ -89,6 +94,7 @@ struct PartitionTupleRouting { Relation partition_root; PartitionDispatch *partition_dispatch_info; + ResultRelInfo **nonleaf_partitions; int num_dispatch; int max_dispatch; ResultRelInfo **partitions; @@ -281,8 +287,10 @@ ExecFindPartition(ModifyTableState *mtstate, PartitionDesc partdesc; ExprContext *ecxt = GetPerTupleExprContext(estate); TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; + TupleTableSlot *rootslot = slot; TupleTableSlot *myslot = NULL; MemoryContext oldcxt; + ResultRelInfo *rri = NULL; /* use per-tuple context here to avoid leaking memory */ oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); @@ -296,9 +304,8 @@ ExecFindPartition(ModifyTableState *mtstate, /* start with the root partitioned table */ dispatch = pd[0]; - while (true) + do { - AttrMap *map = dispatch->tupmap; int partidx = -1; CHECK_FOR_INTERRUPTS(); @@ -307,17 +314,6 @@ ExecFindPartition(ModifyTableState *mtstate, partdesc = dispatch->partdesc; /* - * Convert the tuple to this parent's layout, if different from the - * current relation. - */ - myslot = dispatch->tupslot; - if (myslot != NULL) - { - Assert(map != NULL); - slot = execute_attr_map_slot(map, slot, myslot); - } - - /* * Extract partition key from tuple. Expression evaluation machinery * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to * point to the correct tuple slot. The slot might have changed from @@ -352,8 +348,6 @@ ExecFindPartition(ModifyTableState *mtstate, if (partdesc->is_leaf[partidx]) { - ResultRelInfo *rri; - /* * Look to see if we've already got a ResultRelInfo for this * partition. @@ -401,13 +395,8 @@ ExecFindPartition(ModifyTableState *mtstate, rootResultRelInfo, partidx); } - /* Release the tuple in the lowest parent's dedicated slot. 
*/ - if (slot == myslot) - ExecClearTuple(myslot); - - MemoryContextSwitchTo(oldcxt); - ecxt->ecxt_scantuple = ecxt_scantuple_old; - return rri; + /* We've reached the leaf. */ + dispatch = NULL; } else { @@ -419,6 +408,8 @@ ExecFindPartition(ModifyTableState *mtstate, /* Already built. */ Assert(dispatch->indexes[partidx] < proute->num_dispatch); + rri = proute->nonleaf_partitions[dispatch->indexes[partidx]]; + /* * Move down to the next partition level and search again * until we find a leaf partition that matches this tuple @@ -440,10 +431,80 @@ ExecFindPartition(ModifyTableState *mtstate, dispatch, partidx); Assert(dispatch->indexes[partidx] >= 0 && dispatch->indexes[partidx] < proute->num_dispatch); + + rri = proute->nonleaf_partitions[dispatch->indexes[partidx]]; dispatch = subdispatch; } + + /* + * Convert the tuple to the new parent's layout, if different + * from the previous parent. + */ + if (dispatch->tupslot) + { + AttrMap *map = dispatch->tupmap; + TupleTableSlot *prev_myslot = myslot; + + myslot = dispatch->tupslot; + Assert(map != NULL); + slot = execute_attr_map_slot(map, slot, myslot); + + /* Clear previous parent's tuple if any. */ + if (prev_myslot != NULL) + ExecClearTuple(prev_myslot); + } } - } + + /* + * If this partition is the default one, we must check its partition + * constraint, which, unlike a non-default partition's constraint, can + * change due to partitions being added to the parent concurrently. + * + * The tuple must match the partition's layout for the constraint + * expression to be evaluated successfully. If the partition is + * sub-partitioned, that would already be the case due to the code + * above, but for a leaf partition the tuple still matches the + * parent's layout. 
+ */ + if (partidx == partdesc->boundinfo->default_index) + { + PartitionRoutingInfo *partrouteinfo = rri->ri_PartitionInfo; + + if (partrouteinfo) + { + TupleConversionMap *map = partrouteinfo->pi_RootToPartitionMap; + TupleTableSlot *pslot = partrouteinfo->pi_PartitionTupleSlot; + + /* + * As the name says, we can only convert from root table's + * layout, so use the slot received from the caller, rootslot. + */ + if (map) + slot = execute_attr_map_slot(map->attrMap, rootslot, + pslot); + else + /* + * Or use root layout slot as-is. Note we don't use + * the parent's slot, because we don't know if it's same + * layout or different from the leaf partition, but we + * know for sure that root and leaf are the same. + */ + slot = rootslot; + } + + ExecPartitionCheck(rri, slot, estate, true); + } + } while (dispatch != NULL); + + /* Release the tuple in the lowest parent's dedicated slot. */ + if (myslot != NULL) + ExecClearTuple(myslot); + MemoryContextSwitchTo(oldcxt); + ecxt->ecxt_scantuple = ecxt_scantuple_old; + + Assert(rri != NULL); + + return rri; } /* @@ -1060,6 +1121,8 @@ ExecInitPartitionDispatchInfo(EState *estate, proute->max_dispatch = 4; proute->partition_dispatch_info = (PartitionDispatch *) palloc(sizeof(PartitionDispatch) * proute->max_dispatch); + proute->nonleaf_partitions = (ResultRelInfo **) + palloc(sizeof(ResultRelInfo *) * proute->max_dispatch); } else { @@ -1067,11 +1130,27 @@ ExecInitPartitionDispatchInfo(EState *estate, proute->partition_dispatch_info = (PartitionDispatch *) repalloc(proute->partition_dispatch_info, sizeof(PartitionDispatch) * proute->max_dispatch); + proute->nonleaf_partitions = (ResultRelInfo **) + repalloc(proute->nonleaf_partitions, + sizeof(ResultRelInfo *) * proute->max_dispatch); } } proute->partition_dispatch_info[dispatchidx] = pd; /* + * If setting up a PartitionDispatch for a sub-partitioned table, we may + * also need a fake ResultRelInfo for checking the partition constraint + * later; set that up now. 
+ */ + if (parent_pd) + { + ResultRelInfo *rri = makeNode(ResultRelInfo); + + InitResultRelInfo(rri, rel, 1, proute->partition_root, 0); + proute->nonleaf_partitions[dispatchidx] = rri; + } + + /* * Finally, if setting up a PartitionDispatch for a sub-partitioned table, * install a downlink in the parent to allow quick descent. */ diff --git a/src/test/isolation/expected/partition-concurrent-attach.out b/src/test/isolation/expected/partition-concurrent-attach.out new file mode 100644 index 0000000..17fac39 --- /dev/null +++ b/src/test/isolation/expected/partition-concurrent-attach.out @@ -0,0 +1,49 @@ +Parsed test spec with 2 sessions + +starting permutation: s1b s1a s2b s2i s1c s2c s2s +step s1b: begin; +step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200); +step s2b: begin; +step s2i: insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); +step s1c: commit; +step s2i: <... completed> +error in steps s1c s2i: ERROR: new row for relation "tpart_default" violates partition constraint +step s2c: commit; +step s2s: select tableoid::regclass, * from tpart; +tableoid i j + +tpart_2 110 xxx +tpart_2 120 yyy +tpart_2 150 zzz + +starting permutation: s1b s1a s2b s2i2 s1c s2c s2s +step s1b: begin; +step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200); +step s2b: begin; +step s2i2: insert into tpart_default (i, j) values (110, 'xxx'), (120, 'yyy'), (150, 'zzz'); +step s1c: commit; +step s2i2: <... 
completed> +error in steps s1c s2i2: ERROR: new row for relation "tpart_default" violates partition constraint +step s2c: commit; +step s2s: select tableoid::regclass, * from tpart; +tableoid i j + +tpart_2 110 xxx +tpart_2 120 yyy +tpart_2 150 zzz + +starting permutation: s1b s2b s2i s1a s2c s1c s2s +step s1b: begin; +step s2b: begin; +step s2i: insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); +step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200); +step s2c: commit; +step s1a: <... completed> +error in steps s2c s1a: ERROR: updated partition constraint for default partition "tpart_default_default" would be violated by some row +step s1c: commit; +step s2s: select tableoid::regclass, * from tpart; +tableoid i j + +tpart_default_default110 xxx +tpart_default_default120 yyy +tpart_default_default150 zzz diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 65d1443..f838276 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -90,3 +90,4 @@ test: plpgsql-toast test: truncate-conflict test: serializable-parallel test: serializable-parallel-2 +test: partition-concurrent-attach diff --git a/src/test/isolation/specs/partition-concurrent-attach.spec b/src/test/isolation/specs/partition-concurrent-attach.spec new file mode 100644 index 0000000..48c3f83 --- /dev/null +++ b/src/test/isolation/specs/partition-concurrent-attach.spec @@ -0,0 +1,43 @@ +# Verify that default partition constraint is enforced correctly +# in light of partitions being added concurrently to its parent +setup { + drop table if exists tpart; + create table tpart(i int, j text) partition by range(i); + create table tpart_1(like tpart); + create table tpart_2(like tpart); + create table tpart_default (a int, j text, i int) partition by list (j); + create table tpart_default_default (a int, i int, b int, j text); + alter table tpart_default_default drop b; + alter table 
tpart_default attach partition tpart_default_default default; + alter table tpart_default drop a; + alter table tpart attach partition tpart_default default; + alter table tpart attach partition tpart_1 for values from(0) to (100); + insert into tpart_2 values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); +} + +session "s1" +step "s1b" { begin; } +step "s1a" { alter table tpart attach partition tpart_2 for values from (100) to (200); } +step "s1c" { commit; } + +session "s2" +step "s2b" { begin; } +step "s2i" { insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); } +step "s2i2" { insert into tpart_default (i, j) values (110, 'xxx'), (120, 'yyy'), (150, 'zzz'); } +step "s2c" { commit; } +step "s2s" { select tableoid::regclass, * from tpart; } + +teardown { drop table tpart; } + +# insert into tpart by s2 which routes to tpart_default due to not seeing +# concurrently added tpart_2 should fail, because the partition constraint +# of tpart_default would have changed due to tpart_2 having been added +permutation "s1b" "s1a" "s2b" "s2i" "s1c" "s2c" "s2s" + +# similar to above, but now insert into sub-partitioned tpart_default +permutation "s1b" "s1a" "s2b" "s2i2" "s1c" "s2c" "s2s" + +# reverse: now the insert into tpart_default by s2 occurs first followed by +# attach in s1, which should fail when it scans the leaf default partition +# to find the violating rows +permutation "s1b" "s2b" "s2i" "s1a" "s2c" "s1c" "s2s" -- 1.8.3.1