From ecd8259e853fd39b5035f1f426bf01fe73a555bc Mon Sep 17 00:00:00 2001 From: "dgrowley@gmail.com" Date: Fri, 23 Nov 2018 12:06:20 +1300 Subject: [PATCH v3] On-demand locking of partitions during INSERT and UPDATE During INSERT, even if we were inserting a single row into a partitioned table, we would obtain a lock on every partition which was a direct or an indirect partition of the insert target table. This was done in order to provide a consistent order to the locking of the partitions, which happens to be the same order that partitions are locked during planning. The problem with locking all these partitions was that if a partitioned table had many partitions and the INSERT inserted one, or just a few rows, the overhead of the locking could be significantly more than the cost of inserting the actual row(s). This commit changes the locking to only lock partitions the first time we route a tuple to them, so if you insert one row, then only 1 leaf partition will be locked, plus any sub-partitioned tables that we search through before we find the correct home of the tuple. This does mean that the locking order of partitions during INSERT does become less well defined. Previously, operations such as CREATE INDEX and TRUNCATE, when performed on leaf partitions, could defend against deadlocking with concurrent INSERT by performing the operation in table oid order. However, to deadlock, such DDL would have had to be performed inside a transaction and not in table oid order. With this commit it's now possible to get deadlocks even if the DDL is performed in table oid order. If required, such transactions can defend against such deadlocks by performing a LOCK TABLE on the partitioned table before performing the DDL.
--- src/backend/executor/execPartition.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 2a7bc01563..ab69404daa 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -191,9 +191,6 @@ static void find_matching_subplans_recurse(PartitionPruningData *prunedata, * tuple routing for partitioned tables, encapsulates it in * PartitionTupleRouting, and returns it. * - * Note that all the relations in the partition tree are locked using the - * RowExclusiveLock mode upon return from this function. - * * Callers must use the returned PartitionTupleRouting during calls to * ExecFindPartition(). The actual ResultRelInfo for a partition is only * allocated when the partition is found for the first time. @@ -208,9 +205,6 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel) PartitionTupleRouting *proute; ModifyTable *node = mtstate ? (ModifyTable *) mtstate->ps.plan : NULL; - /* Lock all the partitions. */ - (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); - /* * Here we attempt to expend as little effort as possible in setting up * the PartitionTupleRouting. Each partition's ResultRelInfo is built on @@ -487,8 +481,9 @@ ExecHashSubPlanResultRelsByOid(ModifyTableState *mtstate, /* * ExecInitPartitionInfo - * Initialize ResultRelInfo and other information for a partition - * and store it in the next empty slot in the proute->partitions array. + * Lock the partition and initialize ResultRelInfo. Also set up other + * information for the partition and store it in the next empty slot in + * the proute->partitions array. 
* * Returns the ResultRelInfo */ @@ -510,11 +505,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, oldcxt = MemoryContextSwitchTo(proute->memcxt); - /* - * We locked all the partitions in ExecSetupPartitionTupleRouting - * including the leaf partitions. - */ - partrel = table_open(dispatch->partdesc->oids[partidx], NoLock); + partrel = table_open(dispatch->partdesc->oids[partidx], RowExclusiveLock); leaf_part_rri = makeNode(ResultRelInfo); InitResultRelInfo(leaf_part_rri, @@ -964,11 +955,12 @@ ExecInitRoutingInfo(ModifyTableState *mtstate, /* * ExecInitPartitionDispatchInfo - * Initialize PartitionDispatch for a partitioned table and store it in - * the next available slot in the proute->partition_dispatch_info array. - * Also, record the index into this array in the parent_pd->indexes[] - * array in the partidx element so that we can properly retrieve the - * newly created PartitionDispatch later. + * Lock the partitioned table (if not locked already) and initialize + * PartitionDispatch for a partitioned table and store it in the next + * available slot in the proute->partition_dispatch_info array. Also, + * record the index into this array in the parent_pd->indexes[] array in + * the partidx element so that we can properly retrieve the newly created + * PartitionDispatch later. */ static PartitionDispatch ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute, Oid partoid, @@ -982,8 +974,13 @@ ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute, Oid partoid, oldcxt = MemoryContextSwitchTo(proute->memcxt); + /* + * Only sub-partitioned tables need to be locked here. The root + * partitioned table will already have been locked as it's referenced in + * the query's rtable. + */ if (partoid != RelationGetRelid(proute->partition_root)) - rel = table_open(partoid, NoLock); + rel = table_open(partoid, RowExclusiveLock); else rel = proute->partition_root; partdesc = RelationGetPartitionDesc(rel); -- 2.16.2.windows.1