Re: unsupportable composite type partition keys - Mailing list pgsql-hackers

From Tom Lane
Subject Re: unsupportable composite type partition keys
Date
Msg-id 24653.1576962784@sss.pgh.pa.us
Whole thread Raw
In response to Re: unsupportable composite type partition keys  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: unsupportable composite type partition keys  (Amit Langote <amitlangote09@gmail.com>)
List pgsql-hackers
I wrote:
> As far as point 2 goes, I think this is another outgrowth of the
> fundamental design error that we load a partitioned rel's partitioning
> info immediately when the relcache entry is created, rather than later
> on-demand.  If we weren't doing that then it wouldn't be problematic
> to inspect the rel's rowtype while constructing the partitioning info.
> I've bitched about this before, if memory serves, but couldn't light
> a fire under anyone about fixing it.  Now I think we have no choice.
> It was never a great idea that minimal construction of a relcache
> entry could result in running arbitrary user-defined code.

Here's a draft patch for that.  There are a couple of secondary issues
I didn't do anything about yet:

* When rebuilding an open relcache entry for a partitioned table, this
coding now always quasi-leaks the old rd_pdcxt, where before that happened
only if the partdesc actually changed.  (Even if I'd kept the
equalPartitionDescs call, it would always fail.)  I complained about the
quasi-leak behavior before, but this probably pushes it up to the level of
"must fix".  What I'm inclined to do is to hack
RelationDecrementReferenceCount so that, when the refcount goes to zero,
we delete any child contexts of rd_pdcxt.  That's pretty annoying but in
the big scheme of things it's unlikely to matter.

* It'd be better to declare RelationGetPartitionKey and
RelationGetPartitionDesc in relcache.h and get their callers out of the
business of including rel.h, where possible.

* equalPartitionDescs is now dead code, should we remove it?

> Note that the end result of this would be to allow, not prohibit,
> cases like your example.  I wonder whether we couldn't also lift
> the restriction against whole-row Vars in partition expressions.
> Doesn't seem like there is much difference between such a Var and
> a row(...)::table_rowtype expression.

I didn't look into that either.  I wouldn't propose back-patching that,
but it'd be interesting to try to fix it in HEAD.

            regards, tom lane

diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index 8f242ae..a143998 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -812,7 +812,7 @@ DefineIndex(Oid relationId,
      */
     if (partitioned && (stmt->unique || stmt->primary))
     {
-        PartitionKey key = rel->rd_partkey;
+        PartitionKey key = RelationGetPartitionKey(rel);
         int            i;

         /*
diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c
index d160d6f..a010b3e 100644
--- a/src/backend/partitioning/partbounds.c
+++ b/src/backend/partitioning/partbounds.c
@@ -775,6 +775,9 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
 /*
  * Return a copy of given PartitionBoundInfo structure. The data types of bounds
  * are described by given partition key specification.
+ *
+ * Note: it's critical that this function and its callees not do any
+ * catalog access, because we use it to copy data into the relcache.
  */
 PartitionBoundInfo
 partition_bounds_copy(PartitionBoundInfo src,
diff --git a/src/backend/partitioning/partdesc.c b/src/backend/partitioning/partdesc.c
index 6ede084..d0e253d 100644
--- a/src/backend/partitioning/partdesc.c
+++ b/src/backend/partitioning/partdesc.c
@@ -65,10 +65,12 @@ RelationBuildPartitionDesc(Relation rel)
     List       *inhoids;
     PartitionBoundSpec **boundspecs = NULL;
     Oid           *oids = NULL;
+    bool       *is_leaf = NULL;
     ListCell   *cell;
     int            i,
                 nparts;
     PartitionKey key = RelationGetPartitionKey(rel);
+    MemoryContext new_pdcxt;
     MemoryContext oldcxt;
     int           *mapping;

@@ -81,10 +83,11 @@ RelationBuildPartitionDesc(Relation rel)
     inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock);
     nparts = list_length(inhoids);

-    /* Allocate arrays for OIDs and boundspecs. */
+    /* Allocate arrays for OIDs, leaf flags, and boundspecs. */
     if (nparts > 0)
     {
-        oids = palloc(nparts * sizeof(Oid));
+        oids = (Oid *) palloc(nparts * sizeof(Oid));
+        is_leaf = (bool *) palloc(nparts * sizeof(bool));
         boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
     }

@@ -172,63 +175,75 @@ RelationBuildPartitionDesc(Relation rel)

         /* Save results. */
         oids[i] = inhrelid;
+        is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
         boundspecs[i] = boundspec;
         ++i;
     }

-    /* Assert we aren't about to leak any old data structure */
-    Assert(rel->rd_pdcxt == NULL);
-    Assert(rel->rd_partdesc == NULL);
+    /*
+     * Create PartitionBoundInfo, working in the caller's context.  This could
+     * fail, but we haven't done any damage if so.
+     */
+    if (nparts > 0)
+        boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);

     /*
      * Now build the actual relcache partition descriptor.  Note that the
      * order of operations here is fairly critical.  If we fail partway
-     * through this code, we won't have leaked memory because the rd_pdcxt is
-     * attached to the relcache entry immediately, so it'll be freed whenever
-     * the entry is rebuilt or destroyed.  However, we don't assign to
-     * rd_partdesc until the cached data structure is fully complete and
-     * valid, so that no other code might try to use it.
+     * through the data-copying code, we won't have leaked memory because the
+     * rd_pdcxt is attached to the relcache entry immediately, so it'll be
+     * freed whenever the entry is rebuilt or destroyed.  However, we don't
+     * assign to rd_partdesc until the cached data structure is fully complete
+     * and valid, so that no other code might try to use it.
      */
-    rel->rd_pdcxt = AllocSetContextCreate(CacheMemoryContext,
-                                          "partition descriptor",
-                                          ALLOCSET_SMALL_SIZES);
-    MemoryContextCopyAndSetIdentifier(rel->rd_pdcxt,
+    new_pdcxt = AllocSetContextCreate(CacheMemoryContext,
+                                      "partition descriptor",
+                                      ALLOCSET_SMALL_SIZES);
+    MemoryContextCopyAndSetIdentifier(new_pdcxt,
                                       RelationGetRelationName(rel));

+    /*
+     * This is a hack and a half: if there's an old rd_pdcxt, it might contain
+     * an old partition descriptor that is still being referenced somewhere.
+     * Preserve it, while not leaking it, by reattaching it as a child context
+     * of the new rd_pdcxt.  Eventually somebody will clean it up.  See
+     * related hack in RelationClearRelation.
+     */
+    rel->rd_partdesc = NULL;    /* let's just make sure of this */
+    if (rel->rd_pdcxt != NULL)
+        MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
+    rel->rd_pdcxt = new_pdcxt;
+
+    /*
+     * Now copy everything into new_pdcxt.  Note it's critical that none of
+     * this code do any catalog access, else we could end up recursing here.
+     */
     partdesc = (PartitionDescData *)
-        MemoryContextAllocZero(rel->rd_pdcxt, sizeof(PartitionDescData));
+        MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
     partdesc->nparts = nparts;
     /* If there are no partitions, the rest of the partdesc can stay zero */
     if (nparts > 0)
     {
-        /* Create PartitionBoundInfo, using the caller's context. */
-        boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
-
-        /* Now copy all info into relcache's partdesc. */
-        oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt);
+        oldcxt = MemoryContextSwitchTo(new_pdcxt);
         partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
         partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
         partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
-        MemoryContextSwitchTo(oldcxt);

         /*
          * Assign OIDs from the original array into mapped indexes of the
          * result array.  The order of OIDs in the former is defined by the
          * catalog scan that retrieved them, whereas that in the latter is
          * defined by canonicalized representation of the partition bounds.
-         *
-         * Also record leaf-ness of each partition.  For this we use
-         * get_rel_relkind() which may leak memory, so be sure to run it in
-         * the caller's context.
+         * Also save leaf-ness of each partition.
          */
         for (i = 0; i < nparts; i++)
         {
             int            index = mapping[i];

             partdesc->oids[index] = oids[i];
-            partdesc->is_leaf[index] =
-                (get_rel_relkind(oids[i]) != RELKIND_PARTITIONED_TABLE);
+            partdesc->is_leaf[index] = is_leaf[i];
         }
+        MemoryContextSwitchTo(oldcxt);
     }

     rel->rd_partdesc = partdesc;
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 50f8912..0a693fef 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -1165,20 +1165,11 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
     relation->rd_fkeylist = NIL;
     relation->rd_fkeyvalid = false;

-    /* if a partitioned table, initialize key and partition descriptor info */
-    if (relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
-    {
-        RelationBuildPartitionKey(relation);
-        RelationBuildPartitionDesc(relation);
-    }
-    else
-    {
-        relation->rd_partkey = NULL;
-        relation->rd_partkeycxt = NULL;
-        relation->rd_partdesc = NULL;
-        relation->rd_pdcxt = NULL;
-    }
-    /* ... but partcheck is not loaded till asked for */
+    /* partitioning data is not loaded till asked for */
+    relation->rd_partkey = NULL;
+    relation->rd_partkeycxt = NULL;
+    relation->rd_partdesc = NULL;
+    relation->rd_pdcxt = NULL;
     relation->rd_partcheck = NIL;
     relation->rd_partcheckvalid = false;
     relation->rd_partcheckcxt = NULL;
@@ -2527,7 +2518,6 @@ RelationClearRelation(Relation relation, bool rebuild)
         bool        keep_rules;
         bool        keep_policies;
         bool        keep_partkey;
-        bool        keep_partdesc;

         /* Build temporary entry, but don't link it into hashtable */
         newrel = RelationBuildDesc(save_relid, false);
@@ -2560,9 +2550,6 @@ RelationClearRelation(Relation relation, bool rebuild)
         keep_policies = equalRSDesc(relation->rd_rsdesc, newrel->rd_rsdesc);
         /* partkey is immutable once set up, so we can always keep it */
         keep_partkey = (relation->rd_partkey != NULL);
-        keep_partdesc = equalPartitionDescs(relation->rd_partkey,
-                                            relation->rd_partdesc,
-                                            newrel->rd_partdesc);

         /*
          * Perform swapping of the relcache entry contents.  Within this
@@ -2617,34 +2604,46 @@ RelationClearRelation(Relation relation, bool rebuild)
         SWAPFIELD(Oid, rd_toastoid);
         /* pgstat_info must be preserved */
         SWAPFIELD(struct PgStat_TableStatus *, pgstat_info);
-        /* preserve old partitioning info if no logical change */
+        /* preserve old partition key if we have one */
         if (keep_partkey)
         {
             SWAPFIELD(PartitionKey, rd_partkey);
             SWAPFIELD(MemoryContext, rd_partkeycxt);
         }
-        if (keep_partdesc)
-        {
-            SWAPFIELD(PartitionDesc, rd_partdesc);
-            SWAPFIELD(MemoryContext, rd_pdcxt);
-        }
-        else if (rebuild && newrel->rd_pdcxt != NULL)
+        if (rebuild && newrel->rd_pdcxt != NULL)
         {
             /*
              * We are rebuilding a partitioned relation with a non-zero
              * reference count, so keep the old partition descriptor around,
              * in case there's a PartitionDirectory with a pointer to it.
-             * Attach it to the new rd_pdcxt so that it gets cleaned up
-             * eventually.  In the case where the reference count is 0, this
-             * code is not reached, which should be OK because in that case
-             * there should be no PartitionDirectory with a pointer to the old
-             * entry.
+             * (This is necessary because RelationGetPartitionDesc hands out
+             * direct pointers to the relcache's data structure, unlike our
+             * usual practice which is to hand out copies.  We'd have the same
+             * problem with rd_partkey, except that we always preserve that
+             * once created.)
+             *
+             * To ensure that it's not leaked completely, re-attach it to the
+             * new reldesc, or make it a child of the new reldesc's rd_pdcxt
+             * in the unlikely event that there is one already.  (See hack in
+             * RelationBuildPartitionDesc.)  Currently, it'll stick around
+             * until such time as we destroy the reldesc, or rebuild it while
+             * it has zero refcount.  That's pretty undesirable and should get
+             * fixed.
+             *
+             * In the case where the reference count is 0, this code is not
+             * reached, which should be OK because in that case there should
+             * be no PartitionDirectory with a pointer to the old entry.
              *
              * Note that newrel and relation have already been swapped, so the
              * "old" partition descriptor is actually the one hanging off of
              * newrel.
              */
-            MemoryContextSetParent(newrel->rd_pdcxt, relation->rd_pdcxt);
+            if (relation->rd_pdcxt != NULL) /* probably never happens */
+                MemoryContextSetParent(newrel->rd_pdcxt, relation->rd_pdcxt);
+            else
+                relation->rd_pdcxt = newrel->rd_pdcxt;
+            relation->rd_partdesc = NULL;    /* just to be sure */
+            /* drop newrel's pointers so we don't destroy it below */
             newrel->rd_partdesc = NULL;
             newrel->rd_pdcxt = NULL;
         }
@@ -3907,27 +3906,7 @@ RelationCacheInitializePhase3(void)
             restart = true;
         }

-        /*
-         * Reload the partition key and descriptor for a partitioned table.
-         */
-        if (relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE &&
-            relation->rd_partkey == NULL)
-        {
-            RelationBuildPartitionKey(relation);
-            Assert(relation->rd_partkey != NULL);
-
-            restart = true;
-        }
-
-        if (relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE &&
-            relation->rd_partdesc == NULL)
-        {
-            RelationBuildPartitionDesc(relation);
-            Assert(relation->rd_partdesc != NULL);
-
-            restart = true;
-        }
-
+        /* Reload tableam data if needed */
         if (relation->rd_tableam == NULL &&
             (relation->rd_rel->relkind == RELKIND_RELATION ||
              relation->rd_rel->relkind == RELKIND_SEQUENCE ||
@@ -5133,6 +5112,36 @@ RelationGetExclusionInfo(Relation indexRelation,
 }

 /*
+ * RelationGetPartitionKey -- get partition key, if relation is partitioned
+ */
+struct PartitionKeyData *
+RelationGetPartitionKey(Relation rel)
+{
+    if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+        return NULL;
+
+    if (rel->rd_partkey == NULL)
+        RelationBuildPartitionKey(rel);
+
+    return rel->rd_partkey;
+}
+
+/*
+ * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
+ */
+struct PartitionDescData *
+RelationGetPartitionDesc(Relation rel)
+{
+    if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+        return NULL;
+
+    if (rel->rd_partdesc == NULL)
+        RelationBuildPartitionDesc(rel);
+
+    return rel->rd_partdesc;
+}
+
+/*
  * Get publication actions for the given relation.
  */
 struct PublicationActions *
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 31d8a1a..aa8c625 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -596,20 +596,10 @@ typedef struct ViewOptions
      RelationNeedsWAL(relation) && \
      !IsCatalogRelation(relation))

-/*
- * RelationGetPartitionKey
- *        Returns the PartitionKey of a relation
- */
-#define RelationGetPartitionKey(relation) ((relation)->rd_partkey)
-
-/*
- * RelationGetPartitionDesc
- *        Returns partition descriptor for a relation.
- */
-#define RelationGetPartitionDesc(relation) ((relation)->rd_partdesc)
-
 /* routines in utils/cache/relcache.c */
 extern void RelationIncrementReferenceCount(Relation rel);
 extern void RelationDecrementReferenceCount(Relation rel);
+extern struct PartitionKeyData *RelationGetPartitionKey(Relation rel);
+extern struct PartitionDescData *RelationGetPartitionDesc(Relation rel);

 #endif                            /* REL_H */
diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out
index f630168..9d6d1ba 100644
--- a/src/test/regress/expected/create_table.out
+++ b/src/test/regress/expected/create_table.out
@@ -501,6 +501,22 @@ Partition of: partitioned2 FOR VALUES FROM ('-1', 'aaaaa') TO (100, 'ccccc')
 Partition constraint: (((a + 1) IS NOT NULL) AND (substr(b, 1, 5) IS NOT NULL) AND (((a + 1) > '-1'::integer) OR (((a
+1) = '-1'::integer) AND (substr(b, 1, 5) >= 'aaaaa'::text))) AND (((a + 1) < 100) OR (((a + 1) = 100) AND (substr(b,
1,5) < 'ccccc'::text)))) 

 DROP TABLE partitioned, partitioned2;
+-- check reference to partitioned table's rowtype in partition descriptor
+create table partitioned (a int, b int)
+  partition by list ((row(a, b)::partitioned));
+create table partitioned1
+  partition of partitioned for values in ('(1,2)'::partitioned);
+create table partitioned2
+  partition of partitioned for values in ('(2,4)'::partitioned);
+explain (costs off)
+select * from partitioned where row(a,b)::partitioned = '(1,2)'::partitioned;
+                        QUERY PLAN
+-----------------------------------------------------------
+ Seq Scan on partitioned1 partitioned
+   Filter: (ROW(a, b)::partitioned = '(1,2)'::partitioned)
+(2 rows)
+
+drop table partitioned;
 -- check that dependencies of partition columns are handled correctly
 create domain intdom1 as int;
 create table partitioned (
diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql
index e835b65..7ef646a 100644
--- a/src/test/regress/sql/create_table.sql
+++ b/src/test/regress/sql/create_table.sql
@@ -449,6 +449,17 @@ CREATE TABLE part2_1 PARTITION OF partitioned2 FOR VALUES FROM (-1, 'aaaaa') TO

 DROP TABLE partitioned, partitioned2;

+-- check reference to partitioned table's rowtype in partition descriptor
+create table partitioned (a int, b int)
+  partition by list ((row(a, b)::partitioned));
+create table partitioned1
+  partition of partitioned for values in ('(1,2)'::partitioned);
+create table partitioned2
+  partition of partitioned for values in ('(2,4)'::partitioned);
+explain (costs off)
+select * from partitioned where row(a,b)::partitioned = '(1,2)'::partitioned;
+drop table partitioned;
+
 -- check that dependencies of partition columns are handled correctly
 create domain intdom1 as int;


pgsql-hackers by date:

Previous
From: Nikolay Samokhvalov
Date:
Subject: Re: [PATCH] Increase the maximum value track_activity_query_size
Next
From: Tom Lane
Date:
Subject: Re: [PATCH] Increase the maximum value track_activity_query_size