Re: BUG #16378: Invalid memory access on interrupting CLUSTER after CREATE TEMP TABLE - Mailing list pgsql-bugs

From Tom Lane
Subject Re: BUG #16378: Invalid memory access on interrupting CLUSTER after CREATE TEMP TABLE
Date
Msg-id 11537.1587408480@sss.pgh.pa.us
Whole thread Raw
In response to Re: BUG #16378: Invalid memory access on interrupting CLUSTER after CREATE TEMP TABLE  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-bugs
I wrote:
> The difficulty is that the pendingReindexedIndexes list is kept in
> some transaction-local context, so it gets flushed during the transaction
> abort that is the first step of proc_exit processing.  But the static
> pointer to it is still set, causing big problems if we do any system
> catalog accesses later --- like, say, while dropping the session's
> temp tables.

> One idea would be to keep the list in TopMemoryContext, but that feels
> like a band-aid solution.  I think more likely what we ought to do is
> stop trying to use a PG_TRY in reindex_relation to drive cleanup,
> and instead hook ResetReindexPending into transaction abort processing
> honestly.

I spent some more time looking at this.  The "band-aid solution" is no
solution at all; consider the situation where we have some temp tables
and we get a SIGTERM while doing a REINDEX on pg_class.  If we don't
reset the reindex state then our attempts to access pg_class indexes
while clearing out temp tables will fail.

There seem to be two distinct routes to a non-band-aid solution:

1. Instead of PG_TRY, use PG_ENSURE_ERROR_CLEANUP to ensure the reindex
state gets cleared.  This will work since the cleanup code will get
run before any other proc_exit callbacks.  However, the fact that the
reindex state is getting propagated to parallel workers makes me fear
that this solution is inadequate, because the parallel workers won't
have any such cleanup callback.  If they try to do any catalog access
during transaction abort they might have problems.  (Parallel reindex
of a system catalog is mind-bendingly scary anyway, but I don't want
this patch to make it worse.)

2. Clear the reindex state early in transaction abort, as suggested above
and as implemented by the patch below.  This adds a few cycles to xact
abort, which is what the previous coding was trying to avoid, but I think
we are just stuck with doing that.

Thoughts?

            regards, tom lane

diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 6b1ae1f..a979256 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -30,6 +30,7 @@
 #include "access/xlog.h"
 #include "access/xloginsert.h"
 #include "access/xlogutils.h"
+#include "catalog/index.h"
 #include "catalog/namespace.h"
 #include "catalog/pg_enum.h"
 #include "catalog/storage.h"
@@ -2662,6 +2663,9 @@ AbortTransaction(void)
      */
     SetUserIdAndSecContext(s->prevUser, s->prevSecContext);

+    /* Forget about any active REINDEX. */
+    ResetReindexState();
+
     /* If in parallel mode, clean up workers and exit parallel mode. */
     if (IsInParallelMode())
     {
@@ -4961,6 +4965,9 @@ AbortSubTransaction(void)
      */
     SetUserIdAndSecContext(s->prevUser, s->prevSecContext);

+    /* Forget about any active REINDEX. */
+    ResetReindexState();
+
     /* Exit from parallel mode, if necessary. */
     if (IsInParallelMode())
     {
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index bd7ec92..321d9d6 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -131,7 +131,6 @@ static void SetReindexProcessing(Oid heapOid, Oid indexOid);
 static void ResetReindexProcessing(void);
 static void SetReindexPending(List *indexes);
 static void RemoveReindexPending(Oid indexOid);
-static void ResetReindexPending(void);


 /*
@@ -3525,9 +3524,6 @@ reindex_index(Oid indexId, bool skip_constraint_checks, char persistence,
         indexInfo->ii_ExclusionStrats = NULL;
     }

-    /* ensure SetReindexProcessing state isn't leaked */
-    PG_TRY();
-    {
         /* Suppress use of the target index while rebuilding it */
         SetReindexProcessing(heapId, indexId);

@@ -3537,13 +3533,9 @@ reindex_index(Oid indexId, bool skip_constraint_checks, char persistence,
         /* Initialize the index and rebuild */
         /* Note: we do not need to re-establish pkey setting */
         index_build(heapRelation, iRel, indexInfo, true, true);
-    }
-    PG_FINALLY();
-    {
-        /* Make sure flag gets cleared on error exit */
+
+        /* Re-allow use of target index */
         ResetReindexProcessing();
-    }
-    PG_END_TRY();

     /*
      * If the index is marked invalid/not-ready/dead (ie, it's from a failed
@@ -3715,7 +3707,6 @@ reindex_relation(Oid relid, int flags, int options)
      */
     indexIds = RelationGetIndexList(rel);

-    PG_TRY();
     {
         ListCell   *indexId;
         char        persistence;
@@ -3780,12 +3771,6 @@ reindex_relation(Oid relid, int flags, int options)
             i++;
         }
     }
-    PG_FINALLY();
-    {
-        /* Make sure list gets cleared on error exit */
-        ResetReindexPending();
-    }
-    PG_END_TRY();

     /*
      * Close rel, but continue to hold the lock.
@@ -3819,6 +3804,7 @@ reindex_relation(Oid relid, int flags, int options)
 static Oid    currentlyReindexedHeap = InvalidOid;
 static Oid    currentlyReindexedIndex = InvalidOid;
 static List *pendingReindexedIndexes = NIL;
+static int    reindexingNestLevel = 0;

 /*
  * ReindexIsProcessingHeap
@@ -3855,8 +3841,6 @@ ReindexIsProcessingIndex(Oid indexOid)
 /*
  * SetReindexProcessing
  *        Set flag that specified heap/index are being reindexed.
- *
- * NB: caller must use a PG_TRY block to ensure ResetReindexProcessing is done.
  */
 static void
 SetReindexProcessing(Oid heapOid, Oid indexOid)
@@ -3869,6 +3853,8 @@ SetReindexProcessing(Oid heapOid, Oid indexOid)
     currentlyReindexedIndex = indexOid;
     /* Index is no longer "pending" reindex. */
     RemoveReindexPending(indexOid);
+    /* This may have been set already, but in case it isn't, do so now. */
+    reindexingNestLevel = GetCurrentTransactionNestLevel();
 }

 /*
@@ -3878,17 +3864,16 @@ SetReindexProcessing(Oid heapOid, Oid indexOid)
 static void
 ResetReindexProcessing(void)
 {
-    /* This may be called in leader error path */
     currentlyReindexedHeap = InvalidOid;
     currentlyReindexedIndex = InvalidOid;
+    /* reindexingNestLevel remains set till end of (sub)transaction */
 }

 /*
  * SetReindexPending
  *        Mark the given indexes as pending reindex.
  *
- * NB: caller must use a PG_TRY block to ensure ResetReindexPending is done.
- * Also, we assume that the current memory context stays valid throughout.
+ * NB: we assume that the current memory context stays valid throughout.
  */
 static void
 SetReindexPending(List *indexes)
@@ -3899,6 +3884,7 @@ SetReindexPending(List *indexes)
     if (IsInParallelMode())
         elog(ERROR, "cannot modify reindex state during a parallel operation");
     pendingReindexedIndexes = list_copy(indexes);
+    reindexingNestLevel = GetCurrentTransactionNestLevel();
 }

 /*
@@ -3915,14 +3901,32 @@ RemoveReindexPending(Oid indexOid)
 }

 /*
- * ResetReindexPending
- *        Unset reindex-pending status.
+ * ResetReindexState
+ *        Clear all reindexing state during (sub)transaction abort.
  */
-static void
-ResetReindexPending(void)
+void
+ResetReindexState(void)
 {
-    /* This may be called in leader error path */
-    pendingReindexedIndexes = NIL;
+    /*
+     * Because reindexing is not re-entrant, we don't need to cope with nested
+     * reindexing states.  We just need to avoid messing up the outer-level
+     * state in case a subtransaction fails within a REINDEX.  So checking the
+     * current nest level against that of the reindex operation is sufficient.
+     */
+    if (reindexingNestLevel >= GetCurrentTransactionNestLevel())
+    {
+        currentlyReindexedHeap = InvalidOid;
+        currentlyReindexedIndex = InvalidOid;
+
+        /*
+         * We needn't try to release the contents of pendingReindexedIndexes;
+         * that list should be in a transaction-lifespan context, so it will
+         * go away automatically.
+         */
+        pendingReindexedIndexes = NIL;
+
+        reindexingNestLevel = 0;
+    }
 }

 /*
@@ -3975,4 +3979,7 @@ RestoreReindexState(void *reindexstate)
             lappend_oid(pendingReindexedIndexes,
                         sistate->pendingReindexedIndexes[c]);
     MemoryContextSwitchTo(oldcontext);
+
+    /* Note the worker has its own transaction nesting level */
+    reindexingNestLevel = GetCurrentTransactionNestLevel();
 }
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index a2890c1..887d300 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -131,6 +131,8 @@ extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot);

 extern void index_set_state_flags(Oid indexId, IndexStateFlagsAction action);

+extern Oid    IndexGetRelation(Oid indexId, bool missing_ok);
+
 extern void reindex_index(Oid indexId, bool skip_constraint_checks,
                           char relpersistence, int options);

@@ -145,8 +147,8 @@ extern bool reindex_relation(Oid relid, int flags, int options);

 extern bool ReindexIsProcessingHeap(Oid heapOid);
 extern bool ReindexIsProcessingIndex(Oid indexOid);
-extern Oid    IndexGetRelation(Oid indexId, bool missing_ok);

+extern void ResetReindexState(void);
 extern Size EstimateReindexStateSpace(void);
 extern void SerializeReindexState(Size maxsize, char *start_address);
 extern void RestoreReindexState(void *reindexstate);

pgsql-bugs by date:

Previous
From: "David G. Johnston"
Date:
Subject: Re: NOTIFY in multi-statement PQexec() not sent outside of transaction
Next
From: Andres Freund
Date:
Subject: Re: BUG #16112: large, unexpected memory consumption