Re: Startup process on a hot standby crashes with an error "invalid memory alloc request size 1073741824" while replaying "Standby/LOCK" records - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Startup process on a hot standby crashes with an error "invalid memory alloc request size 1073741824" while replaying "Standby/LOCK" records
Date
Msg-id 2208668.1664927591@sss.pgh.pa.us
Whole thread Raw
In response to Re: Startup process on a hot standby crashes with an error "invalid memory alloc request size 1073741824" while replaying "Standby/LOCK" records  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: Startup process on a hot standby crashes with an error "invalid memory alloc request size 1073741824" while replaying "Standby/LOCK" records  (Nathan Bossart <nathandbossart@gmail.com>)
List pgsql-hackers
I wrote:
> PFA a quick-hack fix that solves this issue by making per-transaction
> subsidiary hash tables.  That's overkill perhaps; I'm a little worried
> about whether this slows down normal cases more than it's worth.
> But we ought to do something about this, because aside from the
> duplication aspect the current storage of these lists seems mighty
> space-inefficient.

After further thought, maybe it'd be better to do it as attached,
with one long-lived hash table for all the locks.  This is a shade
less space-efficient than the current code once you account for
dynahash overhead, but the per-transaction overhead should be lower
than the previous patch since we only need to create/destroy a hash
table entry not a whole hash table.

            regards, tom lane

diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 9dab931990..7db86f7885 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -42,7 +42,29 @@ int            max_standby_archive_delay = 30 * 1000;
 int            max_standby_streaming_delay = 30 * 1000;
 bool        log_recovery_conflict_waits = false;

-static HTAB *RecoveryLockLists;
+/*
+ * Keep track of all the exclusive locks owned by original transactions.
+ * For each known exclusive lock, there is a RecoveryLockEntry in the
+ * RecoveryLockHash hash table.  All RecoveryLockEntrys belonging to a
+ * given XID are chained together so that we can find them easily.
+ * For each original transaction that is known to have any such locks,
+ * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
+ * which stores the head of the chain of its locks.
+ */
+typedef struct RecoveryLockEntry
+{
+    xl_standby_lock key;        /* hash key: xid, dbOid, relOid */
+    struct RecoveryLockEntry *next; /* chain link */
+} RecoveryLockEntry;
+
+typedef struct RecoveryLockXidEntry
+{
+    TransactionId xid;            /* hash key -- must be first */
+    struct RecoveryLockEntry *head; /* chain head */
+} RecoveryLockXidEntry;
+
+static HTAB *RecoveryLockHash = NULL;
+static HTAB *RecoveryLockXidHash = NULL;

 /* Flags set by timeout handlers */
 static volatile sig_atomic_t got_standby_deadlock_timeout = false;
@@ -58,15 +80,6 @@ static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
 static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
 static const char *get_recovery_conflict_desc(ProcSignalReason reason);

-/*
- * Keep track of all the locks owned by a given transaction.
- */
-typedef struct RecoveryLockListsEntry
-{
-    TransactionId xid;
-    List       *locks;
-} RecoveryLockListsEntry;
-
 /*
  * InitRecoveryTransactionEnvironment
  *        Initialize tracking of our primary's in-progress transactions.
@@ -85,16 +98,24 @@ InitRecoveryTransactionEnvironment(void)
     VirtualTransactionId vxid;
     HASHCTL        hash_ctl;

+    Assert(RecoveryLockHash == NULL);    /* don't run this twice */
+
     /*
-     * Initialize the hash table for tracking the list of locks held by each
+     * Initialize the hash tables for tracking the locks held by each
      * transaction.
      */
+    hash_ctl.keysize = sizeof(xl_standby_lock);
+    hash_ctl.entrysize = sizeof(RecoveryLockEntry);
+    RecoveryLockHash = hash_create("RecoveryLockHash",
+                                   64,
+                                   &hash_ctl,
+                                   HASH_ELEM | HASH_BLOBS);
     hash_ctl.keysize = sizeof(TransactionId);
-    hash_ctl.entrysize = sizeof(RecoveryLockListsEntry);
-    RecoveryLockLists = hash_create("RecoveryLockLists",
-                                    64,
-                                    &hash_ctl,
-                                    HASH_ELEM | HASH_BLOBS);
+    hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
+    RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
+                                      64,
+                                      &hash_ctl,
+                                      HASH_ELEM | HASH_BLOBS);

     /*
      * Initialize shared invalidation management for Startup process, being
@@ -140,12 +161,12 @@ void
 ShutdownRecoveryTransactionEnvironment(void)
 {
     /*
-     * Do nothing if RecoveryLockLists is NULL because which means that
-     * transaction tracking has not been yet initialized or has been already
-     * shutdowned. This prevents transaction tracking from being shutdowned
-     * unexpectedly more than once.
+     * Do nothing if RecoveryLockHash is NULL because that means that
+     * transaction tracking has not yet been initialized or has already been
+     * shut down.  This makes it safe to have possibly-redundant calls of this
+     * function during process exit.
      */
-    if (RecoveryLockLists == NULL)
+    if (RecoveryLockHash == NULL)
         return;

     /* Mark all tracked in-progress transactions as finished. */
@@ -154,9 +175,11 @@ ShutdownRecoveryTransactionEnvironment(void)
     /* Release all locks the tracked transactions were holding */
     StandbyReleaseAllLocks();

-    /* Destroy the hash table of locks. */
-    hash_destroy(RecoveryLockLists);
-    RecoveryLockLists = NULL;
+    /* Destroy the lock hash tables. */
+    hash_destroy(RecoveryLockHash);
+    hash_destroy(RecoveryLockXidHash);
+    RecoveryLockHash = NULL;
+    RecoveryLockXidHash = NULL;

     /* Cleanup our VirtualTransaction */
     VirtualXactLockTableCleanup();
@@ -932,12 +955,12 @@ StandbyLockTimeoutHandler(void)
  * We only keep track of AccessExclusiveLocks, which are only ever held by
  * one transaction on one relation.
  *
- * We keep a hash table of lists of locks in local memory keyed by xid,
- * RecoveryLockLists, so we can keep track of the various entries made by
- * the Startup process's virtual xid in the shared lock table.
- *
- * List elements use type xl_standby_lock, since the WAL record type exactly
- * matches the information that we need to keep track of.
+ * We keep a table of known locks in the RecoveryLockHash hash table.
+ * The point of that table is to let us efficiently de-duplicate locks,
+ * which is important because checkpoints will re-report the same locks
+ * already held.  There is also a RecoveryLockXidHash table with one entry
+ * per xid, which allows us to efficiently find all the locks held by a
+ * given original transaction.
  *
  * We use session locks rather than normal locks so we don't need
  * ResourceOwners.
@@ -947,8 +970,9 @@ StandbyLockTimeoutHandler(void)
 void
 StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
 {
-    RecoveryLockListsEntry *entry;
-    xl_standby_lock *newlock;
+    RecoveryLockXidEntry *xidentry;
+    RecoveryLockEntry *lockentry;
+    xl_standby_lock key;
     LOCKTAG        locktag;
     bool        found;

@@ -964,62 +988,79 @@ StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
     /* dbOid is InvalidOid when we are locking a shared relation. */
     Assert(OidIsValid(relOid));

-    /* Create a new list for this xid, if we don't have one already. */
-    entry = hash_search(RecoveryLockLists, &xid, HASH_ENTER, &found);
+    /* Create a hash entry for this xid, if we don't have one already. */
+    xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
     if (!found)
     {
-        entry->xid = xid;
-        entry->locks = NIL;
+        Assert(xidentry->xid == xid);    /* dynahash should have set this */
+        xidentry->head = NULL;
     }

-    newlock = palloc(sizeof(xl_standby_lock));
-    newlock->xid = xid;
-    newlock->dbOid = dbOid;
-    newlock->relOid = relOid;
-    entry->locks = lappend(entry->locks, newlock);
+    /* Create a hash entry for this lock, unless we have one already. */
+    key.xid = xid;
+    key.dbOid = dbOid;
+    key.relOid = relOid;
+    lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
+    if (!found)
+    {
+        /* It's new, so link it into the XID's list ... */
+        lockentry->next = xidentry->head;
+        xidentry->head = lockentry;

-    SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
+        /* ... and acquire the lock locally. */
+        SET_LOCKTAG_RELATION(locktag, dbOid, relOid);

-    (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
+        (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
+    }
 }

+/*
+ * Release all the locks associated with this RecoveryLockXidEntry.
+ */
 static void
-StandbyReleaseLockList(List *locks)
+StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
 {
-    ListCell   *lc;
+    RecoveryLockEntry *entry;
+    RecoveryLockEntry *next;

-    foreach(lc, locks)
+    for (entry = xidentry->head; entry != NULL; entry = next)
     {
-        xl_standby_lock *lock = (xl_standby_lock *) lfirst(lc);
         LOCKTAG        locktag;

         elog(trace_recovery(DEBUG4),
              "releasing recovery lock: xid %u db %u rel %u",
-             lock->xid, lock->dbOid, lock->relOid);
-        SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
+             entry->key.xid, entry->key.dbOid, entry->key.relOid);
+        /* Release the lock ... */
+        SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
         if (!LockRelease(&locktag, AccessExclusiveLock, true))
         {
             elog(LOG,
-                 "RecoveryLockLists contains entry for lock no longer recorded by lock manager: xid %u database %u
relation%u", 
-                 lock->xid, lock->dbOid, lock->relOid);
+                 "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u
relation%u", 
+                 entry->key.xid, entry->key.dbOid, entry->key.relOid);
             Assert(false);
         }
+        /* ... and remove the per-lock hash entry */
+        next = entry->next;
+        hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
     }

-    list_free_deep(locks);
+    xidentry->head = NULL;        /* just for paranoia */
 }

+/*
+ * Release locks for specific XID, or all locks if it's InvalidXid.
+ */
 static void
 StandbyReleaseLocks(TransactionId xid)
 {
-    RecoveryLockListsEntry *entry;
+    RecoveryLockXidEntry *entry;

     if (TransactionIdIsValid(xid))
     {
-        if ((entry = hash_search(RecoveryLockLists, &xid, HASH_FIND, NULL)))
+        if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
         {
-            StandbyReleaseLockList(entry->locks);
-            hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL);
+            StandbyReleaseXidEntryLocks(entry);
+            hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
         }
     }
     else
@@ -1028,7 +1069,7 @@ StandbyReleaseLocks(TransactionId xid)

 /*
  * Release locks for a transaction tree, starting at xid down, from
- * RecoveryLockLists.
+ * RecoveryLockXidHash.
  *
  * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
  * to remove any AccessExclusiveLocks requested by a transaction.
@@ -1051,15 +1092,15 @@ void
 StandbyReleaseAllLocks(void)
 {
     HASH_SEQ_STATUS status;
-    RecoveryLockListsEntry *entry;
+    RecoveryLockXidEntry *entry;

     elog(trace_recovery(DEBUG2), "release all standby locks");

-    hash_seq_init(&status, RecoveryLockLists);
+    hash_seq_init(&status, RecoveryLockXidHash);
     while ((entry = hash_seq_search(&status)))
     {
-        StandbyReleaseLockList(entry->locks);
-        hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL);
+        StandbyReleaseXidEntryLocks(entry);
+        hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
     }
 }

@@ -1072,9 +1113,9 @@ void
 StandbyReleaseOldLocks(TransactionId oldxid)
 {
     HASH_SEQ_STATUS status;
-    RecoveryLockListsEntry *entry;
+    RecoveryLockXidEntry *entry;

-    hash_seq_init(&status, RecoveryLockLists);
+    hash_seq_init(&status, RecoveryLockXidHash);
     while ((entry = hash_seq_search(&status)))
     {
         Assert(TransactionIdIsValid(entry->xid));
@@ -1088,8 +1129,8 @@ StandbyReleaseOldLocks(TransactionId oldxid)
             continue;

         /* Remove all locks and hash table entry. */
-        StandbyReleaseLockList(entry->locks);
-        hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL);
+        StandbyReleaseXidEntryLocks(entry);
+        hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
     }
 }


pgsql-hackers by date:

Previous
From: Andres Freund
Date:
Subject: Re: problems with making relfilenodes 56-bits
Next
From: Nathan Bossart
Date:
Subject: Re: Startup process on a hot standby crashes with an error "invalid memory alloc request size 1073741824" while replaying "Standby/LOCK" records