From e06e44ce9930793f5a0383580db8ebb3e9b6a6b4 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Tue, 31 Jan 2023 10:25:57 +0900 Subject: [PATCH v3] Fix a race condition of updating procArray->replication_slot_xmin. Previously, if already_locked is false, ReplicationSlotsComputeRequiredXmin() computed the oldest xmin across all slots while not holding the ProcArrayLock and acquires the ProcArrayLock just before updating the replication_slot_xmin. Therefore, it was possible that by the time a process computes the oldest xmin and before updating the replication_slot_xmin, another process computes and updates it. As a result, the replication_slot_xmin could be overwritten with an old value and retreated. In the reported failure, after a walsender who was creating a replication slot updated the replication_slot_xmin via CreateInitDecodingContext(), another walsender overwrote it with InvalidTransactionId. Then the walsender creating the replication slot ended up computing the oldest safe decoding transaction id without considering the replication_slot_xmin. That led to an error "cannot build an initial slot snapshot as oldest safe xid %u follows snapshot's xmin %u", which was an assertion failure prior to 240e0dbacd3. This commit changes ReplicationSlotsComputeRequiredXmin() so that it computes the oldest xmin while holding the ProcArrayLock in exclusive mode. We keep already_locked parameter in ProcArraySetReplicationSlotXmin() on backbranches to not break ABI compatibility. Discussion: https://postgr.es/m/CAA4eK1L8wYcyTPxNzPGkhuO52WBGoOZbT0A73Le=ZUWYAYmdfw@mail.gmail.com Backpatch-through: 11 --- src/backend/replication/slot.c | 15 ++++++++++++++- src/backend/storage/ipc/procarray.c | 13 +++---------- src/include/storage/procarray.h | 2 +- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index f286918f69..063f6aa95c 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -840,6 +840,16 @@ ReplicationSlotsComputeRequiredXmin(bool already_locked) Assert(ReplicationSlotCtl != NULL); + /* + * It is possible that by the time we compute the agg_xmin here and before + * updating replication_slot_xmin, the CreateInitDecodingContext() will + * compute and update replication_slot_xmin. So, we need to acquire + * ProcArrayLock here to avoid retreating the value of + * replication_slot_xmin. + */ + if (!already_locked) + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) @@ -878,7 +888,10 @@ ReplicationSlotsComputeRequiredXmin(bool already_locked) LWLockRelease(ReplicationSlotControlLock); - ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin, already_locked); + ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin); + + if (!already_locked) + LWLockRelease(ProcArrayLock); } /* diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 4340bf9641..a9e4f59440 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -3896,23 +3896,16 @@ TerminateOtherDBBackends(Oid databaseId) * * Install limits to future computations of the xmin horizon to prevent vacuum * and HOT pruning from removing affected rows still needed by clients with - * replication slots. + * replication slots. The caller must hold ProcArrayLock in exclusive mode. */ void -ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin, - bool already_locked) +ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin) { - Assert(!already_locked || LWLockHeldByMe(ProcArrayLock)); - - if (!already_locked) - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + Assert(LWLockHeldByMeInMode(ProcArrayLock, LW_EXCLUSIVE)); procArray->replication_slot_xmin = xmin; procArray->replication_slot_catalog_xmin = catalog_xmin; - if (!already_locked) - LWLockRelease(ProcArrayLock); - elog(DEBUG1, "xmin required by slots: data %u, catalog %u", xmin, catalog_xmin); } diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index d8cae3ce1c..b7554f1b53 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -91,7 +91,7 @@ extern void XidCacheRemoveRunningXids(TransactionId xid, TransactionId latestXid); extern void ProcArraySetReplicationSlotXmin(TransactionId xmin, - TransactionId catalog_xmin, bool already_locked); + TransactionId catalog_xmin); extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin, TransactionId *catalog_xmin); -- 2.31.1