From 40ded2d62287aa8fe936f2e1ba4eafa42a11a905 Mon Sep 17 00:00:00 2001 From: Anthony Hsu Date: Sun, 22 Feb 2026 18:26:01 +0000 Subject: [PATCH v2] Set 1s WaitLatch timeout if standby limit has expired in ResolveRecoveryConflictWithBufferPin When the startup process tries to acquire a cleanup lock and encounters a buffer pin conflict, it calls ResolveRecoveryConflictWithBufferPin. The startup process will wait up to the standby limit before broadcasting a PROCSIG_RECOVERY_CONFLICT_BUFFERPIN signal. Backends receiving this signal will cancel themselves if they are holding a conflicting buffer pin. However, a possible race scenario is: 1. Suppose the standby limit has expired, so the startup process broadcasts the PROCSIG_RECOVERY_CONFLICT_BUFFERPIN signal. 2. A backend receives the signal, finds that it does NOT hold a conflicting buffer pin, and does nothing. 3. This backend (or a new backend) _then_ acquires a conflicting buffer pin. Previously, the startup process would not resend PROCSIG_RECOVERY_CONFLICT_BUFFERPIN to the backend even though standby limit has already expired. The startup process would just keep waiting until the pin count is reduced to 1 and finally UnpinBuffer() wakes it up. This patch introduces a 1s timeout to the WaitLatch that startup does if the standby limit has already expired. This ensures if new pinners arrive after startup has already broadcast the PROCSIG_RECOVERY_CONFLICT_BUFFERPIN signal, startup will rebroadcast the signal periodically so that the new pinners will be notified to cancel themselves if they are blocking startup. --- src/backend/storage/ipc/standby.c | 22 +++++++++++++++------- src/backend/storage/lmgr/proc.c | 19 ++++++++++++++++--- src/include/storage/proc.h | 2 ++ 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index d83afbfb9d6..49b57a8aa1d 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -793,12 +793,14 @@ void ResolveRecoveryConflictWithBufferPin(void) { TimestampTz ltime; + bool standby_limit_expired; Assert(InHotStandby); ltime = GetStandbyLimitTime(); + standby_limit_expired = GetCurrentTimestamp() >= ltime && ltime != 0; - if (GetCurrentTimestamp() >= ltime && ltime != 0) + if (standby_limit_expired) { /* * We're already behind, so clear a path as quickly as possible. @@ -833,14 +835,20 @@ ResolveRecoveryConflictWithBufferPin(void) /* * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted - * by one of the timeouts established above. + * by one of the timeouts established above. If the standby limit has + * already expired, we also set a 1s timeout. This ensures that if backends + * acquire conflicting pins *after* processing the + * PROCSIG_RECOVERY_CONFLICT_BUFFERPIN signal, the startup process will + * resend the signal to notify these backends to cancel their queries. * - * We assume that only UnpinBuffer() and the timeout requests established - * above can wake us up here. WakeupRecovery() called by walreceiver or - * SIGHUP signal handler, etc cannot do that because it uses the different - * latch from that ProcWaitForSignal() waits on. + * We assume that only UnpinBuffer(), the timeout requests established + * above, and the 1s timeout (if standby limit has expired) can wake us up + * here. WakeupRecovery() called by walreceiver or SIGHUP signal handler, + * etc cannot do that because it uses the different latch from that + * ProcWaitForSignalWithTimeout() waits on. */ - ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP); + ProcWaitForSignalWithTimeout(WAIT_EVENT_BUFFER_CLEANUP, + standby_limit_expired ? 1000 : 0); if (got_standby_delay_timeout) SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN); diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index e2f34075d39..f2112baf710 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -1962,16 +1962,29 @@ GetLockHoldersAndWaiters(LOCALLOCK *locallock, StringInfo lock_holders_sbuf, /* * ProcWaitForSignal - wait for a signal from another backend. + */ +void +ProcWaitForSignal(uint32 wait_event_info) +{ + ProcWaitForSignalWithTimeout(wait_event_info, 0); +} + +/* + * ProcWaitForSignalWithTimeout - wait for a signal from another backend with a + * timeout. * * As this uses the generic process latch the caller has to be robust against * unrelated wakeups: Always check that the desired state has occurred, and * wait again if not. */ void -ProcWaitForSignal(uint32 wait_event_info) +ProcWaitForSignalWithTimeout(uint32 wait_event_info, long timeout_ms) { - (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, - wait_event_info); + int wake_events = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH; + if (timeout_ms > 0) + wake_events |= WL_TIMEOUT; + + (void) WaitLatch(MyLatch, wake_events, timeout_ms, wait_event_info); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); } diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index a8d2e7db1a1..c68698817a7 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -576,6 +576,8 @@ extern void GetLockHoldersAndWaiters(LOCALLOCK *locallock, int *lockHoldersNum); extern void ProcWaitForSignal(uint32 wait_event_info); +extern void ProcWaitForSignalWithTimeout(uint32 wait_event_info, + long timeout_ms); extern void ProcSendSignal(ProcNumber procNumber); extern PGPROC *AuxiliaryPidGetProc(int pid); -- 2.53.0.345.g96ddfc5eaa-goog