From f413cac950920bfdaf22dacd4762aec4810c734f Mon Sep 17 00:00:00 2001 From: alterego655 <824662526@qq.com> Date: Fri, 31 Oct 2025 18:17:48 +0800 Subject: [PATCH v3 1/3] Fix unconditional walreceiver shutdown during stream-archive transition Commit 3635a0a introduced unconditional walreceiver shutdown when switching from streaming to archive WAL sources. This causes problems during timeline divergence, when walreceiver enters WALRCV_WAITING state but remains alive. The unconditional shutdown breaks monitoring: walreceiver gets repeatedly terminated and respawned, causing pg_stat_wal_receiver.status to show 'streaming' instead of 'waiting', masking the underlying replication problem. In the worst case, with synchronous replication, this can lead to unwritable clusters when the standby reports false readiness. Fix by making the shutdown conditional: only terminate walreceiver when it's actively streaming (WALRCV_STREAMING, WALRCV_STARTING, or WALRCV_RESTARTING). When in WALRCV_WAITING state, just reset the InstallXLogFileSegmentActive flag to allow archive restoration without killing the process. --- src/backend/access/transam/xlog.c | 14 ++++++++++---- src/backend/access/transam/xlogrecovery.c | 13 ++++++++++++- src/include/access/xlog.h | 1 + src/test/recovery/t/004_timeline_switch.pl | 5 +++++ .../recovery/t/040_standby_failover_slots_sync.pl | 13 +++++++++++++ 5 files changed, 41 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index fd91bcd68ec..30a8faeb331 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9514,17 +9514,23 @@ GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli) LWLockRelease(ControlFileLock); } -/* Thin wrapper around ShutdownWalRcv(). */ +/* Reset the InstallXLogFileSegmentActive flag without shutting down walreceiver. 
*/ void -XLogShutdownWalRcv(void) +ResetInstallXLogFileSegmentActive(void) { - ShutdownWalRcv(); - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); XLogCtl->InstallXLogFileSegmentActive = false; LWLockRelease(ControlFileLock); } +/* Thin wrapper around ShutdownWalRcv(). */ +void +XLogShutdownWalRcv(void) +{ + ShutdownWalRcv(); + ResetInstallXLogFileSegmentActive(); +} + /* Enable WAL file recycling and preallocation. */ void SetInstallXLogFileSegmentActive(void) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 3e3c4da01a2..d6be98c6e62 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -3687,8 +3687,19 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, * Before we leave XLOG_FROM_STREAM state, make sure that * walreceiver is not active, so that it won't overwrite * WAL that we restore from archive. + * + * If walreceiver is actively streaming (or attempting to + * connect), we must shut it down. However, if it's already + * in WAITING state (e.g., due to timeline divergence), we + * only need to reset the install flag to allow archive + * restoration. 
*/ - XLogShutdownWalRcv(); + if (WalRcvStreaming()) + XLogShutdownWalRcv(); + else + { + ResetInstallXLogFileSegmentActive(); + } /* * Before we sleep, re-scan for possible new timelines if diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index a12757e46e5..605280ed8fb 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -269,6 +269,7 @@ extern void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI extern void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli); extern void SetInstallXLogFileSegmentActive(void); extern bool IsInstallXLogFileSegmentActive(void); +extern void ResetInstallXLogFileSegmentActive(void); extern void XLogShutdownWalRcv(void); /* diff --git a/src/test/recovery/t/004_timeline_switch.pl b/src/test/recovery/t/004_timeline_switch.pl index 9c8334cf278..8baa4f2d5fe 100644 --- a/src/test/recovery/t/004_timeline_switch.pl +++ b/src/test/recovery/t/004_timeline_switch.pl @@ -66,6 +66,11 @@ my $result = $node_standby_2->safe_psql('postgres', "SELECT count(*) FROM tab_int"); is($result, qq(2000), 'check content of standby 2'); +# Check the logs, WAL receiver should not have been stopped. There is no need +# to rely on a position in the logs: a new log file is used on node restart. +ok( !$node_standby_2->log_contains( + "FATAL: .* terminating walreceiver process due to administrator command"), + 'WAL receiver should not be stopped across timeline jumps'); # Ensure that a standby is able to follow a primary on a newer timeline # when WAL archiving is enabled. 
diff --git a/src/test/recovery/t/040_standby_failover_slots_sync.pl b/src/test/recovery/t/040_standby_failover_slots_sync.pl index 3059bb8177b..1c99d354e7a 100644 --- a/src/test/recovery/t/040_standby_failover_slots_sync.pl +++ b/src/test/recovery/t/040_standby_failover_slots_sync.pl @@ -344,8 +344,21 @@ like( ################################################## $standby1->append_conf('postgresql.conf', "primary_conninfo = '$connstr_1'"); + +# Capture the log position before reload to check for walreceiver termination +$log_offset = -s $standby1->logfile; + $standby1->reload; +# Wait for walreceiver to be stopped and restarted after config reload. +# When primary_conninfo changes, walreceiver should be terminated and +# a new one spawned. +$standby1->wait_for_log( + qr/FATAL: .* terminating walreceiver process due to administrator command/, + $log_offset); + +ok(1, 'walreceiver correctly terminated after primary_conninfo change'); + ($result, $stdout, $stderr) = $standby1->psql('postgres', "SELECT pg_sync_replication_slots();"); like( -- 2.51.0