From 38a9ec23af2dc43ad24d939bb015d28d550d71fd Mon Sep 17 00:00:00 2001 From: Atsushi Torikoshi Date: Wed, 12 Mar 2025 21:47:22 +0900 Subject: [PATCH v1] Make it clear when hot standby is inaccessible due to subtransaction overflow Previously, the log message for a rejected connection assumed that the recovery process had simply not yet reached a consistent point. However, even after the consistent point is reached, hot standby remains inaccessible while the recovery snapshot is pending because of a transaction with overflowed subtransactions. Since no log message indicated this reason, it was difficult to identify the cause. This patch handles that case explicitly, so that the cause is clear in the logs. --- src/backend/postmaster/postmaster.c | 29 ++++++++++++++++++++++------- src/backend/storage/ipc/procarray.c | 17 +++++++++++++++++ src/backend/tcop/backend_startup.c | 13 +++++++++++++ src/include/storage/pmsignal.h | 2 ++ src/include/tcop/backend_startup.h | 1 + 5 files changed, 55 insertions(+), 7 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index d2a7a7add6..5c3de3f97d 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -333,6 +333,8 @@ typedef enum PM_INIT, /* postmaster starting */ PM_STARTUP, /* waiting for startup subprocess */ PM_RECOVERY, /* in archive recovery mode */ + PM_SNAPSHOT_PENDING, /* in snapshot pending because of an + * overflowed subtransaction */ PM_HOT_STANDBY, /* in hot standby mode */ PM_RUN, /* normal "database is alive" state */ PM_STOP_BACKENDS, /* need to stop remaining backends */ @@ -1814,6 +1816,9 @@ canAcceptConnections(BackendType backend_type) else if (!FatalError && pmState == PM_RECOVERY) return CAC_NOTCONSISTENT; /* not yet at consistent recovery * state */ + else if (!FatalError && pmState == PM_SNAPSHOT_PENDING) + return CAC_SNAPSHOT_PENDING; /* waiting for non-overflowed + * snapshot */ else return CAC_RECOVERY; /* else must be crash recovery */ } @@ -2111,7 +2116,7 @@ 
process_pm_shutdown_request(void) */ if (pmState == PM_RUN || pmState == PM_HOT_STANDBY) connsAllowed = false; - else if (pmState == PM_STARTUP || pmState == PM_RECOVERY) + else if (pmState == PM_STARTUP || pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING) { /* There should be no clients, so proceed to stop children */ UpdatePMState(PM_STOP_BACKENDS); @@ -2145,7 +2150,7 @@ process_pm_shutdown_request(void) sd_notify(0, "STOPPING=1"); #endif - if (pmState == PM_STARTUP || pmState == PM_RECOVERY) + if (pmState == PM_STARTUP || pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING) { /* Just shut down background processes silently */ UpdatePMState(PM_STOP_BACKENDS); @@ -2711,6 +2716,7 @@ HandleFatalError(QuitSignalReason reason, bool consider_sigabrt) /* wait for children to die */ case PM_RECOVERY: + case PM_SNAPSHOT_PENDING: case PM_HOT_STANDBY: case PM_RUN: case PM_STOP_BACKENDS: @@ -3193,6 +3199,7 @@ pmstate_name(PMState state) PM_TOSTR_CASE(PM_INIT); PM_TOSTR_CASE(PM_STARTUP); PM_TOSTR_CASE(PM_RECOVERY); + PM_TOSTR_CASE(PM_SNAPSHOT_PENDING); PM_TOSTR_CASE(PM_HOT_STANDBY); PM_TOSTR_CASE(PM_RUN); PM_TOSTR_CASE(PM_STOP_BACKENDS); @@ -3245,7 +3252,7 @@ LaunchMissingBackgroundProcesses(void) * the shutdown checkpoint. That's done in PostmasterStateMachine(), not * here.) 
*/ - if (pmState == PM_RUN || pmState == PM_RECOVERY || + if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING || pmState == PM_HOT_STANDBY || pmState == PM_STARTUP) { if (CheckpointerPMChild == NULL) @@ -3281,7 +3288,7 @@ LaunchMissingBackgroundProcesses(void) */ if (PgArchPMChild == NULL && ((XLogArchivingActive() && pmState == PM_RUN) || - (XLogArchivingAlways() && (pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY))) && + (XLogArchivingAlways() && (pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING || pmState == PM_HOT_STANDBY))) && PgArchCanRestart()) PgArchPMChild = StartChildProcess(B_ARCHIVER); @@ -3313,7 +3320,7 @@ LaunchMissingBackgroundProcesses(void) if (WalReceiverRequested) { if (WalReceiverPMChild == NULL && - (pmState == PM_STARTUP || pmState == PM_RECOVERY || + (pmState == PM_STARTUP || pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING || pmState == PM_HOT_STANDBY) && Shutdown <= SmartShutdown) { @@ -3663,8 +3670,15 @@ process_pm_pmsignal(void) UpdatePMState(PM_RECOVERY); } - if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) && + if (CheckPostmasterSignal(PMSIGNAL_SNAPSHOT_PENDING) && pmState == PM_RECOVERY && Shutdown == NoShutdown) + { + UpdatePMState(PM_SNAPSHOT_PENDING); + } + + if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) && + (pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING) && + Shutdown == NoShutdown) { ereport(LOG, (errmsg("database system is ready to accept read-only connections"))); @@ -3806,7 +3820,7 @@ process_pm_pmsignal(void) } if (StartupPMChild != NULL && - (pmState == PM_STARTUP || pmState == PM_RECOVERY || + (pmState == PM_STARTUP || pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING || pmState == PM_HOT_STANDBY) && CheckPromoteSignal()) { @@ -4130,6 +4144,7 @@ bgworker_should_start_now(BgWorkerStartTime start_time) /* fall through */ case PM_RECOVERY: + case PM_SNAPSHOT_PENDING: case PM_STARTUP: case PM_INIT: if (start_time == BgWorkerStart_PostmasterStart) 
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 2e54c11f88..bb37ad2fc2 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -58,6 +58,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "port/pg_lfind.h" +#include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/procarray.h" #include "utils/acl.h" @@ -1125,11 +1126,19 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) "recovery snapshots are now enabled"); } else + { + /* + * Inform postmaster that we are waiting for a non-overflowed + * snapshot, so it can notify clients why the connection is + * not yet acceptable. + */ + SendPostmasterSignal(PMSIGNAL_SNAPSHOT_PENDING); elog(DEBUG1, "recovery snapshot waiting for non-overflowed snapshot or " "until oldest active xid on standby is at least %u (now %u)", standbySnapshotPendingXmin, running->oldestRunningXid); + } return; } } @@ -1303,11 +1312,19 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) if (standbyState == STANDBY_SNAPSHOT_READY) elog(DEBUG1, "recovery snapshots are now enabled"); else + { + /* + * Inform postmaster that we are waiting for a non-overflowed + * snapshot, so it can notify clients why the connection is not yet + * acceptable. 
+ */ + SendPostmasterSignal(PMSIGNAL_SNAPSHOT_PENDING); elog(DEBUG1, "recovery snapshot waiting for non-overflowed snapshot or " "until oldest active xid on standby is at least %u (now %u)", standbySnapshotPendingXmin, running->oldestRunningXid); + } } /* diff --git a/src/backend/tcop/backend_startup.c b/src/backend/tcop/backend_startup.c index c70746fa56..17e9708136 100644 --- a/src/backend/tcop/backend_startup.c +++ b/src/backend/tcop/backend_startup.c @@ -303,6 +303,19 @@ BackendInitialize(ClientSocket *client_sock, CAC_state cac) errmsg("the database system is not accepting connections"), errdetail("Hot standby mode is disabled."))); break; + case CAC_SNAPSHOT_PENDING: + if (EnableHotStandby) + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is not yet accepting connections"), + errdetail("Recovery snapshot is pending because of subtransaction overflow."), + errhint("Find and close a transaction with more than %d subtransactions on the primary server.", PGPROC_MAX_CACHED_SUBXIDS))); + else + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is not accepting connections"), + errdetail("Hot standby mode is disabled."))); + break; case CAC_SHUTDOWN: ereport(FATAL, (errcode(ERRCODE_CANNOT_CONNECT_NOW), diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h index d84a383047..a67813a15b 100644 --- a/src/include/storage/pmsignal.h +++ b/src/include/storage/pmsignal.h @@ -33,6 +33,8 @@ typedef enum { PMSIGNAL_RECOVERY_STARTED, /* recovery has started */ + PMSIGNAL_SNAPSHOT_PENDING, /* snapshot is pending because of an + * overflowed subtransaction */ PMSIGNAL_BEGIN_HOT_STANDBY, /* begin Hot Standby */ PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */ PMSIGNAL_START_AUTOVAC_LAUNCHER, /* start an autovacuum launcher */ diff --git a/src/include/tcop/backend_startup.h b/src/include/tcop/backend_startup.h index 7328561120..866a3b7cd2 100644 --- a/src/include/tcop/backend_startup.h +++ 
b/src/include/tcop/backend_startup.h @@ -30,6 +30,7 @@ typedef enum CAC_state CAC_SHUTDOWN, CAC_RECOVERY, CAC_NOTCONSISTENT, + CAC_SNAPSHOT_PENDING, CAC_TOOMANY, } CAC_state; -- 2.43.0