From cf5de7abe443acc311f4880499be520f00a7bf5e Mon Sep 17 00:00:00 2001 From: Daniel Gustafsson Date: Thu, 23 Apr 2026 10:44:22 +0200 Subject: [PATCH 4/8] Fix invalid checksum state transition in checkpoints Commit 78e950cb8 added checksum state handling to all XLOG_CHECKPOINT records which caused unnecessary state transitions and emission of procsignal barriers. Remove as only the _REDO record need to handle checksum state. Barrier emission is also consistently made after controlfile updates to avoid race conditions. Additionelly, interrupts are held between calling ProcSignalInit and InitLocalDataChecksumState to remove a window where otherwise invalid state transitions can happen. Also remove a pointless assertion on Controlfile which will never hit. Author: Tomas Vondra Author: Daniel Gustafsson Discussion: https://postgr.es/m/xxx --- src/backend/access/transam/xlog.c | 106 ++++++-------------- src/backend/postmaster/auxprocess.c | 10 ++ src/backend/postmaster/datachecksum_state.c | 8 +- src/backend/utils/init/postinit.c | 10 ++ 4 files changed, 55 insertions(+), 79 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f74d7a2ab1a..9b1a10dbd1c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4723,8 +4723,6 @@ SetDataChecksumsOnInProgress(void) { uint64 barrier; - Assert(ControlFile != NULL); - /* * The state transition is performed in a critical section with * checkpoints held off to provide crash safety. @@ -4738,25 +4736,16 @@ SetDataChecksumsOnInProgress(void) XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON; SpinLockRelease(&XLogCtl->info_lck); - barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON); - - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; - END_CRIT_SECTION(); - - /* - * Update the controlfile before waiting since if we have an immediate - * shutdown while waiting we want to come back up with checksums enabled. - */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON; UpdateControlFile(); LWLockRelease(ControlFileLock); - /* - * Await state change in all backends to ensure that all backends are in - * "inprogress-on". Once done we know that all backends are writing data - * checksums. - */ + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON); + + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + END_CRIT_SECTION(); + WaitForProcSignalBarrier(barrier); } @@ -4787,8 +4776,6 @@ SetDataChecksumsOn(void) { uint64 barrier; - Assert(ControlFile != NULL); - SpinLockAcquire(&XLogCtl->info_lck); /* @@ -4818,11 +4805,6 @@ SetDataChecksumsOn(void) XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_VERSION; SpinLockRelease(&XLogCtl->info_lck); - barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON); - - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; - END_CRIT_SECTION(); - /* * Update the controlfile before waiting since if we have an immediate * shutdown while waiting we want to come back up with checksums enabled. @@ -4832,12 +4814,12 @@ SetDataChecksumsOn(void) UpdateControlFile(); LWLockRelease(ControlFileLock); - RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST); + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON); - /* - * Await state transition to "on" in all backends. When done we know that - * data checksums are both written and verified in all backends. - */ + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + END_CRIT_SECTION(); + + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST); WaitForProcSignalBarrier(barrier); } @@ -4859,8 +4841,6 @@ SetDataChecksumsOff(void) { uint64 barrier; - Assert(ControlFile != NULL); - SpinLockAcquire(&XLogCtl->info_lck); /* If data checksums are already disabled there is nothing to do */ @@ -4891,22 +4871,17 @@ SetDataChecksumsOff(void) XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF; SpinLockRelease(&XLogCtl->info_lck); + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF); MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; END_CRIT_SECTION(); - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - ControlFile->data_checksum_version = PG_DATA_CHECKSUM_OFF; - UpdateControlFile(); - LWLockRelease(ControlFileLock); - RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST); - - /* - * Update local state in all backends to ensure that any backend in - * "on" state is changed to "inprogress-off". - */ WaitForProcSignalBarrier(barrier); /* @@ -4935,18 +4910,17 @@ SetDataChecksumsOff(void) XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_OFF; SpinLockRelease(&XLogCtl->info_lck); - barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF); - - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; - END_CRIT_SECTION(); - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->data_checksum_version = PG_DATA_CHECKSUM_OFF; UpdateControlFile(); LWLockRelease(ControlFileLock); - RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST); + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF); + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + END_CRIT_SECTION(); + + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST); WaitForProcSignalBarrier(barrier); } @@ -4961,6 +4935,7 @@ SetDataChecksumsOff(void) void InitLocalDataChecksumState(void) { + Assert(InterruptHoldoffCount > 0); SpinLockAcquire(&XLogCtl->info_lck); SetLocalDataChecksumState(XLogCtl->data_checksum_version); SpinLockRelease(&XLogCtl->info_lck); @@ -5427,7 +5402,6 @@ XLOGShmemInit(void *arg) /* Use the checksum info from control file */ XLogCtl->data_checksum_version = ControlFile->data_checksum_version; - SetLocalDataChecksumState(XLogCtl->data_checksum_version); SpinLockInit(&XLogCtl->Insert.insertpos_lck); @@ -7518,7 +7492,9 @@ CreateCheckPoint(int flags) * Get the current data_checksum_version value from xlogctl, valid at the * time of the checkpoint. */ + SpinLockAcquire(&XLogCtl->info_lck); checkPoint.dataChecksumState = XLogCtl->data_checksum_version; + SpinLockRelease(&XLogCtl->info_lck); if (shutdown) { @@ -7639,10 +7615,6 @@ CreateCheckPoint(int flags) checkPoint.nextOid += TransamVariables->oidCount; LWLockRelease(OidGenLock); - SpinLockAcquire(&XLogCtl->info_lck); - checkPoint.dataChecksumState = XLogCtl->data_checksum_version; - SpinLockRelease(&XLogCtl->info_lck); - checkPoint.logicalDecodingEnabled = IsLogicalDecodingEnabled(); MultiXactGetCheckptMulti(shutdown, @@ -7792,9 +7764,6 @@ CreateCheckPoint(int flags) ControlFile->minRecoveryPoint = InvalidXLogRecPtr; ControlFile->minRecoveryPointTLI = 0; - /* make sure we start with the checksum version as of the checkpoint */ - ControlFile->data_checksum_version = checkPoint.dataChecksumState; - /* * Persist unloggedLSN value. It's reset on crash recovery, so this goes * unused on non-shutdown checkpoints, but seems useful to store it always @@ -8871,11 +8840,6 @@ xlog_redo(XLogReaderState *record) MultiXactAdvanceOldest(checkPoint.oldestMulti, checkPoint.oldestMultiDB); - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->data_checksum_version = checkPoint.dataChecksumState; - SetLocalDataChecksumState(checkPoint.dataChecksumState); - SpinLockRelease(&XLogCtl->info_lck); - /* * No need to set oldestClogXid here as well; it'll be set when we * redo an xl_clog_truncate if it changed since initialization. @@ -8936,6 +8900,8 @@ xlog_redo(XLogReaderState *record) LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; ControlFile->data_checksum_version = checkPoint.dataChecksumState; + + UpdateControlFile(); LWLockRelease(ControlFileLock); /* @@ -8962,8 +8928,6 @@ xlog_redo(XLogReaderState *record) { CheckPoint checkPoint; TimeLineID replayTLI; - bool new_state = false; - int old_state; memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); /* In an ONLINE checkpoint, treat the XID counter as a minimum */ @@ -9002,8 +8966,6 @@ xlog_redo(XLogReaderState *record) /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; - old_state = ControlFile->data_checksum_version; - ControlFile->data_checksum_version = checkPoint.dataChecksumState; LWLockRelease(ControlFileLock); /* TLI should not change in an on-line checkpoint */ @@ -9015,18 +8977,6 @@ xlog_redo(XLogReaderState *record) RecoveryRestartPoint(&checkPoint, record); - /* - * If the data checksum state change we need to emit a barrier. - */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->data_checksum_version = checkPoint.dataChecksumState; - if (checkPoint.dataChecksumState != old_state) - new_state = true; - SpinLockRelease(&XLogCtl->info_lck); - - if (new_state) - EmitAndWaitDataChecksumsBarrier(checkPoint.dataChecksumState); - /* * After replaying a checkpoint record, free all smgr objects. * Otherwise we would never do so for dropped relations, as the @@ -9195,6 +9145,7 @@ xlog_redo(XLogReaderState *record) SpinLockAcquire(&XLogCtl->info_lck); XLogCtl->data_checksum_version = redo_rec.data_checksum_version; + SetLocalDataChecksumState(redo_rec.data_checksum_version); if (redo_rec.data_checksum_version != ControlFile->data_checksum_version) new_state = true; SpinLockRelease(&XLogCtl->info_lck); @@ -9268,6 +9219,11 @@ xlog2_redo(XLogReaderState *record) XLogCtl->data_checksum_version = state.new_checksum_state; SpinLockRelease(&XLogCtl->info_lck); + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = state.new_checksum_state; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + /* * Block on a procsignalbarrier to await all processes having seen the * change to checksum status. Once the barrier has been passed we can diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c index 8fdc518b3a1..421502fa1cb 100644 --- a/src/backend/postmaster/auxprocess.c +++ b/src/backend/postmaster/auxprocess.c @@ -68,6 +68,14 @@ AuxiliaryProcessMainCommon(void) BaseInit(); + /* + * Prevent consuming interrups between setting ProcSignalInit and setting + * the initial local data checksum value. If a barrier is emitted, and + * absorbed, before local cached state is initialized the state transition + * can be invalid. + */ + HOLD_INTERRUPTS(); + ProcSignalInit(NULL, 0); /* @@ -88,6 +96,8 @@ AuxiliaryProcessMainCommon(void) */ InitLocalDataChecksumState(); + RESUME_INTERRUPTS(); + /* * Auxiliary processes don't run transactions, but they may need a * resource owner anyway to manage buffer pins acquired outside diff --git a/src/backend/postmaster/datachecksum_state.c b/src/backend/postmaster/datachecksum_state.c index e62a36b0af5..6d66322c625 100644 --- a/src/backend/postmaster/datachecksum_state.c +++ b/src/backend/postmaster/datachecksum_state.c @@ -263,8 +263,8 @@ static const ChecksumBarrierCondition checksum_barriers[7] = {PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_VERSION}, /* - * If checksums are being enabled when launcher_exit is executed, state - * is set to off since we cannot reach on at that point. + * If checksums are being enabled when launcher_exit is executed, state is + * set to off since we cannot reach on at that point. */ {PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_INPROGRESS_OFF}, }; @@ -1287,8 +1287,8 @@ DataChecksumsShmemRequest(void *arg) /* * DatabaseExists * - * Scans the system catalog to check if a database with the given Oid exist - * and returns true if it is found, else false. + * Scans the system catalog to check if a database with the given Oid exists + * and returns true if it is found else false. */ static bool DatabaseExists(Oid dboid) diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 6f074013aa9..b2c80b0bd1e 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -756,6 +756,14 @@ InitPostgres(const char *in_dbname, Oid dboid, */ SharedInvalBackendInit(false); + /* + * Prevent consuming interrups between setting ProcSignalInit and setting + * the initial local data checksum value. If a barrier is emitted, and + * absorbed, before local cached state is initialized the state transition + * can be invalid. + */ + HOLD_INTERRUPTS(); + ProcSignalInit(MyCancelKey, MyCancelKeyLength); /* @@ -776,6 +784,8 @@ InitPostgres(const char *in_dbname, Oid dboid, */ InitLocalDataChecksumState(); + RESUME_INTERRUPTS(); + /* * Also set up timeout handlers needed for backend operation. We need * these in every case except bootstrap. -- 2.39.3 (Apple Git-146)