Re: Corruption during WAL replay - Mailing list pgsql-hackers
From | Kyotaro Horiguchi |
---|---|
Subject | Re: Corruption during WAL replay |
Date | |
Msg-id | 20210106.173327.1444585955309078930.horikyota.ntt@gmail.com Whole thread Raw |
In response to | Re: Corruption during WAL replay (Andres Freund <andres@anarazel.de>) |
List | pgsql-hackers |
At Mon, 17 Aug 2020 11:22:15 -0700, Andres Freund <andres@anarazel.de> wrote in > Hi, > > On 2020-08-17 14:05:37 +0300, Heikki Linnakangas wrote: > > On 14/04/2020 22:04, Teja Mupparti wrote: > > > Thanks Kyotaro and Masahiko for the feedback. I think there is a > > > consensus on the critical-section around truncate, > > > > +1 > > I'm inclined to think that we should do that independent of the far more > complicated fix for other related issues. ... > > Perhaps a better approach would be to prevent the checkpoint from > > completing, until all in-progress truncations have completed. We have a > > mechanism to wait out in-progress commits at the beginning of a checkpoint, > > right after the redo point has been established. See comments around the > > GetVirtualXIDsDelayingChkpt() function call in CreateCheckPoint(). We could > > have a similar mechanism to wait out the truncations before *completing* a > > checkpoint. > > What I outlined earlier *is* essentially a way to do so, by preventing > checkpointing from finishing the buffer scan while a dangerous state > exists. Seems reasonable. The attached patch does that. It actually works for the initially reported case. Regards. -- Kyotaro Horiguchi NTT Open Source Software Center diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 1233448481..e1d3068f14 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -3062,8 +3062,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) * crash/basebackup, even though the state of the data directory would * require it. 
*/ - Assert(!MyProc->delayChkpt); - MyProc->delayChkpt = true; + Assert(MyProc->delayChkpt == DELAY_CHKPT_NONE); + MyProc->delayChkpt = DELAY_CHKPT_START; /* WAL log truncation */ WriteMTruncateXlogRec(newOldestMultiDB, @@ -3089,7 +3089,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) /* Then offsets */ PerformOffsetsTruncation(oldestMulti, newOldestMulti); - MyProc->delayChkpt = false; + MyProc->delayChkpt = DELAY_CHKPT_NONE; END_CRIT_SECTION(); LWLockRelease(MultiXactTruncationLock); diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index fc18b77832..1b74a229d6 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -463,7 +463,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, proc->lxid = (LocalTransactionId) xid; proc->xid = xid; Assert(proc->xmin == InvalidTransactionId); - proc->delayChkpt = false; + proc->delayChkpt = DELAY_CHKPT_NONE; proc->statusFlags = 0; proc->pid = 0; proc->backendId = InvalidBackendId; @@ -1108,7 +1108,8 @@ EndPrepare(GlobalTransaction gxact) START_CRIT_SECTION(); - MyProc->delayChkpt = true; + Assert(MyProc->delayChkpt == DELAY_CHKPT_NONE); + MyProc->delayChkpt = DELAY_CHKPT_START; XLogBeginInsert(); for (record = records.head; record != NULL; record = record->next) @@ -1151,7 +1152,7 @@ EndPrepare(GlobalTransaction gxact) * checkpoint starting after this will certainly see the gxact as a * candidate for fsyncing. */ - MyProc->delayChkpt = false; + MyProc->delayChkpt = DELAY_CHKPT_NONE; /* * Remember that we have this GlobalTransaction entry locked for us. If @@ -2199,7 +2200,8 @@ RecordTransactionCommitPrepared(TransactionId xid, START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ - MyProc->delayChkpt = true; + Assert(MyProc->delayChkpt == DELAY_CHKPT_NONE); + MyProc->delayChkpt = DELAY_CHKPT_START; /* * Emit the XLOG commit record. 
Note that we mark 2PC commits as @@ -2247,7 +2249,7 @@ RecordTransactionCommitPrepared(TransactionId xid, TransactionIdCommitTree(xid, nchildren, children); /* Checkpoint can proceed now */ - MyProc->delayChkpt = false; + MyProc->delayChkpt = DELAY_CHKPT_NONE; END_CRIT_SECTION(); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index a2068e3fd4..000206e506 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1334,8 +1334,9 @@ RecordTransactionCommit(void) * This makes checkpoint's determination of which xacts are delayChkpt * a bit fuzzy, but it doesn't matter. */ + Assert(MyProc->delayChkpt == DELAY_CHKPT_NONE); START_CRIT_SECTION(); - MyProc->delayChkpt = true; + MyProc->delayChkpt = DELAY_CHKPT_START; SetCurrentTransactionStopTimestamp(); @@ -1436,7 +1437,7 @@ RecordTransactionCommit(void) */ if (markXidCommitted) { - MyProc->delayChkpt = false; + MyProc->delayChkpt = DELAY_CHKPT_NONE; END_CRIT_SECTION(); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index ede93ad7fd..5b0a653408 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9024,18 +9024,30 @@ CreateCheckPoint(int flags) * and we will correctly flush the update below. So we cannot miss any * xacts we need to wait for. 
*/ - vxids = GetVirtualXIDsDelayingChkpt(&nvxids); + vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START); if (nvxids > 0) { do { pg_usleep(10000L); /* wait for 10 msec */ - } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids)); + } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids, + DELAY_CHKPT_START)); } pfree(vxids); CheckPointGuts(checkPoint.redo, flags); + vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE); + if (nvxids > 0) + { + do + { + pg_usleep(10000L); /* wait for 10 msec */ + } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids, + DELAY_CHKPT_COMPLETE)); + } + pfree(vxids); + /* * Take a snapshot of running transactions and write this to WAL. This * allows us to reconstruct the state of running transactions during diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 7052dc245e..51f2581c06 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -923,7 +923,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) /* * Ensure no checkpoint can change our view of RedoRecPtr. */ - Assert(MyProc->delayChkpt); + Assert(MyProc->delayChkpt == DELAY_CHKPT_START); /* * Update RedoRecPtr so that we can make the right decision diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index cba7a9ada0..b75ea97b04 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -325,6 +325,16 @@ RelationTruncate(Relation rel, BlockNumber nblocks) RelationPreTruncate(rel); + /* + * If the file truncation fails but the concurrent checkpoint completes + * just before that, the next crash recovery can fail due to WAL records + * inconsistent with the untruncated pages. To avoid that situation we + * delay the checkpoint completion until we confirm the truncation to be + * successful. 
+ */ + Assert(MyProc->delayChkpt == DELAY_CHKPT_NONE); + MyProc->delayChkpt = DELAY_CHKPT_COMPLETE; + /* * We WAL-log the truncation before actually truncating, which means * trouble if the truncation fails. If we then crash, the WAL replay @@ -373,6 +383,8 @@ RelationTruncate(Relation rel, BlockNumber nblocks) */ if (need_fsm_vacuum) FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber); + + MyProc->delayChkpt = DELAY_CHKPT_NONE; } /* diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 8f2c482bc8..52d70019c4 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3615,7 +3615,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) { XLogRecPtr lsn = InvalidXLogRecPtr; bool dirtied = false; - bool delayChkpt = false; + int delayChkpt = DELAY_CHKPT_NONE; uint32 buf_state; /* @@ -3665,7 +3665,8 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) * essential that CreateCheckpoint waits for virtual transactions * rather than full transactionids. 
*/ - MyProc->delayChkpt = delayChkpt = true; + Assert(MyProc->delayChkpt == DELAY_CHKPT_NONE); + MyProc->delayChkpt = delayChkpt = DELAY_CHKPT_START; lsn = XLogSaveBufferForHint(buffer, buffer_std); } @@ -3698,7 +3699,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) UnlockBufHdr(bufHdr, buf_state); if (delayChkpt) - MyProc->delayChkpt = false; + MyProc->delayChkpt = DELAY_CHKPT_NONE; if (dirtied) { diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index cf12eda504..20757274ec 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -655,7 +655,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) proc->lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; - proc->delayChkpt = false; /* be sure this is cleared in abort */ + + /* be sure this is cleared in abort */ + proc->delayChkpt = DELAY_CHKPT_NONE; + proc->recoveryConflictPending = false; /* must be cleared with xid/xmin: */ @@ -694,7 +697,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) proc->xid = InvalidTransactionId; proc->lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; - proc->delayChkpt = false; /* be sure this is cleared in abort */ + + /* be sure this is cleared in abort */ + proc->delayChkpt = DELAY_CHKPT_NONE; + proc->recoveryConflictPending = false; /* must be cleared with xid/xmin: */ @@ -2929,7 +2935,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) * delaying checkpoint because they have critical actions in progress. * * Constructs an array of VXIDs of transactions that are currently in commit - * critical sections, as shown by having delayChkpt set in their PGPROC. + * critical sections, as shown by having delayChkpt set to the specified value + * in their PGPROC. * * Returns a palloc'd array that should be freed by the caller. * *nvxids is the number of valid entries. 
@@ -2943,13 +2950,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) * for clearing of delayChkpt to propagate is unimportant for correctness. */ VirtualTransactionId * -GetVirtualXIDsDelayingChkpt(int *nvxids) +GetVirtualXIDsDelayingChkpt(int *nvxids, DelayChkptType type) { VirtualTransactionId *vxids; ProcArrayStruct *arrayP = procArray; int count = 0; int index; + Assert(type != DELAY_CHKPT_NONE); + /* allocate what's certainly enough result space */ vxids = (VirtualTransactionId *) palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs); @@ -2961,7 +2970,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids) int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - if (proc->delayChkpt) + if (proc->delayChkpt == type) { VirtualTransactionId vxid; @@ -2987,12 +2996,15 @@ GetVirtualXIDsDelayingChkpt(int *nvxids) * those numbers should be small enough for it not to be a problem. */ bool -HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids) +HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, + DelayChkptType type) { bool result = false; ProcArrayStruct *arrayP = procArray; int index; + Assert(type != DELAY_CHKPT_NONE); + LWLockAcquire(ProcArrayLock, LW_SHARED); for (index = 0; index < arrayP->numProcs; index++) @@ -3003,7 +3015,7 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids) GET_VXID_FROM_PGPROC(vxid, *proc); - if (proc->delayChkpt && VirtualTransactionIdIsValid(vxid)) + if (proc->delayChkpt == type && VirtualTransactionIdIsValid(vxid)) { int i; diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 9b6aa2fe0d..b7f9310afe 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -392,7 +392,7 @@ InitProcess(void) MyProc->roleId = InvalidOid; MyProc->tempNamespaceId = InvalidOid; MyProc->isBackgroundWorker = IsBackgroundWorker; - MyProc->delayChkpt = false; + MyProc->delayChkpt = DELAY_CHKPT_NONE; MyProc->statusFlags = 0; /* NB 
-- autovac launcher intentionally does not set IS_AUTOVACUUM */ if (IsAutoVacuumWorkerProcess()) @@ -573,7 +573,7 @@ InitAuxiliaryProcess(void) MyProc->roleId = InvalidOid; MyProc->tempNamespaceId = InvalidOid; MyProc->isBackgroundWorker = IsBackgroundWorker; - MyProc->delayChkpt = false; + MyProc->delayChkpt = DELAY_CHKPT_NONE; MyProc->statusFlags = 0; MyProc->lwWaiting = false; MyProc->lwWaitMode = 0; diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 989c5849d4..ca764a1a72 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -78,6 +78,14 @@ struct XidCache */ #define INVALID_PGPROCNO PG_INT32_MAX +/* type for PGPROC.delayChkpt */ +typedef enum DelayChkptType +{ + DELAY_CHKPT_NONE = 0, + DELAY_CHKPT_START, + DELAY_CHKPT_COMPLETE +} DelayChkptType; + typedef enum { PROC_WAIT_STATUS_OK, @@ -181,7 +189,8 @@ struct PGPROC LOCKMASK heldLocks; /* bitmask for lock types already held on this * lock object by this backend */ - bool delayChkpt; /* true if this proc delays checkpoint start */ + DelayChkptType delayChkpt; /* if this proc delays checkpoint start and/or + * completion. */ uint8 statusFlags; /* this backend's status flags, see PROC_* * above. 
mirrored in diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index b01fa52139..e560cada2b 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -15,11 +15,11 @@ #define PROCARRAY_H #include "storage/lock.h" +#include "storage/proc.h" #include "storage/standby.h" #include "utils/relcache.h" #include "utils/snapshot.h" - extern Size ProcArrayShmemSize(void); extern void CreateSharedProcArray(void); extern void ProcArrayAdd(PGPROC *proc); @@ -59,8 +59,10 @@ extern TransactionId GetOldestActiveTransactionId(void); extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly); extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin); -extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids); -extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids); +extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, + DelayChkptType type); +extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, + int nvxids, DelayChkptType type); extern PGPROC *BackendPidGetProc(int pid); extern PGPROC *BackendPidGetProcWithLock(int pid);
pgsql-hackers by date: