From 5766e6506e569b0d91e22e90c5e6786ce9fefdf6 Mon Sep 17 00:00:00 2001 From: Craig Ringer Date: Mon, 23 Jan 2017 13:25:30 +0800 Subject: [PATCH 1/3] Fix race between clog truncation and lookup There was previously no way to look up an arbitrary xid without running the risk of having clog truncated out from under you. This hasn't previously been a problem because anything looking up xids in clog knows they're protected by datfrozenxid, but that's not the case for arbitrary user-supplied XIDs. clog is truncated before we advance oldestXid under XidGenLock, so holding XidGenLock during a clog lookup is insufficient to prevent the race. There's no way to look up a SLRU with soft-failure; attempting a lookup produces an I/O error. There's also no safe way to trap and swallow the SLRU lookup error due mainly to locking issues. To address this, introduce a copy of oldestXid, oldestClogXid, that is advanced before clog truncation under a new LWLock, CLogTruncationLock. Lookups of arbitrary XIDs must take and hold CLogTruncationLock to prevent concurrent advance of the minimum valid xid in clog. This race also exists in a worse form on standby servers. On a standby we only advance oldestXid when we replay the next checkpoint, so there's a much larger window between clog truncation and subsequent updating of the limit. Fix this by recording the oldest xid in clog truncation records and applying the update to oldestClogXid under CLogTruncationLock before replaying the clog truncation. No attempt is made to eagerly update oldestXid on the standby, so it may fall behind oldestClogXid until the next checkpoint. Note that there's no need to take CLogTruncationLock for normal clog lookups protected by datfrozenxid, only if accepting arbitrary XIDs that might not be protected by vacuum thresholds. 
--- doc/src/sgml/monitoring.sgml | 4 +++ src/backend/access/rmgrdesc/clogdesc.c | 12 +++++++-- src/backend/access/transam/clog.c | 46 +++++++++++++++++++++++++------- src/backend/access/transam/transam.c | 4 +-- src/backend/access/transam/varsup.c | 23 +++++++++++++++- src/backend/access/transam/xlog.c | 11 ++++++++ src/backend/commands/vacuum.c | 2 +- src/backend/storage/lmgr/lwlocknames.txt | 1 + src/include/access/clog.h | 8 +++++- src/include/access/transam.h | 7 +++++ 10 files changed, 101 insertions(+), 17 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index dcb2d33..1c84ce5 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1018,6 +1018,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser Waiting to read or update old snapshot control information. + CLogTruncationLock + Waiting to truncate the transaction log or waiting for transaction log truncation to finish. + + clog Waiting for I/O on a clog (transaction status) buffer. 
diff --git a/src/backend/access/rmgrdesc/clogdesc.c b/src/backend/access/rmgrdesc/clogdesc.c index 352de48..ef268c5 100644 --- a/src/backend/access/rmgrdesc/clogdesc.c +++ b/src/backend/access/rmgrdesc/clogdesc.c @@ -23,12 +23,20 @@ clog_desc(StringInfo buf, XLogReaderState *record) char *rec = XLogRecGetData(record); uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; - if (info == CLOG_ZEROPAGE || info == CLOG_TRUNCATE) + if (info == CLOG_ZEROPAGE) { int pageno; memcpy(&pageno, rec, sizeof(int)); - appendStringInfo(buf, "%d", pageno); + appendStringInfo(buf, "page %d", pageno); + } + else if (info == CLOG_TRUNCATE) + { + xl_clog_truncate xlrec; + + memcpy(&xlrec, rec, sizeof(xl_clog_truncate)); + appendStringInfo(buf, "page %d; oldestXact %u", + xlrec.pageno, xlrec.oldestXact); } } diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 5b1d13d..2d33510 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -83,7 +83,8 @@ static SlruCtlData ClogCtlData; static int ZeroCLOGPage(int pageno, bool writeXlog); static bool CLOGPagePrecedes(int page1, int page2); static void WriteZeroPageXlogRec(int pageno); -static void WriteTruncateXlogRec(int pageno); +static void WriteTruncateXlogRec(int pageno, TransactionId oldestXact, + Oid oldestXidDb); static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, TransactionId *subxids, XidStatus status, XLogRecPtr lsn, int pageno); @@ -640,7 +641,7 @@ ExtendCLOG(TransactionId newestXact) * the XLOG flush unless we have confirmed that there is a removable segment. 
*/ void -TruncateCLOG(TransactionId oldestXact) +TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid) { int cutoffPage; @@ -654,8 +655,26 @@ TruncateCLOG(TransactionId oldestXact) if (!SlruScanDirectory(ClogCtl, SlruScanDirCbReportPresence, &cutoffPage)) return; /* nothing to remove */ - /* Write XLOG record and flush XLOG to disk */ - WriteTruncateXlogRec(cutoffPage); + /* + * Advance oldestClogXid before truncating clog, so concurrent xact status + * lookups can ensure they don't attempt to access truncated-away clog. + * + * It's only necessary to do this if we will actually truncate away clog + * pages. + */ + AdvanceOldestClogXid(oldestXact); + + /* vac_truncate_clog already advanced oldestXid */ + Assert(TransactionIdPrecedesOrEquals(oldestXact, + ShmemVariableCache->oldestXid)); + + /* + * Write XLOG record and flush XLOG to disk. We record the oldest xid we're + * keeping information about here so we can ensure that it's always ahead + * of clog truncation in case we crash, and so a standby finds out the new + * valid xid before the next checkpoint. + */ + WriteTruncateXlogRec(cutoffPage, oldestXact, oldestxid_datoid); /* Now we can remove the old CLOG segment(s) */ SimpleLruTruncate(ClogCtl, cutoffPage); @@ -704,12 +723,17 @@ WriteZeroPageXlogRec(int pageno) * in TruncateCLOG(). 
*/ static void -WriteTruncateXlogRec(int pageno) +WriteTruncateXlogRec(int pageno, TransactionId oldestXact, Oid oldestXactDb) { XLogRecPtr recptr; + xl_clog_truncate xlrec; + + xlrec.pageno = pageno; + xlrec.oldestXact = oldestXact; + xlrec.oldestXactDb = oldestXactDb; XLogBeginInsert(); - XLogRegisterData((char *) (&pageno), sizeof(int)); + XLogRegisterData((char *) (&xlrec), sizeof(xl_clog_truncate)); recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE); XLogFlush(recptr); } @@ -742,17 +766,19 @@ clog_redo(XLogReaderState *record) } else if (info == CLOG_TRUNCATE) { - int pageno; + xl_clog_truncate xlrec; - memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_clog_truncate)); /* * During XLOG replay, latest_page_number isn't set up yet; insert a * suitable value to bypass the sanity test in SimpleLruTruncate. */ - ClogCtl->shared->latest_page_number = pageno; + ClogCtl->shared->latest_page_number = xlrec.pageno; - SimpleLruTruncate(ClogCtl, pageno); + AdvanceOldestClogXid(xlrec.oldestXact); + + SimpleLruTruncate(ClogCtl, xlrec.pageno); } else elog(PANIC, "clog_redo: unknown op code %u", info); diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c index b91a259..562b53b 100644 --- a/src/backend/access/transam/transam.c +++ b/src/backend/access/transam/transam.c @@ -119,7 +119,7 @@ TransactionLogFetch(TransactionId transactionId) * True iff transaction associated with the identifier did commit. * * Note: - * Assumes transaction identifier is valid. + * Assumes transaction identifier is valid and exists in clog. */ bool /* true if given transaction committed */ TransactionIdDidCommit(TransactionId transactionId) @@ -175,7 +175,7 @@ TransactionIdDidCommit(TransactionId transactionId) * True iff transaction associated with the identifier did abort. * * Note: - * Assumes transaction identifier is valid. + * Assumes transaction identifier is valid and exists in clog. 
*/ bool /* true if given transaction aborted */ TransactionIdDidAbort(TransactionId transactionId) diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 42fc351..5efbfbd 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -259,7 +259,28 @@ ReadNewTransactionId(void) } /* - * Determine the last safe XID to allocate given the currently oldest + * Advance the cluster-wide value for the oldest valid clog entry. + * + * We must acquire CLogTruncationLock to advance the oldestClogXid. It's not + * necessary to hold the lock during the actual clog truncation, only when we + * advance the limit, as code looking up arbitrary xids is required to hold + * CLogTruncationLock from when it tests oldestClogXid through to when it + * completes the clog lookup. + */ +void +AdvanceOldestClogXid(TransactionId oldest_datfrozenxid) +{ + LWLockAcquire(CLogTruncationLock, LW_EXCLUSIVE); + if (TransactionIdPrecedes(ShmemVariableCache->oldestClogXid, + oldest_datfrozenxid)) + { + ShmemVariableCache->oldestClogXid = oldest_datfrozenxid; + } + LWLockRelease(CLogTruncationLock); +} + +/* + * Determine the last safe XID to allocate using the currently oldest * datfrozenxid (ie, the oldest XID that might exist in any database * of our cluster), and the OID of the (or a) database with that value. 
*/ diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 9480377..fbdff55 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5016,6 +5016,7 @@ BootStrapXLOG(void) ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); + AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId); @@ -6622,6 +6623,7 @@ StartupXLOG(void) ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); + AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); SetCommitTsLimit(checkPoint.oldestCommitTsXid, @@ -8687,6 +8689,11 @@ CreateCheckPoint(int flags) /* * Get the other info we need for the checkpoint record. + * + * We don't need to save oldestClogXid in the checkpoint, it only matters + * for the short period in which clog is being truncated, and if we crash + * during that we'll redo the clog truncation and fix up oldestClogXid + * there. */ LWLockAcquire(XidGenLock, LW_SHARED); checkPoint.nextXid = ShmemVariableCache->nextXid; @@ -9616,6 +9623,10 @@ xlog_redo(XLogReaderState *record) MultiXactAdvanceOldest(checkPoint.oldestMulti, checkPoint.oldestMultiDB); + /* + * No need to set oldestClogXid here as well; it'll be set when we + * redo an xl_clog_truncate if it changed since initialization. 
+ */ SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); /* diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index ff633fa..c4a0f89 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1194,7 +1194,7 @@ vac_truncate_clog(TransactionId frozenXID, /* * Truncate CLOG, multixact and CommitTs to the oldest computed value. */ - TruncateCLOG(frozenXID); + TruncateCLOG(frozenXID, oldestxid_datoid); TruncateCommitTs(frozenXID); TruncateMultiXact(minMulti, minmulti_datoid); diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index cd8b08f..e6025ec 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -49,3 +49,4 @@ MultiXactTruncationLock 41 OldSnapshotTimeMapLock 42 BackendRandomLock 43 LogicalRepWorkerLock 44 +CLogTruncationLock 45 diff --git a/src/include/access/clog.h b/src/include/access/clog.h index 2894bd5..60a9e11 100644 --- a/src/include/access/clog.h +++ b/src/include/access/clog.h @@ -28,6 +28,12 @@ typedef int XidStatus; #define TRANSACTION_STATUS_ABORTED 0x02 #define TRANSACTION_STATUS_SUB_COMMITTED 0x03 +typedef struct xl_clog_truncate +{ + int pageno; + TransactionId oldestXact; + Oid oldestXactDb; +} xl_clog_truncate; extern void TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, TransactionId *subxids, XidStatus status, XLogRecPtr lsn); @@ -42,7 +48,7 @@ extern void TrimCLOG(void); extern void ShutdownCLOG(void); extern void CheckPointCLOG(void); extern void ExtendCLOG(TransactionId newestXact); -extern void TruncateCLOG(TransactionId oldestXact); +extern void TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid); /* XLOG stuff */ #define CLOG_ZEROPAGE 0x00 diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 522c104..d25a2dd 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -134,6 +134,12 @@ typedef struct 
VariableCacheData */ TransactionId latestCompletedXid; /* newest XID that has committed or * aborted */ + + /* + * These fields are protected by CLogTruncationLock + */ + TransactionId oldestClogXid; /* oldest it's safe to look up in clog */ + } VariableCacheData; typedef VariableCacheData *VariableCache; @@ -173,6 +179,7 @@ extern TransactionId GetNewTransactionId(bool isSubXact); extern TransactionId ReadNewTransactionId(void); extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid); +extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid); extern bool ForceTransactionIdLimitUpdate(void); extern Oid GetNewObjectId(void); -- 2.5.5