From cf6da26344efaa31d06c9da1a89255ea29c23390 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Wed, 17 Feb 2021 00:29:36 +0000 Subject: [PATCH v1 1/2] Avoid creating archive status ".ready" files too early. WAL records may span multiple segments, but XLogWrite() does not wait for the entire record to be written out to disk before creating archive status files. Instead, as soon as the last WAL page of the segment is written, the archive status file will be created. If PostgreSQL crashes before it is able to write the rest of the record, it will end up reusing segments that have already been marked as ready-for-archival. However, the archiver process may have already processed the old version of the segment, so the wrong version of the segment may be backed-up. This backed-up segment will cause operations such as point-in-time restores to fail. To fix this, we keep track of records that span across segments and ensure that segments are only marked ready-for-archival once such records have been completely written to disk. --- src/backend/access/transam/xlog.c | 272 ++++++++++++++++++++++++++++++- src/backend/storage/lmgr/lwlocknames.txt | 1 + src/include/access/xlogdefs.h | 3 + 3 files changed, 269 insertions(+), 7 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index e0c37f73f3..950840d584 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -530,6 +530,13 @@ typedef enum ExclusiveBackupState */ static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE; +/* entries for RecordBoundaryMap, used to mark segments ready for archival */ +typedef struct RecordBoundaryEntry +{ + XLogSegNo seg; /* must be first */ + XLogRecPtr pos; +} RecordBoundaryEntry; + /* * Shared state data for WAL insertion. */ @@ -730,6 +737,12 @@ typedef struct XLogCtlData */ XLogRecPtr lastFpwDisableRecPtr; + /* + * The last segment we've marked ready for archival. Protected by info_lck. 
+ * This value should only be updated while holding ArchNotifyLock. + */ + XLogSegNo lastNotifiedSeg; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -743,6 +756,12 @@ static WALInsertLockPadded *WALInsertLocks = NULL; */ static ControlFileData *ControlFile = NULL; +/* + * Record boundary map, used for marking segments as ready for archival. + * Protected by ArchNotifyLock. + */ +static HTAB *RecordBoundaryMap = NULL; + /* * Calculate the amount of space left on the page after 'endptr'. Beware * multiple evaluation! @@ -972,6 +991,12 @@ static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos); static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos); static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr); static void checkXLogConsistency(XLogReaderState *record); +static void RegisterRecordBoundaryEntry(XLogSegNo seg, XLogRecPtr pos); +static void NotifySegmentsReadyForArchive(void); +static XLogSegNo GetLastNotifiedSegment(void); +static void SetLastNotifiedSegment(XLogSegNo seg); +static XLogSegNo GetLatestRecordBoundarySegment(void); +static void RemoveRecordBoundariesUpTo(XLogSegNo seg); static void WALInsertLockAcquire(void); static void WALInsertLockAcquireExclusive(void); @@ -1019,6 +1044,8 @@ XLogInsertRecord(XLogRecData *rdata, info == XLOG_SWITCH); XLogRecPtr StartPos; XLogRecPtr EndPos; + XLogSegNo StartSeg; + XLogSegNo EndSeg; bool prevDoPageWrites = doPageWrites; /* we assume that all of the record header is in the first chunk */ @@ -1177,6 +1204,31 @@ XLogInsertRecord(XLogRecData *rdata, SpinLockRelease(&XLogCtl->info_lck); } + /* + * Record the record boundary if we crossed the segment boundary. This is + * used to ensure that segments are not marked ready for archival before the + * entire record has been flushed to disk. + * + * Note that we do not use XLByteToPrevSeg() for determining the ending + * segment. 
This is done so that a record that fits perfectly into the end + * of the segment is marked ready for archival as soon as the flushed + * pointer jumps to the next segment. + */ + XLByteToSeg(StartPos, StartSeg, wal_segment_size); + XLByteToSeg(EndPos, EndSeg, wal_segment_size); + + if (StartSeg != EndSeg && + XLogArchivingActive()) + { + RegisterRecordBoundaryEntry(EndSeg, EndPos); + + /* + * There's a chance that the record was already flushed to disk and we + * missed marking segments as ready for archive, so try to do that now. + */ + NotifySegmentsReadyForArchive(); + } + /* * If this was an XLOG_SWITCH record, flush the record and the empty * padding space that fills the rest of the segment, and perform @@ -1275,6 +1327,192 @@ XLogInsertRecord(XLogRecData *rdata, return EndPos; } +/* + * RegisterRecordBoundaryEntry + * + * This enters a new entry into the record boundary map, which is used for + * determining when it is safe to mark a segment as ready for archival. An entry + * with the given key (the segment number) must not already exist in the map. + * Also, the caller is responsible for ensuring that XLByteToSeg() would return + * the same segment number for the given record pointer. + */ +static void +RegisterRecordBoundaryEntry(XLogSegNo seg, XLogRecPtr pos) +{ + RecordBoundaryEntry *entry; + bool found; + + LWLockAcquire(ArchNotifyLock, LW_EXCLUSIVE); + + entry = (RecordBoundaryEntry *) hash_search(RecordBoundaryMap, + (void *) &seg, HASH_ENTER, + &found); + if (found) + elog(ERROR, "record boundary entry for segment already exists"); + + entry->pos = pos; + + LWLockRelease(ArchNotifyLock); +} + +/* + * NotifySegmentsReadyForArchive + * + * This function marks segments as ready for archival, given that it is safe to + * do so. It is safe to call this function repeatedly, even if nothing has + * changed since the last time it was called.
+ */ +static void +NotifySegmentsReadyForArchive(void) +{ + XLogRecPtr flushed; + XLogSegNo flushed_seg; + XLogSegNo latest_boundary_seg; + XLogSegNo last_notified; + + /* + * We first do a quick sanity check to see if we can bail out without taking + * the ArchNotifyLock at all. It is expected that this function will run + * frequently and that it will need to do nothing the vast majority of the + * time. + * + * Specifically, we bail out if the shared memory value for the last + * notified segment has not yet been initialized or if we've already marked + * the segment prior to the segment that contains "flushed" as ready for + * archival. We intentionally use XLByteToSeg() instead of + * XLByteToPrevSeg() so that we don't skip notifying when a record fits + * perfectly into the end of a segment. ("flushed" should point to the + * first byte of the record _after_ the one that is known to be flushed to + * disk.) + */ + last_notified = GetLastNotifiedSegment(); + if (XLogSegNoIsInvalid(last_notified)) + return; + + flushed = GetFlushRecPtr(); + XLByteToSeg(flushed, flushed_seg, wal_segment_size); + if (last_notified >= flushed_seg - 1) + return; + + /* + * At this point, we must acquire ArchNotifyLock before proceeding. In this + * section, we look for the latest record boundary in RecordBoundaryMap that + * is less than or equal to the current "flushed" pointer, and we notify the + * archiver that all segments up to (but not including) that boundary's + * associated segment are ready for archival. 
*/ + LWLockAcquire(ArchNotifyLock, LW_EXCLUSIVE); + + latest_boundary_seg = GetLatestRecordBoundarySegment(); + if (!XLogSegNoIsInvalid(latest_boundary_seg)) + { + XLogSegNo i; + + /* create the archive status files */ + for (i = GetLastNotifiedSegment() + 1; i < latest_boundary_seg; i++) + XLogArchiveNotifySeg(i); + + /* update shared memory */ + SetLastNotifiedSegment(latest_boundary_seg - 1); + + /* remove old boundaries from the map */ + RemoveRecordBoundariesUpTo(latest_boundary_seg); + } + + LWLockRelease(ArchNotifyLock); +} + +/* + * GetLatestRecordBoundarySegment + * + * This function finds the latest record boundary in RecordBoundaryMap that is + * less than or equal to the current "flushed" pointer and returns its + * associated segment number, given that it is greater than the last notified + * segment. Otherwise, InvalidXLogSegNo is returned. + * + * Caller is expected to be holding ArchNotifyLock. + */ +static XLogSegNo +GetLatestRecordBoundarySegment(void) +{ + XLogRecPtr flushed; + XLogSegNo flushed_seg; + XLogSegNo last_notified; + + flushed = GetFlushRecPtr(); + XLByteToSeg(flushed, flushed_seg, wal_segment_size); + last_notified = GetLastNotifiedSegment(); + + for (XLogSegNo i = flushed_seg; i > last_notified; i--) + { + RecordBoundaryEntry *entry; + + entry = (RecordBoundaryEntry *) hash_search(RecordBoundaryMap, + (void *) &i, HASH_FIND, + NULL); + + if (entry != NULL && flushed >= entry->pos) + return entry->seg; + } + + return InvalidXLogSegNo; } + +/* + * RemoveRecordBoundariesUpTo + * + * This function removes all entries in the RecordBoundaryMap with segment + * numbers up to and including seg. + * + * Caller is expected to be holding ArchNotifyLock.
+ */ +static void +RemoveRecordBoundariesUpTo(XLogSegNo seg) +{ + RecordBoundaryEntry *entry; + HASH_SEQ_STATUS status; + + hash_seq_init(&status, RecordBoundaryMap); + + while ((entry = (RecordBoundaryEntry *) hash_seq_search(&status)) != NULL) + { + if (entry->seg <= seg) + (void) hash_search(RecordBoundaryMap, (void *) &entry->seg, + HASH_REMOVE, NULL); + } +} + +/* + * GetLastNotifiedSegment + * + * Retrieves last notified segment from shared memory. + */ +XLogSegNo +GetLastNotifiedSegment(void) +{ + XLogSegNo seg; + + SpinLockAcquire(&XLogCtl->info_lck); + seg = XLogCtl->lastNotifiedSeg; + SpinLockRelease(&XLogCtl->info_lck); + + return seg; +} + +/* + * SetLastNotifiedSegment + * + * Sets last notified segment in shared memory. Callers should hold + * ArchNotifyLock exclusively when calling this function. + */ +static void +SetLastNotifiedSegment(XLogSegNo seg) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->lastNotifiedSeg = seg; + SpinLockRelease(&XLogCtl->info_lck); +} + /* * Reserves the right amount of space for a record of given size from the WAL. * *StartPos is set to the beginning of the reserved section, *EndPos to @@ -2579,11 +2817,11 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) * later. Doing it here ensures that one and only one backend will * perform this fsync. * - * This is also the right place to notify the Archiver that the - * segment is ready to copy to archival storage, and to update the - * timer for archive_timeout, and to signal for a checkpoint if - * too many logfile segments have been used since the last - * checkpoint. + * This is also the right place to update the timer for + * archive_timeout and to signal for a checkpoint if too many + * logfile segments have been used since the last checkpoint. If + * lastNotifiedSeg hasn't been initialized yet, we need to do that, + * too. 
*/ if (finishing_seg) { @@ -2594,8 +2832,13 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) LogwrtResult.Flush = LogwrtResult.Write; /* end of page */ - if (XLogArchivingActive()) - XLogArchiveNotifySeg(openLogSegNo); + if (XLogArchivingActive() && + XLogSegNoIsInvalid(GetLastNotifiedSegment())) + { + LWLockAcquire(ArchNotifyLock, LW_EXCLUSIVE); + SetLastNotifiedSegment(openLogSegNo - 1); + LWLockRelease(ArchNotifyLock); + } XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL); XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush; @@ -2683,6 +2926,9 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush; SpinLockRelease(&XLogCtl->info_lck); } + + if (XLogArchivingActive()) + NotifySegmentsReadyForArchive(); } /* @@ -5096,6 +5342,9 @@ XLOGShmemSize(void) /* and the buffers themselves */ size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers)); + /* stuff for marking segments as ready for archival */ + size = add_size(size, hash_estimate_size(16, sizeof(RecordBoundaryEntry))); + /* * Note: we don't count ControlFileData, it comes out of the "slop factor" * added by CreateSharedMemoryAndSemaphores. This lets us use this @@ -5113,6 +5362,7 @@ XLOGShmemInit(void) char *allocptr; int i; ControlFileData *localControlFile; + HASHCTL info; #ifdef WAL_DEBUG @@ -5210,6 +5460,14 @@ XLOGShmemInit(void) SpinLockInit(&XLogCtl->info_lck); SpinLockInit(&XLogCtl->ulsn_lck); InitSharedLatch(&XLogCtl->recoveryWakeupLatch); + + /* Initialize stuff for marking segments as ready for archival. 
*/ + XLogCtl->lastNotifiedSeg = InvalidXLogSegNo; + memset(&info, 0, sizeof(info)); + info.keysize = sizeof(XLogSegNo); + info.entrysize = sizeof(RecordBoundaryEntry); + RecordBoundaryMap = ShmemInitHash("Record Boundary Table", 16, 16, &info, + HASH_ELEM | HASH_BLOBS); } /* diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 6c7cf6c295..d39225bf94 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +ArchNotifyLock 48 diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h index 65836d5bc6..d24c0c9e19 100644 --- a/src/include/access/xlogdefs.h +++ b/src/include/access/xlogdefs.h @@ -40,6 +40,9 @@ typedef uint64 XLogRecPtr; */ typedef uint64 XLogSegNo; +#define InvalidXLogSegNo 0xFFFFFFFFFFFFFFFF +#define XLogSegNoIsInvalid(s) ((s) == InvalidXLogSegNo) + /* * TimeLineID (TLI) - identifies different database histories to prevent * confusion after restoring a prior state of a database installation. -- 2.16.6