From 9528afdedbd1dccb5f6717b8411bbdbefc0fc6b3 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 8 Jul 2024 15:14:57 -0400 Subject: [PATCH v1] Do not summarize WAL if generated with wal_level=minimal. To do this, we must include the wal_level in the first WAL record covered by each summary file; so add wal_level to struct Checkpoint and the payload of XLOG_CHECKPOINT_REDO. This, in turn, requires bumping XLOG_PAGE_MAGIC and, since the Checkpoint is also stored in the control file, also PG_CONTROL_VERSION. It's not great to do that so late in the release cycle, but the alternative seems to ship v17 without robust protections against this scenario, which could result in corrupted incremental backups. Report by Fujii Masao. --- src/backend/access/rmgrdesc/xlogdesc.c | 42 ++++-- src/backend/access/transam/xlog.c | 7 +- src/backend/postmaster/walsummarizer.c | 178 +++++++++++++++++-------- src/include/access/xlog_internal.h | 2 +- src/include/catalog/pg_control.h | 3 +- 5 files changed, 158 insertions(+), 74 deletions(-) diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index e455400716..f06f5c2461 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -33,6 +33,27 @@ const struct config_enum_entry wal_level_options[] = { {NULL, 0, false} }; +/* + * Find a string representation for wal_level + */ +static const char * +get_wal_level_string(int wal_level) +{ + const struct config_enum_entry *entry; + const char *wal_level_str = "?"; + + for (entry = wal_level_options; entry->name; entry++) + { + if (entry->val == wal_level) + { + wal_level_str = entry->name; + break; + } + } + + return wal_level_str; +} + void xlog_desc(StringInfo buf, XLogReaderState *record) { @@ -45,7 +66,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) CheckPoint *checkpoint = (CheckPoint *) rec; appendStringInfo(buf, "redo %X/%X; " - "tli %u; prev tli %u; fpw %s; xid %u:%u; oid %u; multi %u; offset %u; " + "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; " "oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest/newest commit timestamp xid: %u/%u; " "oldest running xid %u; %s", @@ -53,6 +74,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) checkpoint->ThisTimeLineID, checkpoint->PrevTimeLineID, checkpoint->fullPageWrites ? "true" : "false", + get_wal_level_string(checkpoint->wal_level), EpochFromFullTransactionId(checkpoint->nextXid), XidFromFullTransactionId(checkpoint->nextXid), checkpoint->nextOid, @@ -95,20 +117,9 @@ xlog_desc(StringInfo buf, XLogReaderState *record) { xl_parameter_change xlrec; const char *wal_level_str; - const struct config_enum_entry *entry; memcpy(&xlrec, rec, sizeof(xl_parameter_change)); - - /* Find a string representation for wal_level */ - wal_level_str = "?"; - for (entry = wal_level_options; entry->name; entry++) - { - if (entry->val == xlrec.wal_level) - { - wal_level_str = entry->name; - break; - } - } + wal_level_str = get_wal_level_string(xlrec.wal_level); appendStringInfo(buf, "max_connections=%d max_worker_processes=%d " "max_wal_senders=%d max_prepared_xacts=%d " @@ -150,7 +161,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record) } else if (info == XLOG_CHECKPOINT_REDO) { - /* No details to write out */ + int wal_level; + + memcpy(&wal_level, rec, sizeof(int)); + appendStringInfo(buf, "wal_level %s", get_wal_level_string(wal_level)); } } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 33e27a6e72..fba015579e 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6934,6 +6934,7 @@ CreateCheckPoint(int flags) WALInsertLockAcquireExclusive(); checkPoint.fullPageWrites = Insert->fullPageWrites; + checkPoint.wal_level = wal_level; if (shutdown) { @@ -6987,11 +6988,9 @@ CreateCheckPoint(int flags) */ if (!shutdown) { - int dummy = 0; - - /* Record must have payload to avoid assertion failure. */ + /* Include WAL level in record for WAL summarizer's benefit. */ XLogBeginInsert(); - XLogRegisterData((char *) &dummy, sizeof(dummy)); + XLogRegisterData((char *) &wal_level, sizeof(wal_level)); (void) XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO); /* diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index 55dc2ea8f5..c6fbf5b55f 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -71,6 +71,12 @@ typedef struct * and so the LSN might point to the start of the next file even though * that might happen to be in the middle of a WAL record. * + * fast_forward is normally false, but is true when we have encountered + * WAL generated with wal_level=minimal and are skipping over it without + * emitting summary files. In this case, summarized_tli and summarized_lsn + * will advance even though nothing is being written to disk, until we + * again reach a point where wal_level>minimal. + * * summarizer_pgprocno is the proc number of the summarizer process, if * one is running, or else INVALID_PROC_NUMBER. * @@ -83,6 +89,7 @@ typedef struct TimeLineID summarized_tli; XLogRecPtr summarized_lsn; bool lsn_is_exact; + bool fast_forward; ProcNumber summarizer_pgprocno; XLogRecPtr pending_lsn; @@ -154,7 +161,8 @@ static void SummarizeSmgrRecord(XLogReaderState *xlogreader, BlockRefTable *brtab); static void SummarizeXactRecord(XLogReaderState *xlogreader, BlockRefTable *brtab); -static bool SummarizeXlogRecord(XLogReaderState *xlogreader); +static bool SummarizeXlogRecord(XLogReaderState *xlogreader, + bool *new_fast_forward); static int summarizer_read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, @@ -802,6 +810,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, char final_path[MAXPGPATH]; WalSummaryIO io; BlockRefTable *brtab = CreateEmptyBlockRefTable(); + bool fast_forward = true; /* Initialize private data for xlogreader. */ private_data = (SummarizerReadLocalXLogPrivate *) @@ -900,7 +909,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, int block_id; char *errormsg; XLogRecord *record; - bool stop_requested = false; + uint8 rmid; HandleWalSummarizerInterrupts(); @@ -969,56 +978,86 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, break; } - /* Special handling for particular types of WAL records. */ - switch (XLogRecGetRmid(xlogreader)) - { - case RM_DBASE_ID: - SummarizeDbaseRecord(xlogreader, brtab); - break; - case RM_SMGR_ID: - SummarizeSmgrRecord(xlogreader, brtab); - break; - case RM_XACT_ID: - SummarizeXactRecord(xlogreader, brtab); - break; - case RM_XLOG_ID: - stop_requested = SummarizeXlogRecord(xlogreader); - break; - default: - break; - } - /* - * If we've been told that it's time to end this WAL summary file, do - * so. As an exception, if there's nothing included in this WAL - * summary file yet, then stopping doesn't make any sense, and we - * should wait until the next stop point instead. + * Certain types of records require special handling. Redo points and + * shutdown checkpoints trigger creation of new summary files and can + * also cause us to enter or exit "fast forward" mode. Other types of + * records can require special updates to the block reference table. */ - if (stop_requested && xlogreader->ReadRecPtr > summary_start_lsn) + rmid = XLogRecGetRmid(xlogreader); + if (rmid == RM_XLOG_ID) { - summary_end_lsn = xlogreader->ReadRecPtr; - break; + bool new_fast_forward; + + /* + * If we've already processed some WAL records when we hit a redo + * point or shutdown checkpoint, then we stop summarization before + * including this record in the current file, so that it will be + * the first record in the next file. + * + * When we hit one of those record types as the first record in a + * file, we adjust our notion of whether we're fast-forwarding. + * Any WAL generated with wal_level=minimal must be skipped + * without actually generating any summary file, because an + * incremental backup that crosses such WAL would be unsafe. + */ + if (SummarizeXlogRecord(xlogreader, &new_fast_forward)) + { + if (xlogreader->ReadRecPtr > summary_start_lsn) + { + summary_end_lsn = xlogreader->ReadRecPtr; + break; + } + else + fast_forward = new_fast_forward; + } + } + else if (!fast_forward) + { + /* + * This switch handles record types that require extra updates to + * the contents of the block reference table. + */ + switch (rmid) + { + case RM_DBASE_ID: + SummarizeDbaseRecord(xlogreader, brtab); + break; + case RM_SMGR_ID: + SummarizeSmgrRecord(xlogreader, brtab); + break; + case RM_XACT_ID: + SummarizeXactRecord(xlogreader, brtab); + break; + } } - /* Feed block references from xlog record to block reference table. */ - for (block_id = 0; block_id <= XLogRecMaxBlockId(xlogreader); - block_id++) + /* + * If we're in fast-forward mode, we don't really need to do anything. + * Otherwise, feed block references from xlog record to block + * reference table. + */ + if (!fast_forward) { - RelFileLocator rlocator; - ForkNumber forknum; - BlockNumber blocknum; + for (block_id = 0; block_id <= XLogRecMaxBlockId(xlogreader); + block_id++) + { + RelFileLocator rlocator; + ForkNumber forknum; + BlockNumber blocknum; - if (!XLogRecGetBlockTagExtended(xlogreader, block_id, &rlocator, - &forknum, &blocknum, NULL)) - continue; + if (!XLogRecGetBlockTagExtended(xlogreader, block_id, &rlocator, + &forknum, &blocknum, NULL)) + continue; - /* - * As we do elsewhere, ignore the FSM fork, because it's not fully - * WAL-logged. - */ - if (forknum != FSM_FORKNUM) - BlockRefTableMarkBlockModified(brtab, &rlocator, forknum, - blocknum); + /* + * As we do elsewhere, ignore the FSM fork, because it's not + * fully WAL-logged. + */ + if (forknum != FSM_FORKNUM) + BlockRefTableMarkBlockModified(brtab, &rlocator, forknum, + blocknum); + } } /* Update our notion of where this summary file ends. */ @@ -1047,9 +1086,10 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, /* * If a timeline switch occurs, we may fail to make any progress at all * before exiting the loop above. If that happens, we don't write a WAL - * summary file at all. + * summary file at all. We can also skip writing a file if we're in + * fast-forward mode. */ - if (summary_end_lsn > summary_start_lsn) + if (summary_end_lsn > summary_start_lsn && !fast_forward) { /* Generate temporary and final path name. */ snprintf(temp_path, MAXPGPATH, @@ -1085,6 +1125,14 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, durable_rename(temp_path, final_path, ERROR); } + /* If we skipped a non-zero amount of WAL, log a debug message. */ + if (summary_end_lsn > summary_start_lsn && fast_forward) + ereport(DEBUG1, + errmsg("skipped summarizing WAL on TLI %u from %X/%X to %X/%X", + tli, + LSN_FORMAT_ARGS(summary_start_lsn), + LSN_FORMAT_ARGS(summary_end_lsn))); + return summary_end_lsn; } @@ -1263,22 +1311,44 @@ SummarizeXactRecord(XLogReaderState *xlogreader, BlockRefTable *brtab) /* * Special handling for WAL records with RM_XLOG_ID. + * + * The return value is true if WAL summarization should stop before this + * record and false otherwise. When the return value is true, + * *new_fast_forward indicates whether future processing should be done + * in fast forward mode (i.e. read WAL without emitting summaries) or not. */ static bool -SummarizeXlogRecord(XLogReaderState *xlogreader) +SummarizeXlogRecord(XLogReaderState *xlogreader, bool *new_fast_forward) { uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK; + int record_wal_level; - if (info == XLOG_CHECKPOINT_REDO || info == XLOG_CHECKPOINT_SHUTDOWN) + if (info == XLOG_CHECKPOINT_REDO) { - /* - * This is an LSN at which redo might begin, so we'd like - * summarization to stop just before this WAL record. - */ - return true; + /* Payload is wal_level at the time record was written. */ + memcpy(&record_wal_level, XLogRecGetData(xlogreader), sizeof(int)); } + else if (info == XLOG_CHECKPOINT_SHUTDOWN) + { + CheckPoint rec_ckpt; - return false; + /* Extract wal_level at time record was written from payload. */ + memcpy(&rec_ckpt, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + record_wal_level = rec_ckpt.wal_level; + } + else + { + /* No special handling required. Return false. */ + return false; + } + + /* + * We only get here if this is an LSN at which redo might begin, and if + * that is the case, then we would like WAL summarization to stop just + * before this WAL record. + */ + *new_fast_forward = (record_wal_level == WAL_LEVEL_MINIMAL); + return true; } /* diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 5161b72f28..f270065fd3 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -31,7 +31,7 @@ /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD115 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD116 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index a00606ffcd..e80ff8e414 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -22,7 +22,7 @@ /* Version identifier for this pg_control format */ -#define PG_CONTROL_VERSION 1300 +#define PG_CONTROL_VERSION 1700 /* Nonce key length, see below */ #define MOCK_AUTH_NONCE_LEN 32 @@ -40,6 +40,7 @@ typedef struct CheckPoint TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new * timeline (equals ThisTimeLineID otherwise) */ bool fullPageWrites; /* current full_page_writes */ + int wal_level; /* current wal_level */ FullTransactionId nextXid; /* next free transaction ID */ Oid nextOid; /* next free OID */ MultiXactId nextMulti; /* next free MultiXactId */ -- 2.39.3 (Apple Git-145)