diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index adab2f8..f9cfe16 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2476,6 +2476,26 @@ include_dir 'conf.d' + + wal_consistency (boolean) + + wal_consistency configuration parameter + + + + + This parameter is used to check the consistency of WAL records, i.e., + whether the WAL records are inserted and applied correctly. When + wal_consistency is enabled for a WAL record, it + stores a full-page image of each page modified along with the WAL + record, inducing an increase of WAL generation. Then, when a + full-page image arrives during redo, it is compared against the current + page to check whether both are consistent or not. The default value + is off. Only superusers can change this setting. + + + + wal_buffers (integer) diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c index 5a6b728..a19cfbb 100644 --- a/src/backend/access/brin/brin_xlog.c +++ b/src/backend/access/brin/brin_xlog.c @@ -14,6 +14,7 @@ #include "access/brin_pageops.h" #include "access/brin_xlog.h" #include "access/xlogutils.h" +#include "storage/bufmask.h" /* @@ -279,3 +280,38 @@ brin_redo(XLogReaderState *record) elog(PANIC, "brin_redo: unknown op code %u", info); } } + +/* + * Mask a BRIN page before doing consistency checks. + */ +void +brin_mask(char *page) +{ + Page page_norm = (Page) page; + OffsetNumber offnum, + maxoff; + + mask_page_lsn(page_norm); + + mask_page_hint_bits(page_norm); + + if (BRIN_IS_REGULAR_PAGE(page_norm)) + { + mask_unused_space(page_norm); + + maxoff = PageGetMaxOffsetNumber(page_norm); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page_norm, offnum); + + if (ItemIdIsUsed(itemId)) + itemId->lp_flags = LP_UNUSED; + } + } + + /* + * If necessary, handle the case of meta and revmap pages here.
+ */ +} diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index a40f168..90b6386 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -15,6 +15,7 @@ #include "access/gin_private.h" #include "access/xlogutils.h" +#include "storage/bufmask.h" #include "utils/memutils.h" static MemoryContext opCtx; /* working memory for operations */ @@ -758,3 +759,28 @@ gin_xlog_cleanup(void) MemoryContextDelete(opCtx); opCtx = NULL; } + +void +gin_mask(char *page) +{ + Page page_norm = (Page) page; + GinPageOpaque opaque; + + mask_page_lsn(page_norm); + opaque = GinPageGetOpaque(page_norm); + + /* GIN metapage doesn't use pd_lower/pd_upper. Other page types do. */ + if (opaque->flags != GIN_META) + { + mask_page_hint_bits(page_norm); + + /* + * For GIN_DELETED page, the page is initialized to empty. + * Hence mask everything. + */ + if (opaque->flags & GIN_DELETED) + memset(page_norm, MASK_MARKER, BLCKSZ); + else + mask_unused_space(page_norm); + } +} diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 5853d76..d0573f0 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -16,6 +16,7 @@ #include "access/gist_private.h" #include "access/xloginsert.h" #include "access/xlogutils.h" +#include "storage/bufmask.h" #include "utils/memutils.h" static MemoryContext opCtx; /* working memory for operations */ @@ -343,6 +344,55 @@ gist_xlog_cleanup(void) } /* + * Mask a GiST page before running consistency checks on it. + */ +void +gist_mask(char *page) +{ + Page page_norm = (Page) page; + OffsetNumber offnum, + maxoff; + + mask_page_lsn(page_norm); + + mask_page_hint_bits(page_norm); + mask_unused_space(page_norm); + + /* Mask NSN */ + /* XXX: Rework that */ + GistPageSetNSN(page_norm, 0xFFFFFFFFFFFFFFFF); + + /* + * We update F_FOLLOW_RIGHT flag on the left child after writing WAL record. + * Hence, mask this flag.
+ */ + GistMarkFollowRight(page_norm); + + if (GistPageIsLeaf(page_norm)) + { + /* + * For gist leaf pages, + * Mask some line pointer bits, particularly those marked as + * used on a master and unused on a standby. + * XXX: This could be refined. + */ + maxoff = PageGetMaxOffsetNumber(page_norm); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page_norm, offnum); + + if (ItemIdIsUsed(itemId)) + itemId->lp_flags = LP_UNUSED; + } + } + + /* In Gist redo, we never mark a page as garbage. Hence, Mask It.*/ + GistClearPageHasGarbage(page_norm); +} + +/* * Write WAL record of a page split. */ XLogRecPtr diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index e3b1eef..aa4705a 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -25,6 +25,7 @@ #include "commands/vacuum.h" #include "miscadmin.h" #include "optimizer/plancat.h" +#include "storage/bufmask.h" #include "utils/index_selfuncs.h" #include "utils/rel.h" @@ -711,3 +712,53 @@ hash_redo(XLogReaderState *record) { elog(PANIC, "hash_redo: unimplemented"); } + +/* + * Mask a hash page before performing consistency checks on it. + */ +void +hash_mask(char *page) +{ + Page page_norm = (Page) page; + OffsetNumber off; + OffsetNumber maxoff; + HashPageOpaque opaque; + + mask_page_lsn(page_norm); + + mask_page_hint_bits(page_norm); + mask_unused_space(page_norm); + + opaque = (HashPageOpaque) PageGetSpecialPointer(page_norm); + + /* + * Mask everything on a UNUSED page. 
+ */ + if (opaque->hasho_flag & LH_UNUSED_PAGE) + { + /* Page content, between standard page header and opaque struct */ + memset(page_norm + SizeOfPageHeaderData, MASK_MARKER, + BLCKSZ - MAXALIGN(sizeof(HashPageOpaqueData)) - SizeOfPageHeaderData); + + /* pd_lower and upper */ + memset(&((PageHeader) page_norm)->pd_lower, MASK_MARKER, sizeof(uint16)); + memset(&((PageHeader) page_norm)->pd_upper, MASK_MARKER, sizeof(uint16)); + } + else if ((opaque->hasho_flag & LH_META_PAGE)== 0) + { + /* + * For pages other than metapage, + * Mask some line pointer bits, particularly those marked as + * used on a master and unused on a standby. + * XXX: This could be refined. + */ + maxoff = PageGetMaxOffsetNumber(page_norm); + for (off = 1; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page_norm, off); + + if (ItemIdIsUsed(iid)) + iid->lp_flags = LP_UNUSED; + } + } +} diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index b019bc1..52b157a 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -57,6 +57,7 @@ #include "catalog/namespace.h" #include "miscadmin.h" #include "pgstat.h" +#include "storage/bufmask.h" #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/lmgr.h" @@ -9131,3 +9132,65 @@ heap_sync(Relation rel) heap_close(toastrel, AccessShareLock); } } + +/* + * Mask a heap page before performing consistency checks on it. + */ +void +heap_mask(char *page) +{ + Page page_norm = (Page) page; + OffsetNumber off; + + /* + * Mask the Page LSN. Because, we store the page before updating the LSN. + * Hence, LSNs of both pages will always be different. + */ + mask_page_lsn(page_norm); + + mask_page_hint_bits(page_norm); + mask_unused_space(page_norm); + + for (off = 1; off <= PageGetMaxOffsetNumber(page_norm); off++) + { + ItemId iid = PageGetItemId(page, off); + char *page_item; + + page_item = (char *) (page_norm + ItemIdGetOffset(iid)); + + /* + * Ignore hint bits and command ID. 
+ */ + if (ItemIdIsNormal(iid)) + { + HeapTupleHeader page_htup = (HeapTupleHeader) page_item; + + page_htup->t_infomask |= + HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | + HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID; + page_htup->t_infomask |= HEAP_XACT_MASK; + page_htup->t_choice.t_heap.t_field3.t_cid = 0xFFFFFFFF; + + /* + * For a speculative tuple, the content of t_ctid is conflicting + * between the backup page and current page. Hence, we set it + * to block number 0 and current offset. + */ + if (HeapTupleHeaderIsSpeculative(page_htup)) + ItemPointerSet(&page_htup->t_ctid, 0, off); + } + + /* + * Ignore any padding bytes after the tuple, when the length of + * the item is not MAXALIGNed. + */ + if (ItemIdHasStorage(iid)) + { + int len = ItemIdGetLength(iid); + int padlen = MAXALIGN(len) - len; + + if (padlen > 0) + memset(page_item + len, MASK_MARKER, padlen); + } + } +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index c536e22..8dc6234 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -19,6 +19,7 @@ #include "access/transam.h" #include "access/xlog.h" #include "access/xlogutils.h" +#include "storage/bufmask.h" #include "storage/procarray.h" #include "miscadmin.h" @@ -1028,3 +1029,59 @@ btree_redo(XLogReaderState *record) elog(PANIC, "btree_redo: unknown op code %u", info); } } + +/* + * Mask a btree page before performing consistency checks on it. + */ +void +btree_mask(char *page) +{ + Page page_norm = (Page) page; + OffsetNumber off; + OffsetNumber maxoff; + BTPageOpaque maskopaq; + + /* + * Mask the Page LSN. Because, we store the page before updating the LSN. + * Hence, LSNs of both pages will always be different. + */ + mask_page_lsn(page_norm); + + mask_page_hint_bits(page_norm); + mask_unused_space(page_norm); + + maskopaq = (BTPageOpaque) + (((char *) page_norm) + ((PageHeader) page_norm)->pd_special); + /* + * Mask everything on a DELETED page.
+ */ + if (((BTPageOpaque) PageGetSpecialPointer(page_norm))->btpo_flags & BTP_DELETED) + { + /* Page content after the standard page header; NB: the length runs to the end of the block, so the opaque struct is covered too */ + memset(page_norm + SizeOfPageHeaderData, MASK_MARKER, + BLCKSZ - SizeOfPageHeaderData); + + /* pd_lower and pd_upper */ + memset(&((PageHeader) page_norm)->pd_lower, MASK_MARKER, sizeof(uint16)); + memset(&((PageHeader) page_norm)->pd_upper, MASK_MARKER, sizeof(uint16)); + } + else + { + /* + * Mask some line pointer bits, particularly those marked as + * used on a master and unused on a standby. + * XXX: This could be refined. + */ + maxoff = PageGetMaxOffsetNumber(page_norm); + for (off = 1; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page_norm, off); + + if (ItemIdIsUsed(iid)) + iid->lp_flags = LP_UNUSED; + } + } + + maskopaq->btpo_flags |= BTP_SPLIT_END | BTP_HAS_GARBAGE; + maskopaq->btpo_cycleid = 0; +} diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c index e016cdb..4ec8688 100644 --- a/src/backend/access/spgist/spgxlog.c +++ b/src/backend/access/spgist/spgxlog.c @@ -18,6 +18,7 @@ #include "access/transam.h" #include "access/xlog.h" #include "access/xlogutils.h" +#include "storage/bufmask.h" #include "storage/standby.h" #include "utils/memutils.h" @@ -1023,3 +1024,23 @@ spg_xlog_cleanup(void) MemoryContextDelete(opCtx); opCtx = NULL; } + +/* + * Mask an SP-GiST page before performing consistency checks on it. + */ +void +spg_mask(char *page) +{ + Page page_norm = (Page) page; + + /* + * Mask the page LSN, because we store the page before updating the LSN. + * Hence, the LSNs of both pages will always be different.
+ */ + mask_page_lsn(page_norm); + + mask_page_hint_bits(page_norm); + + if (!SpGistPageIsMeta(page_norm)) + mask_unused_space(page_norm); +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 9bb1362..eae7524 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -30,8 +30,8 @@ #include "utils/relmapper.h" /* must be kept in sync with RmgrData definition in xlog_internal.h */ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \ - { name, redo, desc, identify, startup, cleanup }, +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \ + { name, redo, desc, identify, startup, cleanup, mask }, const RmgrData RmgrTable[RM_MAX_ID + 1] = { #include "access/rmgrlist.h" diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index c9bb46b..0ce4e5c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -95,6 +95,7 @@ bool EnableHotStandby = false; bool fullPageWrites = true; bool wal_log_hints = false; bool wal_compression = false; +bool wal_consistency = false; bool log_checkpoints = false; int sync_method = DEFAULT_SYNC_METHOD; int wal_level = WAL_LEVEL_MINIMAL; @@ -867,6 +868,7 @@ static char *GetXLogBuffer(XLogRecPtr ptr); static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos); static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos); static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr); +static void checkConsistency(XLogReaderState *record); static void WALInsertLockAcquire(void); static void WALInsertLockAcquireExclusive(void); @@ -903,8 +905,9 @@ XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) pg_crc32c rdata_crc; bool inserted; XLogRecord *rechdr = (XLogRecord *) rdata->data; + uint8 info = rechdr->xl_info & ~XLR_INFO_MASK; bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID && - rechdr->xl_info == XLOG_SWITCH); + info == XLOG_SWITCH); XLogRecPtr StartPos; XLogRecPtr EndPos; @@ -1261,6 +1264,91 @@ 
ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr) } /* + * Checks whether the current buffer page and backup page stored in the + * WAL record are consistent or not. Before comparing the two pages, a + * masking is applied to the pages to ignore certain areas like hint bits, + * unused space between pd_lower and pd_upper among other things. This + * function should be called once WAL replay has been completed for a + * given record. + */ +static void +checkConsistency(XLogReaderState *record) +{ + RmgrId rmid = XLogRecGetRmid(record); + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + int block_id; + + /* records with no backup blocks have no need for consistency checks */ + if (!XLogRecHasAnyBlockRefs(record)) + return; + + /* + * Leave if no masking function is defined; this is possible for + * resource managers generating just full page writes, where comparing + * an image to itself has no meaning. + */ + if (!RmgrTable[rmid].rm_mask) + return; + + Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0); + + for (block_id = 0; block_id <= record->max_block_id; block_id++) + { + Buffer buf; + Page new_page; + char norm_new_page[BLCKSZ]; + char norm_old_page[BLCKSZ]; + + if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) + { + /* Caller specified a bogus block_id. Do nothing. */ + continue; + } + + /* + * Read the contents from the current buffer and store it in a + * temporary page. + */ + buf = XLogReadBufferExtended(rnode, forknum, blkno, + RBM_NORMAL); + if (!BufferIsValid(buf)) + continue; + + new_page = BufferGetPage(buf); + + /* + * Read the contents from the backup copy, stored in WAL record + * and store it in a temporary page. There is no need to allocate + * a new page here, a local buffer is fine to hold its contents and + * a mask can be directly applied on it.
+ */ + if (!RestoreBlockImage(record, block_id, norm_old_page)) + elog(ERROR, "failed to restore block image"); + + /* + * Take a copy of the new page where WAL has been applied to have + * a comparison base before masking it... + */ + memcpy(norm_new_page, new_page, BLCKSZ); + + /* ... And mask both the new and old pages */ + RmgrTable[rmid].rm_mask(norm_new_page); + RmgrTable[rmid].rm_mask(norm_old_page); + + /* Time to compare the old and new contents */ + if (memcmp(norm_new_page, norm_old_page, BLCKSZ) != 0) + elog(FATAL, + "Inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, blkno); + + ReleaseBuffer(buf); + } +} + +/* * Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved * area in the WAL. */ @@ -6948,6 +7036,15 @@ StartupXLOG(void) /* Now apply the WAL record itself */ RmgrTable[record->xl_rmid].rm_redo(xlogreader); + /* + * After redo, check whether the backup pages associated with + * the WAL record are consistent with the existing pages. This + * check is done only if consistency check is enabled for this + * record. 
+ */ + if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0) + checkConsistency(xlogreader); + /* Pop the error context stack */ error_context_stack = errcallback.previous; @@ -7785,6 +7882,7 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int whichChkpt, bool report) { XLogRecord *record; + uint8 info; if (!XRecOffIsValid(RecPtr)) { @@ -7852,8 +7950,10 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, } return NULL; } - if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN && - record->xl_info != XLOG_CHECKPOINT_ONLINE) + /* Dereference record only after the NULL check; drop the XLR_INFO_MASK bits */ + info = record->xl_info & ~XLR_INFO_MASK; + if (info != XLOG_CHECKPOINT_SHUTDOWN && + info != XLOG_CHECKPOINT_ONLINE) { switch (whichChkpt) { diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 3cd273b..db5f37f 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -414,10 +414,12 @@ XLogInsert(RmgrId rmid, uint8 info) elog(ERROR, "XLogBeginInsert was not called"); /* - * The caller can set rmgr bits and XLR_SPECIAL_REL_UPDATE; the rest are - * reserved for use by me. + * The caller can set rmgr bits, XLR_SPECIAL_REL_UPDATE and + * XLR_CHECK_CONSISTENCY; the rest are reserved for use by me.
*/ - if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE)) != 0) + if ((info & ~(XLR_RMGR_INFO_MASK | + XLR_SPECIAL_REL_UPDATE | + XLR_CHECK_CONSISTENCY)) != 0) elog(PANIC, "invalid xlog info mask %02X", info); TRACE_POSTGRESQL_XLOG_INSERT(rmid, info); @@ -513,6 +515,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecordBlockCompressHeader cbimg = {0}; bool samerel; bool is_compressed = false; + bool include_image; /* Whether backup image should be included in WAL record */ if (!regbuf->in_use) continue; @@ -556,7 +559,14 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT) bkpb.fork_flags |= BKPBLOCK_WILL_INIT; - if (needs_backup) + /* + * Include a full-page image if a backup is needed or if a consistency + * check is wanted (wal_consistency, or XLR_CHECK_CONSISTENCY in info). + */ + include_image = needs_backup || wal_consistency || + (info & XLR_CHECK_CONSISTENCY) != 0; + + if (include_image) { Page page = regbuf->page; uint16 compressed_len; @@ -680,7 +690,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, /* Ok, copy the header to the scratch buffer */ memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader); scratch += SizeOfXLogRecordBlockHeader; - if (needs_backup) + if (include_image) { memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader); scratch += SizeOfXLogRecordBlockImageHeader; @@ -756,6 +766,13 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, rechdr->xl_prev = InvalidXLogRecPtr; rechdr->xl_crc = rdata_crc; + /* + * Enforce consistency checks for this record if user is looking for + * it.
+ */ + if (wal_consistency) + rechdr->xl_info |= XLR_CHECK_CONSISTENCY; + return &hdr_rdt; } diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index f2da505..56d4c66 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -462,7 +462,8 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg) /* * Special processing if it's an XLOG SWITCH record */ - if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH) + if (record->xl_rmid == RM_XLOG_ID && + (record->xl_info & ~XLR_INFO_MASK) == XLOG_SWITCH) { /* Pretend it extends to end of segment */ state->EndRecPtr += XLogSegSize - 1; diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index fc3a8ee..1eaf79f 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -31,6 +31,7 @@ #include "funcapi.h" #include "miscadmin.h" #include "nodes/makefuncs.h" +#include "storage/bufmask.h" #include "storage/lmgr.h" #include "storage/proc.h" #include "storage/smgr.h" @@ -1646,3 +1647,14 @@ ResetSequenceCaches(void) last_used_seq = NULL; } + +/* + * Mask a Sequence page before performing consistency checks on it. + */ +void +seq_mask(char *page) +{ + mask_page_lsn(page); + + mask_unused_space(page); +} diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile index 2c10fba..8630dca 100644 --- a/src/backend/storage/buffer/Makefile +++ b/src/backend/storage/buffer/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/storage/buffer top_builddir = ../../../.. 
include $(top_builddir)/src/Makefile.global -OBJS = buf_table.o buf_init.o bufmgr.o freelist.o localbuf.o +OBJS = buf_table.o buf_init.o bufmask.o bufmgr.o freelist.o localbuf.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/buffer/bufmask.c b/src/backend/storage/buffer/bufmask.c new file mode 100644 index 0000000..f30c477 --- /dev/null +++ b/src/backend/storage/buffer/bufmask.c @@ -0,0 +1,78 @@ +/*------------------------------------------------------------------------- + * + * bufmask.c + * Routines for buffer masking, used to ensure that buffers used for + * comparison across nodes are in a consistent state. + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * Contains common routines required for masking a page. + * + * IDENTIFICATION + * src/backend/storage/buffer/bufmask.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "storage/bufmask.h" + +/* + * Mask Page LSN + */ +void +mask_page_lsn(Page page) +{ + PageHeader phdr = (PageHeader) page; + + PageXLogRecPtrSet(phdr->pd_lsn, 0xFFFFFFFFFFFFFFFF); +} + +/* + * Mask hint bits in PageHeader + */ +void +mask_page_hint_bits(Page page) +{ + PageHeader phdr = (PageHeader) page; + + /* Ignore prune_xid (it's like a hint-bit) */ + phdr->pd_prune_xid = 0xFFFFFFFF; + + /* Ignore PD_PAGE_FULL and PD_HAS_FREE_LINES flags, they are just hints */ + phdr->pd_flags |= PD_PAGE_FULL | PD_HAS_FREE_LINES; + + /* + * Also mask the all-visible flag. + * + * XXX: It is unfortunate that we have to do this. If the flag is set + * incorrectly, that's serious, and we would like to catch it. If the flag + * is cleared incorrectly, that's serious too. But redo of HEAP_CLEAN + * records doesn't currently set the flag, even though it is set on the + * master, so we must silence the failures that this causes.
+ */ + phdr->pd_flags |= PD_ALL_VISIBLE; +} + +/* + * Mask the unused space of a page between pd_lower and pd_upper. + */ +void +mask_unused_space(Page page) +{ + int pd_lower = ((PageHeader) page)->pd_lower; + int pd_upper = ((PageHeader) page)->pd_upper; + int pd_special = ((PageHeader) page)->pd_special; + + /* Sanity check the header offsets before using them as memset bounds */ + if (pd_lower > pd_upper || pd_special < pd_upper || + pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ) + { + elog(ERROR, "invalid page pd_lower %d pd_upper %d pd_special %d", + pd_lower, pd_upper, pd_special); + } + + memset(page + pd_lower, MASK_MARKER, pd_upper - pd_lower); +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 65660c1..d0416ae 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -28,9 +28,11 @@ #include "access/commit_ts.h" #include "access/gin.h" +#include "access/rmgr.h" #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" +#include "access/xlog_internal.h" #include "catalog/namespace.h" #include "commands/async.h" #include "commands/prepare.h" @@ -1028,6 +1030,16 @@ static struct config_bool ConfigureNamesBool[] = }, { + {"wal_consistency", PGC_SUSET, WAL_SETTINGS, + gettext_noop("Sets consistency of WAL records with existing pages at replay."), + NULL + }, + &wal_consistency, + false, + NULL, NULL, NULL + }, + + { {"log_checkpoints", PGC_SIGHUP, LOGGING_WHAT, gettext_noop("Logs each checkpoint."), NULL diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 7c2daa5..4a98d87 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -191,6 +191,7 @@ # open_sync #full_page_writes = on # recover from partial page writes #wal_compression = off # enable compression of full-page writes +#wal_consistency = off # enables consistency checks at WAL replay #wal_log_hints = off # also do full page writes of non-critical
updates # (change requires restart) #wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 23ac4e7..a170d01 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -29,7 +29,7 @@ * RmgrNames is an array of resource manager names, to make error messages * a bit nicer. */ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \ name, static const char *RmgrNames[RM_MAX_ID + 1] = { diff --git a/src/bin/pg_xlogdump/rmgrdesc.c b/src/bin/pg_xlogdump/rmgrdesc.c index 8fe20ce..f962e79 100644 --- a/src/bin/pg_xlogdump/rmgrdesc.c +++ b/src/bin/pg_xlogdump/rmgrdesc.c @@ -32,7 +32,7 @@ #include "storage/standbydefs.h" #include "utils/relmapper.h" -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,maskPage) \ { name, desc, identify}, const RmgrDescData RmgrDescTable[RM_MAX_ID + 1] = { diff --git a/src/include/access/brin_xlog.h b/src/include/access/brin_xlog.h index f614805..4ad9ab3 100644 --- a/src/include/access/brin_xlog.h +++ b/src/include/access/brin_xlog.h @@ -128,5 +128,6 @@ typedef struct xl_brin_revmap_extend extern void brin_redo(XLogReaderState *record); extern void brin_desc(StringInfo buf, XLogReaderState *record); extern const char *brin_identify(uint8 info); +extern void brin_mask(char *page); #endif /* BRIN_XLOG_H */ diff --git a/src/include/access/gin.h b/src/include/access/gin.h index e5b2e10..6360ba1 100644 --- a/src/include/access/gin.h +++ b/src/include/access/gin.h @@ -79,5 +79,6 @@ extern void gin_desc(StringInfo buf, XLogReaderState *record); extern const char *gin_identify(uint8 info); extern void gin_xlog_startup(void); extern void gin_xlog_cleanup(void); +extern void gin_mask(char *page); #endif /* GIN_H */ diff --git a/src/include/access/gist_private.h 
b/src/include/access/gist_private.h index 78e87a6..ccf22a6 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -460,6 +460,7 @@ extern void gist_desc(StringInfo buf, XLogReaderState *record); extern const char *gist_identify(uint8 info); extern void gist_xlog_startup(void); extern void gist_xlog_cleanup(void); +extern void gist_mask(char *page); extern XLogRecPtr gistXLogUpdate(Buffer buffer, OffsetNumber *todelete, int ntodelete, diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h index 5f941a9..3259c71 100644 --- a/src/include/access/hash_xlog.h +++ b/src/include/access/hash_xlog.h @@ -21,5 +21,6 @@ extern void hash_redo(XLogReaderState *record); extern void hash_desc(StringInfo buf, XLogReaderState *record); extern const char *hash_identify(uint8 info); +extern void hash_mask(char *page); #endif /* HASH_XLOG_H */ diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 06a8242..a519dc5 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -373,6 +373,7 @@ extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, extern void heap_redo(XLogReaderState *record); extern void heap_desc(StringInfo buf, XLogReaderState *record); extern const char *heap_identify(uint8 info); +extern void heap_mask(char *page); extern void heap2_redo(XLogReaderState *record); extern void heap2_desc(StringInfo buf, XLogReaderState *record); extern const char *heap2_identify(uint8 info); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index c580f51..53f23d3 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -775,5 +775,6 @@ extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2); extern void btree_redo(XLogReaderState *record); extern void btree_desc(StringInfo buf, XLogReaderState *record); extern const char *btree_identify(uint8 info); +extern void btree_mask(char *page); #endif /* NBTREE_H */ 
diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h index ff7fe62..64b92ff 100644 --- a/src/include/access/rmgr.h +++ b/src/include/access/rmgr.h @@ -19,7 +19,7 @@ typedef uint8 RmgrId; * Note: RM_MAX_ID must fit in RmgrId; widening that type will affect the XLOG * file format. */ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \ symname, typedef enum RmgrIds diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index a7a0ae2..5509cab 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -25,25 +25,25 @@ */ /* symbol name, textual name, redo, desc, identify, startup, cleanup */ -PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL) -PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL) -PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL) -PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL) -PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL) -PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL) -PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL) -PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL) -PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL) -PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL) -PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL) -PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL) -PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL) -PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup) -PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, 
gist_identify, gist_xlog_startup, gist_xlog_cleanup) -PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL) -PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup) -PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL) -PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL) -PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL) -PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL) -PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL) +PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL) +PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL) +PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL) +PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL) +PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL) +PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL) +PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL) +PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL) +PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL) +PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask) +PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask) +PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask) +PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask) +PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, 
gin_xlog_startup, gin_xlog_cleanup, gin_mask) +PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask) +PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask) +PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask) +PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask) +PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL) +PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL) +PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, NULL) +PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL) diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h index a953a5a..822b094 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -220,5 +220,6 @@ extern void spg_desc(StringInfo buf, XLogReaderState *record); extern const char *spg_identify(uint8 info); extern void spg_xlog_startup(void); extern void spg_xlog_cleanup(void); +extern void spg_mask(char *page); #endif /* SPGIST_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index c9f332c..f8192d2 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -105,6 +105,7 @@ extern bool EnableHotStandby; extern bool fullPageWrites; extern bool wal_log_hints; extern bool wal_compression; +extern bool wal_consistency; extern bool log_checkpoints; extern int CheckPointSegments; diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index ceb0462..1202fbd 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -266,6 +266,10 @@ typedef enum * "VACUUM". 
rm_desc can then be called to obtain additional detail for the * record, if available (e.g. the last block). * + * rm_mask takes as input a page associated with the resource manager's records + * and performs masking actions on it for consistency check comparisons. + * The input must be an already allocated page of size BLCKSZ. + * * RmgrTable[] is indexed by RmgrId values (see rmgrlist.h). */ typedef struct RmgrData @@ -276,6 +280,7 @@ typedef struct RmgrData const char *(*rm_identify) (uint8 info); void (*rm_startup) (void); void (*rm_cleanup) (void); + void (*rm_mask) (char *page); } RmgrData; extern const RmgrData RmgrTable[]; diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index 3dfcb49..74d5aa0 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -56,8 +56,8 @@ typedef struct XLogRecord /* * The high 4 bits in xl_info may be used freely by rmgr. The - * XLR_SPECIAL_REL_UPDATE bit can be passed by XLogInsert caller. The rest - * are set internally by XLogInsert. + * XLR_SPECIAL_REL_UPDATE and XLR_CHECK_CONSISTENCY bits can be passed by + * XLogInsert caller. The rest are set internally by XLogInsert. */ #define XLR_INFO_MASK 0x0F #define XLR_RMGR_INFO_MASK 0xF0 @@ -71,6 +71,15 @@ typedef struct XLogRecord #define XLR_SPECIAL_REL_UPDATE 0x01 /* + * Enforces consistency checks of replayed WAL at recovery. If enabled, + * each record will log a full-page write for each block modified by the + * record and will reuse it afterwards for consistency checks. The caller + * of XLogInsert can use this value if necessary; note that if wal_consistency + * is enabled this is set unconditionally. + */ +#define XLR_CHECK_CONSISTENCY 0x02 + +/* * Header info for block data appended to an XLOG record.
* * 'data_length' is the length of the rmgr-specific payload data associated diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index 392a626..f555899 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -82,5 +82,6 @@ extern void ResetSequenceCaches(void); extern void seq_redo(XLogReaderState *rptr); extern void seq_desc(StringInfo buf, XLogReaderState *rptr); extern const char *seq_identify(uint8 info); +extern void seq_mask(char *page); #endif /* SEQUENCE_H */ diff --git a/src/include/storage/bufmask.h b/src/include/storage/bufmask.h new file mode 100644 index 0000000..874c25f --- /dev/null +++ b/src/include/storage/bufmask.h @@ -0,0 +1,25 @@ +/*------------------------------------------------------------------------- + * + * bufmask.h + * Buffer masking definitions. + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/bufmask.h + */ + +#ifndef BUFMASK_H +#define BUFMASK_H + +#include "postgres.h" +#include "storage/block.h" +#include "storage/bufmgr.h" + +/* Marker used to mask pages consistently */ +#define MASK_MARKER 0xFF + +extern void mask_page_lsn(Page page); +extern void mask_page_hint_bits(Page page); +extern void mask_unused_space(Page page); +#endif diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index ad4ab5f..a5f34d3 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -435,4 +435,6 @@ extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, extern char *PageSetChecksumCopy(Page page, BlockNumber blkno); extern void PageSetChecksumInplace(Page page, BlockNumber blkno); +extern int comparePages(Page norm_new_page, Page norm_old_page); + #endif /* BUFPAGE_H */ diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm index c1b16ca..d986273 100644 --- a/src/test/perl/PostgresNode.pm +++ 
b/src/test/perl/PostgresNode.pm @@ -412,6 +412,7 @@ sub init print $conf "log_line_prefix = '%m [%p] %q%a '\n"; print $conf "log_statement = all\n"; print $conf "port = $port\n"; + print $conf "wal_consistency = on\n"; if ($params{allows_streaming}) {