From 65d93a69df59512fab217a6a34a80ed15b33ee27 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 4 Dec 2015 16:58:23 +0900 Subject: [PATCH] Ensure consistent on-disk state of UNLOGGED indexes at recovery Unlogged relation indexes need to have a consistent initial state on-disk at the time of replay to ensure that their replayed pages are found on disk should end of recovery happen and subsequently reset those relations. This commit adds a new XLOG record, named XLOG_FLUSH, that allows to enforce a page just replayed to be flushed to disk, ensuring that those relations will have a consistent state when replayed. All types of relation indexes whose persistence is unlogged are impacted by the bug this commit fixes, with various degrees of problems, most of them causing errors on promoted standbys when trying to INSERT new tuples to their parent relations. The worst problem found was with GIN indexes, where trying to insert a new tuple in it caused the system to remain stuck on a semaphore lock, making the system unresponsive. --- src/backend/access/brin/brin.c | 10 +++++++++- src/backend/access/brin/brin_pageops.c | 2 +- src/backend/access/gin/gininsert.c | 14 +++++++++++--- src/backend/access/gist/gist.c | 3 ++- src/backend/access/heap/rewriteheap.c | 6 ++++-- src/backend/access/nbtree/nbtree.c | 2 +- src/backend/access/nbtree/nbtsort.c | 3 ++- src/backend/access/rmgrdesc/xlogdesc.c | 7 ++++++- src/backend/access/spgist/spginsert.c | 6 +++--- src/backend/access/transam/xlog.c | 28 ++++++++++++++++++++++++++-- src/backend/access/transam/xloginsert.c | 14 ++++++++++---- src/backend/commands/tablecmds.c | 15 +++++++++++---- src/backend/commands/vacuumlazy.c | 2 +- src/backend/replication/logical/decode.c | 1 + src/include/access/xloginsert.h | 6 ++++-- src/include/catalog/pg_control.h | 1 + 16 files changed, 93 insertions(+), 27 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 99337b0..b646101 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -682,7 +682,15 @@ brinbuildempty(PG_FUNCTION_ARGS) brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index), BRIN_CURRENT_VERSION); MarkBufferDirty(metabuf); - log_newpage_buffer(metabuf, false); + + /* + * For unlogged relations, this page should be immediately flushed + * to disk after being replayed. This is necessary to ensure that the + * initial on-disk state of unlogged relations is preserved when + * they get reset at the end of recovery. + */ + log_newpage_buffer(metabuf, false, + index->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED); END_CRIT_SECTION(); UnlockReleaseBuffer(metabuf); diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c index f876f62..572fe20 100644 --- a/src/backend/access/brin/brin_pageops.c +++ b/src/backend/access/brin/brin_pageops.c @@ -865,7 +865,7 @@ brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer) page = BufferGetPage(buffer); brin_page_init(page, BRIN_PAGETYPE_REGULAR); MarkBufferDirty(buffer); - log_newpage_buffer(buffer, true); + log_newpage_buffer(buffer, true, false); END_CRIT_SECTION(); /* diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 49e9185..17c168a 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -450,14 +450,22 @@ ginbuildempty(PG_FUNCTION_ARGS) ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); LockBuffer(RootBuffer, BUFFER_LOCK_EXCLUSIVE); - /* Initialize and xlog metabuffer and root buffer. */ + /* + * Initialize and xlog metabuffer and root buffer. For unlogged + * relations, those pages need to be immediately flushed to disk + * after being replayed. This is necessary to ensure that the + * initial on-disk state of unlogged relations is preserved when + * they get reset at the end of recovery. + */ START_CRIT_SECTION(); GinInitMetabuffer(MetaBuffer); MarkBufferDirty(MetaBuffer); - log_newpage_buffer(MetaBuffer, false); + log_newpage_buffer(MetaBuffer, false, + index->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED); GinInitBuffer(RootBuffer, GIN_LEAF); MarkBufferDirty(RootBuffer); - log_newpage_buffer(RootBuffer, false); + log_newpage_buffer(RootBuffer, false, + index->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED); END_CRIT_SECTION(); /* Unlock and release the buffers. */ diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 53bccf6..6a20031 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -84,7 +84,8 @@ gistbuildempty(PG_FUNCTION_ARGS) START_CRIT_SECTION(); GISTInitBuffer(buffer, F_LEAF); MarkBufferDirty(buffer); - log_newpage_buffer(buffer, true); + log_newpage_buffer(buffer, true, + index->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED); END_CRIT_SECTION(); /* Unlock and release the buffer */ diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 6a6fc3b..e9a9a8f 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -336,7 +336,8 @@ end_heap_rewrite(RewriteState state) MAIN_FORKNUM, state->rs_blockno, state->rs_buffer, - true); + true, + false); RelationOpenSmgr(state->rs_new_rel); PageSetChecksumInplace(state->rs_buffer, state->rs_blockno); @@ -685,7 +686,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup) MAIN_FORKNUM, state->rs_blockno, page, - true); + true, + false); /* * Now write the page. We say isTemp = true even if it's not a diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index cf4a6dc..d211a98 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -206,7 +206,7 @@ btbuildempty(PG_FUNCTION_ARGS) (char *) metapage, true); if (XLogIsNeeded()) log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, - BTREE_METAPAGE, metapage, false); + BTREE_METAPAGE, metapage, false, true); /* * An immediate sync is required even if we xlog'd the page, because the diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index f95f67a..faf611c 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -277,7 +277,8 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) if (wstate->btws_use_wal) { /* We use the heap NEWPAGE record type for this */ - log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true); + log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, + true, false); } /* diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 83cc9e8..a5abf32 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -77,7 +77,9 @@ xlog_desc(StringInfo buf, XLogReaderState *record) appendStringInfoString(buf, xlrec->rp_name); } - else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) + else if (info == XLOG_FPI || + info == XLOG_FPI_FOR_HINT || + info == XLOG_FPI_FLUSH) { /* no further information to print */ } @@ -181,6 +183,9 @@ xlog_identify(uint8 info) case XLOG_FPI_FOR_HINT: id = "FPI_FOR_HINT"; break; + case XLOG_FPI_FLUSH: + id = "FPI_FOR_SYNC"; + break; } return id; diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index bceee8d..0758bfd 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -173,7 +173,7 @@ spgbuildempty(PG_FUNCTION_ARGS) (char *) page, true); if (XLogIsNeeded()) log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, - SPGIST_METAPAGE_BLKNO, page, false); + SPGIST_METAPAGE_BLKNO, page, false, true); /* Likewise for the root page. */ SpGistInitPage(page, SPGIST_LEAF); @@ -183,7 +183,7 @@ spgbuildempty(PG_FUNCTION_ARGS) (char *) page, true); if (XLogIsNeeded()) log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, - SPGIST_ROOT_BLKNO, page, true); + SPGIST_ROOT_BLKNO, page, true, true); /* Likewise for the null-tuples root page. */ SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS); @@ -193,7 +193,7 @@ spgbuildempty(PG_FUNCTION_ARGS) (char *) page, true); if (XLogIsNeeded()) log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, - SPGIST_NULL_BLKNO, page, true); + SPGIST_NULL_BLKNO, page, true, true); /* * An immediate sync is required even if we xlog'd the pages, because the diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 86debf4..3640f31 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9188,7 +9188,9 @@ xlog_redo(XLogReaderState *record) XLogRecPtr lsn = record->EndRecPtr; /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */ - Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT || + Assert(info == XLOG_FPI || + info == XLOG_FPI_FOR_HINT || + info == XLOG_FPI_FLUSH || !XLogRecHasAnyBlockRefs(record)); if (info == XLOG_NEXTOID) @@ -9378,7 +9380,9 @@ xlog_redo(XLogReaderState *record) { /* nothing to do here */ } - else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) + else if (info == XLOG_FPI || + info == XLOG_FPI_FOR_HINT || + info == XLOG_FPI_FLUSH) { Buffer buffer; @@ -9396,9 +9400,29 @@ xlog_redo(XLogReaderState *record) * when checksums are enabled. There is no difference in handling * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info * code just to distinguish them for statistics purposes. + * + * FPI_FOR_FLUSH records indicate that the page immediately needs to + * be written to disk, not just to shared buffers. This is important + * if the on-disk state is to be the authoritative, not the state + * in shared buffers. E.g. because on-disk files may later be copied + * directly. */ if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED) elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); + + if (info == XLOG_FPI_FLUSH) + { + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + SMgrRelation srel; + + (void) XLogRecGetBlockTag(record, 0, &rnode, &forknum, &blkno); + srel = smgropen(rnode, InvalidBackendId); + smgrwrite(srel, forknum, blkno, BufferGetPage(buffer), false); + smgrclose(srel); + } + UnlockReleaseBuffer(buffer); } else if (info == XLOG_BACKUP_END) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 925255f..15ed453 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -932,10 +932,13 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) * If the page follows the standard page layout, with a PageHeader and unused * space between pd_lower and pd_upper, set 'page_std' to TRUE. That allows * the unused space to be left out from the WAL record, making it smaller. + * + * If 'is_flush' is set to TRUE, relation will be requested to flush + * immediately its data at replay after replaying this full page image. */ XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, - Page page, bool page_std) + Page page, bool page_std, bool is_flush) { int flags; XLogRecPtr recptr; @@ -946,7 +949,7 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, XLogBeginInsert(); XLogRegisterBlock(0, rnode, forkNum, blkno, page, flags); - recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + recptr = XLogInsert(RM_XLOG_ID, is_flush ? XLOG_FPI_FLUSH : XLOG_FPI); /* * The page may be uninitialized. If so, we can't set the LSN because that @@ -969,9 +972,12 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, * If the page follows the standard page layout, with a PageHeader and unused * space between pd_lower and pd_upper, set 'page_std' to TRUE. That allows * the unused space to be left out from the WAL record, making it smaller. + * + * If 'is_flush' is set to TRUE, relation will be requested to flush + * immediately its data at replay after replaying this full page image. */ XLogRecPtr -log_newpage_buffer(Buffer buffer, bool page_std) +log_newpage_buffer(Buffer buffer, bool page_std, bool is_flush) { Page page = BufferGetPage(buffer); RelFileNode rnode; @@ -983,7 +989,7 @@ log_newpage_buffer(Buffer buffer, bool page_std) BufferGetTag(buffer, &rnode, &forkNum, &blkno); - return log_newpage(&rnode, forkNum, blkno, page, page_std); + return log_newpage(&rnode, forkNum, blkno, page, page_std, is_flush); } /* diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 0ddde72..dbe7ed9 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -9892,9 +9892,14 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst, /* * We need to log the copied data in WAL iff WAL archiving/streaming is - * enabled AND it's a permanent relation. + * enabled AND it's a permanent relation. Unlogged relations need to have + * their INIT_FORKNUM logged as well, and flushed at replay to ensure a + * consistent on-disk state when reset at the end of recovery. */ - use_wal = XLogIsNeeded() && relpersistence == RELPERSISTENCE_PERMANENT; + use_wal = XLogIsNeeded() && + (relpersistence == RELPERSISTENCE_PERMANENT || + (relpersistence == RELPERSISTENCE_UNLOGGED && + forkNum == INIT_FORKNUM)); nblocks = smgrnblocks(src, forkNum); @@ -9917,10 +9922,12 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst, /* * WAL-log the copied page. Unfortunately we don't know what kind of a * page this is, so we have to log the full page including any unused - * space. + * space. For the same reason, pages part of INIT_FORKNUM are always + * forcibly flushed at replay. */ if (use_wal) - log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false); + log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, + false, forkNum == INIT_FORKNUM); PageSetChecksumInplace(page, blkno); diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 2429889..b0e3901 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -736,7 +736,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, */ if (RelationNeedsWAL(onerel) && PageGetLSN(page) == InvalidXLogRecPtr) - log_newpage_buffer(buf, true); + log_newpage_buffer(buf, true, false); PageSetAllVisible(page); visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 9f60687..ba9bb36 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -174,6 +174,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_FPW_CHANGE: case XLOG_FPI_FOR_HINT: case XLOG_FPI: + case XLOG_FPI_FLUSH: break; default: elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 31b45ba..491caa5 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -53,8 +53,10 @@ extern void XLogResetInsertion(void); extern bool XLogCheckBufferNeedsBackup(Buffer buffer); extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, - BlockNumber blk, char *page, bool page_std); -extern XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std); + BlockNumber blk, char *page, bool page_std, + bool is_flush); +extern XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std, + bool is_flush); extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std); extern void InitXLogInsert(void); diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index ad1eb4b..9719980 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -73,6 +73,7 @@ typedef struct CheckPoint #define XLOG_END_OF_RECOVERY 0x90 #define XLOG_FPI_FOR_HINT 0xA0 #define XLOG_FPI 0xB0 +#define XLOG_FPI_FLUSH 0xC0 /* -- 2.6.3