From a5c2da1fb4c9b528bc2ea5563cc74b65a5fcc8c5 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 20 Nov 2019 16:21:47 -0800 Subject: [PATCH v25 1/4] Remove dead "pin scan" code from nbtree VACUUM. Finish off the work of commit 3e4b7d87 by completely removing the "pin scan" code previously used by nbtree VACUUM: * Don't track lastBlockVacuumed within nbtree.c VACUUM code anymore. * Remove the lastBlockVacuumed field from xl_btree_vacuum WAL records (nbtree leaf page VACUUM records). * Remove the unnecessary extra call to _bt_delitems_vacuum() made against the last block. This occurred when VACUUM didn't have index tuples to kill on the final block in the index, based on the assumption that a final "pin scan" was still needed. (Clearly a final pin scan can never take place here, since the entire pin scan mechanism was totally disabled by commit 3e4b7d87.) Also, add a new ndeleted metadata field to xl_btree_vacuum, to replace the unneeded lastBlockVacuumed field. This isn't really needed either, since we could continue to infer the array length in nbtxlog.c by using the overall record length. However, it will become useful when the upcoming deduplication patch needs to add an "items updated" field to go alongside it (besides, it doesn't seem like a good idea to leave the xl_btree_vacuum struct without any fields; the C standard says that that's undefined). Discussion: https://postgr.es/m/CAH2-Wzn2pSqEOcBDAA40CnO82oEy-EOpE2bNh_XL_cfFoA86jw@mail.gmail.com --- src/include/access/nbtree.h | 3 +- src/include/access/nbtxlog.h | 25 ++----- src/backend/access/nbtree/nbtpage.c | 35 +++++----- src/backend/access/nbtree/nbtree.c | 74 ++------------------- src/backend/access/nbtree/nbtxlog.c | 95 +-------------------------- src/backend/access/rmgrdesc/nbtdesc.c | 3 +- 6 files changed, 28 insertions(+), 207 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 18a2a3e71c..9833cc10bd 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -779,8 +779,7 @@ extern bool _bt_page_recyclable(Page page); extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems, - BlockNumber lastBlockVacuumed); + OffsetNumber *deletable, int ndeletable); extern int _bt_pagedel(Relation rel, Buffer buf); /* diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 91b9ee00cf..71435a13b3 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -150,32 +150,17 @@ typedef struct xl_btree_reuse_page * The WAL record can represent deletion of any number of index tuples on a * single index page when executed by VACUUM. * - * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber. - * For a non-MVCC index scans there is an additional correctness requirement - * for applying these changes during recovery, which is that we must do one - * of these two things for every block in the index: - * * lock the block for cleanup and apply any required changes - * * EnsureBlockUnpinned() - * The purpose of this is to ensure that no index scans started before we - * finish scanning the index are still running by the time we begin to remove - * heap tuples. - * - * Any changes to any one block are registered on just one WAL record. All - * blocks that we need to run EnsureBlockUnpinned() are listed as a block range - * starting from the last block vacuumed through until this one. Individual - * block numbers aren't given. - * - * Note that the *last* WAL record in any vacuum of an index is allowed to - * have a zero length array of offsets. Earlier records must have at least one. + * Note that the WAL record in any vacuum of an index must have at least one + * item to delete. */ typedef struct xl_btree_vacuum { - BlockNumber lastBlockVacuumed; + uint32 ndeleted; - /* TARGET OFFSET NUMBERS FOLLOW */ + /* DELETED TARGET OFFSET NUMBERS FOLLOW */ } xl_btree_vacuum; -#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber)) +#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(uint32)) /* * This is what we need to know about marking an empty branch for deletion. diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 268f869a36..66c79623cf 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -968,32 +968,27 @@ _bt_page_recyclable(Page page) * deleting the page it points to. * * This routine assumes that the caller has pinned and locked the buffer. - * Also, the given itemnos *must* appear in increasing order in the array. + * Also, the given deletable array *must* be sorted in ascending order. * - * We record VACUUMs and b-tree deletes differently in WAL. InHotStandby - * we need to be able to pin all of the blocks in the btree in physical - * order when replaying the effects of a VACUUM, just as we do for the - * original VACUUM itself. lastBlockVacuumed allows us to tell whether an - * intermediate range of blocks has had no changes at all by VACUUM, - * and so must be scanned anyway during replay. We always write a WAL record - * for the last block in the index, whether or not it contained any items - * to be removed. This allows us to scan right up to end of index to - * ensure correct locking. + * We record VACUUMs and b-tree deletes differently in WAL. Deletes must + * generate recovery conflicts by accessing the heap inline, whereas VACUUMs + * can rely on the initial heap scan taking care of the problem (pruning would + * have generated the conflicts needed for hot standby already). */ void -_bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems, - BlockNumber lastBlockVacuumed) +_bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, + int ndeletable) { Page page = BufferGetPage(buf); BTPageOpaque opaque; + Assert(ndeletable > 0); + /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); /* Fix the page */ - if (nitems > 0) - PageIndexMultiDelete(page, itemnos, nitems); + PageIndexMultiDelete(page, deletable, ndeletable); /* * We can clear the vacuum cycle ID since this page has certainly been @@ -1019,7 +1014,7 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, XLogRecPtr recptr; xl_btree_vacuum xlrec_vacuum; - xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; + xlrec_vacuum.ndeleted = ndeletable; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); @@ -1030,8 +1025,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, * is. When XLogInsert stores the whole buffer, the offsets array * need not be stored too. */ - if (nitems > 0) - XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber)); + XLogRegisterBufData(0, (char *) deletable, ndeletable * + sizeof(OffsetNumber)); recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); @@ -1050,8 +1045,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, * Also, the given itemnos *must* appear in increasing order in the array. * * This is nearly the same as _bt_delitems_vacuum as far as what it does to - * the page, but the WAL logging considerations are quite different. See - * comments for _bt_delitems_vacuum. + * the page, but it needs to generate its own recovery conflicts by accessing + * the heap. See comments for _bt_delitems_vacuum. */ void _bt_delitems_delete(Relation rel, Buffer buf, diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index c67235ab80..bbc1376b0a 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -46,8 +46,6 @@ typedef struct IndexBulkDeleteCallback callback; void *callback_state; BTCycleId cycleid; - BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */ - BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */ BlockNumber totFreePages; /* true total # of free pages */ TransactionId oldestBtpoXact; MemoryContext pagedelcontext; @@ -978,8 +976,6 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.callback = callback; vstate.callback_state = callback_state; vstate.cycleid = cycleid; - vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ - vstate.lastBlockLocked = BTREE_METAPAGE; vstate.totFreePages = 0; vstate.oldestBtpoXact = InvalidTransactionId; @@ -1040,39 +1036,6 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, } } - /* - * Check to see if we need to issue one final WAL record for this index, - * which may be needed for correctness on a hot standby node when non-MVCC - * index scans could take place. - * - * If the WAL is replayed in hot standby, the replay process needs to get - * cleanup locks on all index leaf pages, just as we've been doing here. - * However, we won't issue any WAL records about pages that have no items - * to be deleted. For pages between pages we've vacuumed, the replay code - * will take locks under the direction of the lastBlockVacuumed fields in - * the XLOG_BTREE_VACUUM WAL records. To cover pages after the last one - * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record - * against the last leaf page in the index, if that one wasn't vacuumed. - */ - if (XLogStandbyInfoActive() && - vstate.lastBlockVacuumed < vstate.lastBlockLocked) - { - Buffer buf; - - /* - * The page should be valid, but we can't use _bt_getbuf() because we - * want to use a nondefault buffer access strategy. Since we aren't - * going to delete any items, getting cleanup lock again is probably - * overkill, but for consistency do that anyway. - */ - buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked, - RBM_NORMAL, info->strategy); - LockBufferForCleanup(buf); - _bt_checkpage(rel, buf); - _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed); - _bt_relbuf(rel, buf); - } - MemoryContextDelete(vstate.pagedelcontext); /* @@ -1203,13 +1166,6 @@ restart: LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); - /* - * Remember highest leaf page number we've taken cleanup lock on; see - * notes in btvacuumscan - */ - if (blkno > vstate->lastBlockLocked) - vstate->lastBlockLocked = blkno; - /* * Check whether we need to recurse back to earlier pages. What we * are concerned about is a page split that happened since we started @@ -1245,9 +1201,9 @@ restart: htup = &(itup->t_tid); /* - * During Hot Standby we currently assume that - * XLOG_BTREE_VACUUM records do not produce conflicts. That is - * only true as long as the callback function depends only + * During Hot Standby we currently assume that it's okay that + * XLOG_BTREE_VACUUM records do not produce conflicts. This is + * only safe as long as the callback function depends only * upon whether the index tuple refers to heap tuples removed * in the initial heap scan. When vacuum starts it derives a * value of OldestXmin. Backends taking later snapshots could @@ -1276,29 +1232,7 @@ restart: */ if (ndeletable > 0) { - /* - * Notice that the issued XLOG_BTREE_VACUUM WAL record includes - * all information to the replay code to allow it to get a cleanup - * lock on all pages between the previous lastBlockVacuumed and - * this page. This ensures that WAL replay locks all leaf pages at - * some point, which is important should non-MVCC scans be - * requested. This is currently unused on standby, but we record - * it anyway, so that the WAL contains the required information. - * - * Since we can visit leaf pages out-of-order when recursing, - * replay might end up locking such pages an extra time, but it - * doesn't seem worth the amount of bookkeeping it'd take to avoid - * that. - */ - _bt_delitems_vacuum(rel, buf, deletable, ndeletable, - vstate->lastBlockVacuumed); - - /* - * Remember highest leaf page number we've issued a - * XLOG_BTREE_VACUUM WAL record for. - */ - if (blkno > vstate->lastBlockVacuumed) - vstate->lastBlockVacuumed = blkno; + _bt_delitems_vacuum(rel, buf, deletable, ndeletable); stats->tuples_removed += ndeletable; /* must recompute maxoff */ diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 44f6283950..72a601bb22 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -386,107 +386,16 @@ btree_xlog_vacuum(XLogReaderState *record) Buffer buffer; Page page; BTPageOpaque opaque; -#ifdef UNUSED xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); - /* - * This section of code is thought to be no longer needed, after analysis - * of the calling paths. It is retained to allow the code to be reinstated - * if a flaw is revealed in that thinking. - * - * If we are running non-MVCC scans using this index we need to do some - * additional work to ensure correctness, which is known as a "pin scan" - * described in more detail in next paragraphs. We used to do the extra - * work in all cases, whereas we now avoid that work in most cases. If - * lastBlockVacuumed is set to InvalidBlockNumber then we skip the - * additional work required for the pin scan. - * - * Avoiding this extra work is important since it requires us to touch - * every page in the index, so is an O(N) operation. Worse, it is an - * operation performed in the foreground during redo, so it delays - * replication directly. - * - * If queries might be active then we need to ensure every leaf page is - * unpinned between the lastBlockVacuumed and the current block, if there - * are any. This prevents replay of the VACUUM from reaching the stage of - * removing heap tuples while there could still be indexscans "in flight" - * to those particular tuples for those scans which could be confused by - * finding new tuples at the old TID locations (see nbtree/README). - * - * It might be worth checking if there are actually any backends running; - * if not, we could just skip this. - * - * Since VACUUM can visit leaf pages out-of-order, it might issue records - * with lastBlockVacuumed >= block; that's not an error, it just means - * nothing to do now. - * - * Note: since we touch all pages in the range, we will lock non-leaf - * pages, and also any empty (all-zero) pages that may be in the index. It - * doesn't seem worth the complexity to avoid that. But it's important - * that HotStandbyActiveInReplay() will not return true if the database - * isn't yet consistent; so we need not fear reading still-corrupt blocks - * here during crash recovery. - */ - if (HotStandbyActiveInReplay() && BlockNumberIsValid(xlrec->lastBlockVacuumed)) - { - RelFileNode thisrnode; - BlockNumber thisblkno; - BlockNumber blkno; - - XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno); - - for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++) - { - /* - * We use RBM_NORMAL_NO_LOG mode because it's not an error - * condition to see all-zero pages. The original btvacuumpage - * scan would have skipped over all-zero pages, noting them in FSM - * but not bothering to initialize them just yet; so we mustn't - * throw an error here. (We could skip acquiring the cleanup lock - * if PageIsNew, but it's probably not worth the cycles to test.) - * - * XXX we don't actually need to read the block, we just need to - * confirm it is unpinned. If we had a special call into the - * buffer manager we could optimise this so that if the block is - * not in shared_buffers we confirm it as unpinned. Optimizing - * this is now moot, since in most cases we avoid the scan. - */ - buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno, - RBM_NORMAL_NO_LOG); - if (BufferIsValid(buffer)) - { - LockBufferForCleanup(buffer); - UnlockReleaseBuffer(buffer); - } - } - } -#endif - - /* - * Like in btvacuumpage(), we need to take a cleanup lock on every leaf - * page. See nbtree/README for details. - */ if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer) == BLK_NEEDS_REDO) { - char *ptr; - Size len; - - ptr = XLogRecGetBlockData(record, 0, &len); + char *ptr = XLogRecGetBlockData(record, 0, NULL); page = (Page) BufferGetPage(buffer); - if (len > 0) - { - OffsetNumber *unused; - OffsetNumber *unend; - - unused = (OffsetNumber *) ptr; - unend = (OffsetNumber *) ((char *) ptr + len); - - if ((unend - unused) > 0) - PageIndexMultiDelete(page, unused, unend - unused); - } + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); /* * Mark the page as not containing any LP_DEAD items --- see comments diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 4ee6d04a68..497f8dc77e 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -46,8 +46,7 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - appendStringInfo(buf, "lastBlockVacuumed %u", - xlrec->lastBlockVacuumed); + appendStringInfo(buf, "ndeleted %u", xlrec->ndeleted); break; } case XLOG_BTREE_DELETE: -- 2.17.1