From d34737432d11b14acad35dfec1e29b87bb0f0ab4 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 27 Aug 2019 11:44:17 -0700 Subject: [PATCH v2 1/2] Use full 64-bit XID for nbtree page deletion. Otherwise, after a deleted page gets even older, it becomes unrecyclable again. This is the nbtree equivalent of commit 6655a729, which did the same thing within GiST. Stop storing an XID that tracks the oldest safexid across all deleted pages in an index altogether. There is no longer any point in doing this. It only ever made sense when btpo.xact fields could wrap around. The old btm_oldest_btpo_xact metapage field has been repurposed in a way that preserves on-disk compatibility for pg_upgrade. Rename this uint32 field, and use it to store the number of deleted pages that we expect to be able to recycle during the next btvacuumcleanup() that actually scans the index. This approach is a little unorthodox, but we were already using btm_oldest_btpo_xact (now called btm_last_cleanup_num_delpages) in approximately the same way. And in exactly the same place: inside the _bt_vacuum_needs_cleanup() function. The general assumption is that we ought to be able to recycle however many pages btm_last_cleanup_num_delpages indicates by deciding to scan the index during a btvacuumcleanup() call (_bt_vacuum_needs_cleanup()'s decision). Note that manually issued VACUUMs won't be able to recycle btm_last_cleanup_num_delpages pages (and _bt_vacuum_needs_cleanup() won't instruct btvacuumcleanup() to skip scanning the index) unless at least one XID is consumed between VACUUMs. 
--- src/include/access/nbtree.h | 88 ++++++++++--- src/include/access/nbtxlog.h | 28 +++-- src/include/storage/standby.h | 2 + src/backend/access/gist/gistxlog.c | 24 +--- src/backend/access/nbtree/nbtinsert.c | 24 ++-- src/backend/access/nbtree/nbtpage.c | 170 ++++++++++++++------------ src/backend/access/nbtree/nbtree.c | 133 ++++++++++---------- src/backend/access/nbtree/nbtsearch.c | 6 +- src/backend/access/nbtree/nbtsort.c | 2 +- src/backend/access/nbtree/nbtxlog.c | 39 +++--- src/backend/access/rmgrdesc/nbtdesc.c | 17 +-- src/backend/storage/ipc/standby.c | 28 +++++ contrib/amcheck/verify_nbtree.c | 76 +++++++----- contrib/pageinspect/btreefuncs.c | 65 +++++++--- contrib/pgstattuple/pgstatindex.c | 8 +- 15 files changed, 432 insertions(+), 278 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index cad4f2bdeb..7b6a897e4a 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -37,8 +37,9 @@ typedef uint16 BTCycleId; * * In addition, we store the page's btree level (counting upwards from * zero at a leaf page) as well as some flag bits indicating the page type - * and status. If the page is deleted, we replace the level with the - * next-transaction-ID value indicating when it is safe to reclaim the page. + * and status. If the page is deleted, a BTDeletedPageContents struct is + * stored in the page's tuple area, while a standard BTPageOpaqueData struct + * is stored in the page special area. * * We also store a "vacuum cycle ID". When a page is split while VACUUM is * processing the index, a nonzero value associated with the VACUUM run is @@ -52,17 +53,24 @@ typedef uint16 BTCycleId; * * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested * instead. + * + * NOTE: the btpo_level field used to be a union type in order to allow + * deleted pages to store a 32-bit safexid is space now used only for page + * level. 
PostgreSQL 14+ consistently maintain the BTP_LEAF flag, as well as + * the btpo_level field, which can be useful during testing and analysis. + * + * (Actually, that's not quite true. It's still possible for a pg_upgraded'd + * database to have a BTP_DELETED page that's not marked BTP_HAS_FULLXID, in + * which case btpo_level will not in fact store the page level. This limited + * exception is inconsequential -- we simply assume that such a page is safe + * to recycle anyway.) */ typedef struct BTPageOpaqueData { BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */ BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */ - union - { - uint32 level; /* tree level --- zero for leaf pages */ - TransactionId xact; /* next transaction ID, if deleted */ - } btpo; + uint32 btpo_level; /* tree level --- zero for leaf pages */ uint16 btpo_flags; /* flag bits, see below */ BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */ } BTPageOpaqueData; @@ -78,6 +86,7 @@ typedef BTPageOpaqueData *BTPageOpaque; #define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */ #define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */ #define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */ +#define BTP_HAS_FULLXID (1 << 8) /* page has a BTDeletedPageContents */ /* * The max allowed value of a cycle ID is a bit less than 64K. 
This is @@ -105,10 +114,12 @@ typedef struct BTMetaPageData BlockNumber btm_fastroot; /* current "fast" root location */ uint32 btm_fastlevel; /* tree level of the "fast" root page */ /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */ - TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among all deleted - * pages */ - float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples - * during last cleanup */ + + /* number of deleted, non-recyclable pages during last cleanup */ + uint32 btm_last_cleanup_num_delpages; + /* number of heap tuples during last cleanup */ + float8 btm_last_cleanup_num_heap_tuples; + bool btm_allequalimage; /* are all columns "equalimage"? */ } BTMetaPageData; @@ -220,6 +231,55 @@ typedef struct BTMetaPageData #define P_IGNORE(opaque) (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0) #define P_HAS_GARBAGE(opaque) (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0) #define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0) +#define P_HAS_FULLXID(opaque) (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0) + +/* + * On a deleted page, we store this struct. A deleted page doesn't contain + * any tuples, so we don't use the normal page layout with line pointers. + * Instead, this struct is stored right after the standard page header. 
+ */ +typedef struct BTDeletedPageContents +{ + /* last xid which could see the page in a scan */ + FullTransactionId safexid; +} BTDeletedPageContents; + +static inline void +BTPageSetDeleted(Page page, FullTransactionId safexid) +{ + BTPageOpaque opaque; + PageHeader header; + BTDeletedPageContents *contents; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + header = ((PageHeader) page); + + opaque->btpo_flags &= ~BTP_HALF_DEAD; + opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID; + header->pd_lower = + MAXALIGN(SizeOfPageHeaderData) + sizeof(BTDeletedPageContents); + header->pd_upper = header->pd_special; + + /* Set safexid */ + contents = ((BTDeletedPageContents *) PageGetContents(page)); + contents->safexid = safexid; +} + +static inline FullTransactionId +BTPageGetDeleteXid(Page page) +{ + BTPageOpaque opaque PG_USED_FOR_ASSERTS_ONLY; + BTDeletedPageContents *contents; + + /* pg_upgrade'd indexes with old BTP_DELETED pages should not call here */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(P_ISDELETED(opaque) && !P_ISHALFDEAD(opaque) && + P_HAS_FULLXID(opaque)); + + /* Get safexid */ + contents = ((BTDeletedPageContents *) PageGetContents(page)); + return contents->safexid; +} /* * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost @@ -1067,7 +1127,8 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage); extern void _bt_update_meta_cleanup_info(Relation rel, - TransactionId oldestBtpoXact, float8 numHeapTuples); + BlockNumber pages_deleted_not_recycled, + float8 numHeapTuples); extern void _bt_upgrademetapage(Page page); extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); @@ -1091,8 +1152,7 @@ extern void _bt_delitems_vacuum(Relation rel, Buffer buf, extern void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp 
*delstate); -extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf, - TransactionId *oldestBtpoXact); +extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf); /* * prototypes for functions in nbtsearch.c diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 7ae5c98c2b..5f2bfd3b27 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -13,6 +13,7 @@ #ifndef NBTXLOG_H #define NBTXLOG_H +#include "access/transam.h" #include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/off.h" @@ -52,7 +53,7 @@ typedef struct xl_btree_metadata uint32 level; BlockNumber fastroot; uint32 fastlevel; - TransactionId oldest_btpo_xact; + uint32 last_cleanup_num_delpages; float8 last_cleanup_num_heap_tuples; bool allequalimage; } xl_btree_metadata; @@ -187,7 +188,7 @@ typedef struct xl_btree_reuse_page { RelFileNode node; BlockNumber block; - TransactionId latestRemovedXid; + FullTransactionId latestRemovedFullXid; } xl_btree_reuse_page; #define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page)) @@ -282,9 +283,12 @@ typedef struct xl_btree_mark_page_halfdead #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber)) /* - * This is what we need to know about deletion of a btree page. Note we do - * not store any content for the deleted page --- it is just rewritten as empty - * during recovery, apart from resetting the btpo.xact. + * This is what we need to know about deletion of a btree page. Note that we + * only leave behind a small amount of bookkeeping information in deleted + * pages (deleted pages must be kept around as tombstones for a while). It is + * convenient for the REDO routine to regenerate its target page from scratch. + * This is why WAL record describes certain details that are actually directly + * available from the target page. 
* * Backup Blk 0: target block being deleted * Backup Blk 1: target block's left sibling, if any @@ -296,20 +300,24 @@ typedef struct xl_btree_unlink_page { BlockNumber leftsib; /* target block's left sibling, if any */ BlockNumber rightsib; /* target block's right sibling */ + uint32 level; /* target block's level */ /* - * Information needed to recreate the leaf page, when target is an - * internal page. + * Information needed to recreate a half-dead leaf page with correct + * topparent link. The fields are only used when deletion operation's + * target page is an internal page. REDO routine creates half-dead page + * from scratch to keep things simple (this is the same convenient + * approach used for the target page itself). */ BlockNumber leafleftsib; BlockNumber leafrightsib; - BlockNumber topparent; /* next child down in the subtree */ + BlockNumber topparent; - TransactionId btpo_xact; /* value of btpo.xact for use in recovery */ + FullTransactionId safexid; /* BTPageSetDeleted() value */ /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */ } xl_btree_unlink_page; -#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId)) +#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, safexid) + sizeof(FullTransactionId)) /* * New root log record. 
There are zero tuples if this is to establish an diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index 94d33851d0..38fd85a431 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -31,6 +31,8 @@ extern void ShutdownRecoveryTransactionEnvironment(void); extern void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node); +extern void ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid, + RelFileNode node); extern void ResolveRecoveryConflictWithTablespace(Oid tsid); extern void ResolveRecoveryConflictWithDatabase(Oid dbid); diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index f2eda79bc1..1c80eae044 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -394,28 +394,8 @@ gistRedoPageReuse(XLogReaderState *record) * same exclusion effect on primary and standby. */ if (InHotStandby) - { - FullTransactionId latestRemovedFullXid = xlrec->latestRemovedFullXid; - FullTransactionId nextXid = ReadNextFullTransactionId(); - uint64 diff; - - /* - * ResolveRecoveryConflictWithSnapshot operates on 32-bit - * TransactionIds, so truncate the logged FullTransactionId. If the - * logged value is very old, so that XID wrap-around already happened - * on it, there can't be any snapshots that still see it. 
- */ - diff = U64FromFullTransactionId(nextXid) - - U64FromFullTransactionId(latestRemovedFullXid); - if (diff < MaxTransactionId / 2) - { - TransactionId latestRemovedXid; - - latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid); - ResolveRecoveryConflictWithSnapshot(latestRemovedXid, - xlrec->node); - } - } + ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid, + xlrec->node); } void diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index e333603912..1edb9f9579 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -1241,7 +1241,7 @@ _bt_insertonpg(Relation rel, metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); - if (metad->btm_fastlevel >= opaque->btpo.level) + if (metad->btm_fastlevel >= opaque->btpo_level) { /* no update wanted */ _bt_relbuf(rel, metabuf); @@ -1268,7 +1268,7 @@ _bt_insertonpg(Relation rel, if (metad->btm_version < BTREE_NOVAC_VERSION) _bt_upgrademetapage(metapg); metad->btm_fastroot = BufferGetBlockNumber(buf); - metad->btm_fastlevel = opaque->btpo.level; + metad->btm_fastlevel = opaque->btpo_level; MarkBufferDirty(metabuf); } @@ -1331,7 +1331,7 @@ _bt_insertonpg(Relation rel, xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; - xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; xlmeta.allequalimage = metad->btm_allequalimage; @@ -1537,7 +1537,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; lopaque->btpo_prev = oopaque->btpo_prev; /* handle btpo_next after rightpage buffer acquired */ - lopaque->btpo.level = oopaque->btpo.level; + lopaque->btpo_level = oopaque->btpo_level; /* handle btpo_cycleid after rightpage buffer acquired 
*/ /* @@ -1722,7 +1722,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); ropaque->btpo_prev = origpagenumber; ropaque->btpo_next = oopaque->btpo_next; - ropaque->btpo.level = oopaque->btpo.level; + ropaque->btpo_level = oopaque->btpo_level; ropaque->btpo_cycleid = lopaque->btpo_cycleid; /* @@ -1950,7 +1950,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, uint8 xlinfo; XLogRecPtr recptr; - xlrec.level = ropaque->btpo.level; + xlrec.level = ropaque->btpo_level; /* See comments below on newitem, orignewitem, and posting lists */ xlrec.firstrightoff = firstrightoff; xlrec.newitemoff = newitemoff; @@ -2142,7 +2142,7 @@ _bt_insert_parent(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* Find the leftmost page at the next level up */ - pbuf = _bt_get_endpoint(rel, opaque->btpo.level + 1, false, NULL); + pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL); /* Set up a phony stack entry pointing there */ stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); @@ -2480,15 +2480,15 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; rootopaque->btpo_flags = BTP_ROOT; - rootopaque->btpo.level = - ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1; + rootopaque->btpo_level = + ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_level + 1; rootopaque->btpo_cycleid = 0; /* update metapage data */ metad->btm_root = rootblknum; - metad->btm_level = rootopaque->btpo.level; + metad->btm_level = rootopaque->btpo_level; metad->btm_fastroot = rootblknum; - metad->btm_fastlevel = rootopaque->btpo.level; + metad->btm_fastlevel = rootopaque->btpo_level; /* * Insert the left page pointer into the new root page. 
The root page is @@ -2548,7 +2548,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) md.level = metad->btm_level; md.fastroot = rootblknum; md.fastlevel = metad->btm_level; - md.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; md.allequalimage = metad->btm_allequalimage; diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index ac264a5952..00aea725cb 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -37,7 +37,7 @@ static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, - TransactionId latestRemovedXid); + FullTransactionId latestRemovedFullXid); static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid, OffsetNumber *deletable, int ndeletable, @@ -50,7 +50,6 @@ static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, - TransactionId *oldestBtpoXact, uint32 *ndeleted); static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack, @@ -78,7 +77,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, metad->btm_level = level; metad->btm_fastroot = rootbknum; metad->btm_fastlevel = level; - metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_delpages = 0; metad->btm_last_cleanup_num_heap_tuples = -1.0; metad->btm_allequalimage = allequalimage; @@ -118,7 +117,7 @@ _bt_upgrademetapage(Page page) /* Set version number and fill extra fields added into version 3 */ metad->btm_version = BTREE_NOVAC_VERSION; - metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_delpages = 0; metad->btm_last_cleanup_num_heap_tuples = -1.0; /* Only a 
REINDEX can set this field */ Assert(!metad->btm_allequalimage); @@ -176,7 +175,8 @@ _bt_getmeta(Relation rel, Buffer metabuf) * to those written in the metapage. On mismatch, metapage is overwritten. */ void -_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, +_bt_update_meta_cleanup_info(Relation rel, + BlockNumber pages_deleted_not_recycled, float8 numHeapTuples) { Buffer metabuf; @@ -185,6 +185,9 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, bool needsRewrite = false; XLogRecPtr recptr; + StaticAssertStmt(sizeof(BlockNumber) == sizeof(TransactionId), + "on-disk compatibility assumption violated"); + /* read the metapage and check if it needs rewrite */ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); metapg = BufferGetPage(metabuf); @@ -193,8 +196,9 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, /* outdated version of metapage always needs rewrite */ if (metad->btm_version < BTREE_NOVAC_VERSION) needsRewrite = true; - else if (metad->btm_oldest_btpo_xact != oldestBtpoXact || - metad->btm_last_cleanup_num_heap_tuples != numHeapTuples) + else if (metad->btm_last_cleanup_num_delpages != pages_deleted_not_recycled) + needsRewrite = true; + else if (metad->btm_last_cleanup_num_heap_tuples != numHeapTuples) needsRewrite = true; if (!needsRewrite) @@ -214,7 +218,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, _bt_upgrademetapage(metapg); /* update cleanup-related information */ - metad->btm_oldest_btpo_xact = oldestBtpoXact; + metad->btm_last_cleanup_num_delpages = pages_deleted_not_recycled; metad->btm_last_cleanup_num_heap_tuples = numHeapTuples; MarkBufferDirty(metabuf); @@ -232,7 +236,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, md.level = metad->btm_level; md.fastroot = metad->btm_fastroot; md.fastlevel = metad->btm_fastlevel; - md.oldest_btpo_xact = oldestBtpoXact; + /* XXX last_cleanup_num_delpages is actually 
pages_deleted_not_recycled */ + md.last_cleanup_num_delpages = pages_deleted_not_recycled; md.last_cleanup_num_heap_tuples = numHeapTuples; md.allequalimage = metad->btm_allequalimage; @@ -316,7 +321,7 @@ _bt_getroot(Relation rel, int access) * because that's not set in a "fast root". */ if (!P_IGNORE(rootopaque) && - rootopaque->btpo.level == rootlevel && + rootopaque->btpo_level == rootlevel && P_LEFTMOST(rootopaque) && P_RIGHTMOST(rootopaque)) { @@ -377,7 +382,7 @@ _bt_getroot(Relation rel, int access) rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT); - rootopaque->btpo.level = 0; + rootopaque->btpo_level = 0; rootopaque->btpo_cycleid = 0; /* Get raw page pointer for metapage */ metapg = BufferGetPage(metabuf); @@ -393,7 +398,7 @@ _bt_getroot(Relation rel, int access) metad->btm_level = 0; metad->btm_fastroot = rootblkno; metad->btm_fastlevel = 0; - metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_delpages = 0; metad->btm_last_cleanup_num_heap_tuples = -1.0; MarkBufferDirty(rootbuf); @@ -416,7 +421,7 @@ _bt_getroot(Relation rel, int access) md.level = 0; md.fastroot = rootblkno; md.fastlevel = 0; - md.oldest_btpo_xact = InvalidTransactionId; + md.last_cleanup_num_delpages = 0; md.last_cleanup_num_heap_tuples = -1.0; md.allequalimage = metad->btm_allequalimage; @@ -481,11 +486,11 @@ _bt_getroot(Relation rel, int access) rootblkno = rootopaque->btpo_next; } - /* Note: can't check btpo.level on deleted pages */ - if (rootopaque->btpo.level != rootlevel) + /* Note: can't check btpo_level from !P_HAS_FULLXID() deleted page */ + if (rootopaque->btpo_level != rootlevel) elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", rootblkno, RelationGetRelationName(rel), - rootopaque->btpo.level, rootlevel); + rootopaque->btpo_level, rootlevel); } /* @@ -585,11 +590,11 @@ _bt_gettrueroot(Relation rel) rootblkno = 
rootopaque->btpo_next; } - /* Note: can't check btpo.level on deleted pages */ - if (rootopaque->btpo.level != rootlevel) + /* Note: can't check btpo_level from !P_HAS_FULLXID() deleted page */ + if (rootopaque->btpo_level != rootlevel) elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", rootblkno, RelationGetRelationName(rel), - rootopaque->btpo.level, rootlevel); + rootopaque->btpo_level, rootlevel); return rootbuf; } @@ -762,7 +767,8 @@ _bt_checkpage(Relation rel, Buffer buf) * Log the reuse of a page from the FSM. */ static void -_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid) +_bt_log_reuse_page(Relation rel, BlockNumber blkno, + FullTransactionId latestRemovedFullXid) { xl_btree_reuse_page xlrec_reuse; @@ -775,7 +781,7 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX /* XLOG stuff */ xlrec_reuse.node = rel->rd_node; xlrec_reuse.block = blkno; - xlrec_reuse.latestRemovedXid = latestRemovedXid; + xlrec_reuse.latestRemovedFullXid = latestRemovedFullXid; XLogBeginInsert(); XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage); @@ -862,17 +868,18 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) * If we are generating WAL for Hot Standby then create a * WAL record that will allow us to conflict with queries * running on standby, in case they have snapshots older - * than btpo.xact. This can only apply if the page does - * have a valid btpo.xact value, ie not if it's new. (We - * must check that because an all-zero page has no special - * space.) + * than safexid value returned by BTPageGetDeleteXid(). + * This can only apply if the page does have a valid + * safexid value, ie not if it's new. (We must check that + * because an all-zero page has no special space.) 
*/ if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) && !PageIsNew(page)) { - BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + FullTransactionId latestRemovedFullXid; - _bt_log_reuse_page(rel, blkno, opaque->btpo.xact); + latestRemovedFullXid = BTPageGetDeleteXid(page); + _bt_log_reuse_page(rel, blkno, latestRemovedFullXid); } /* Okay to use page. Re-initialize and return it */ @@ -1101,9 +1108,31 @@ _bt_page_recyclable(Page page) * interested in it. */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (P_ISDELETED(opaque) && - GlobalVisCheckRemovableXid(NULL, opaque->btpo.xact)) - return true; + if (P_ISDELETED(opaque)) + { + /* + * If this is a pg_upgrade'd index, then this could be a deleted page + * whose XID (which is stored in special area's level field via type + * punning) is non-full 32-bit value. It's safe to just assume that + * we can recycle because the system must have been restarted since + * the time of deletion. + */ + if (!P_HAS_FULLXID(opaque)) + return true; + + /* + * The page was deleted, but when? If it was just deleted, a scan + * might have seen the downlink to it, and will read the page later. + * As long as that can happen, we must keep the deleted page around as + * a tombstone. + * + * For that check if the deletion XID could still be visible to + * anyone. If not, then no scan that's still in progress could have + * seen its downlink, and we can recycle it. + */ + return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page)); + } + return false; } @@ -1768,16 +1797,12 @@ _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib) * that the btvacuumscan scan has yet to reach; they'll get counted later * instead. * - * Maintains *oldestBtpoXact for any pages that get deleted. Caller is - * responsible for maintaining *oldestBtpoXact in the case of pages that were - * deleted by a previous VACUUM. - * * NOTE: this leaks memory. 
Rather than trying to clean up everything * carefully, it's better to run it in a temp context that can be reset * frequently. */ uint32 -_bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) +_bt_pagedel(Relation rel, Buffer leafbuf) { uint32 ndeleted = 0; BlockNumber rightsib; @@ -1985,8 +2010,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) { /* Check for interrupts in _bt_unlink_halfdead_page */ if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno, - &rightsib_empty, oldestBtpoXact, - &ndeleted)) + &rightsib_empty, &ndeleted)) { /* * _bt_unlink_halfdead_page should never fail, since we @@ -2001,9 +2025,8 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) } } - Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque)); - Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact, - *oldestBtpoXact)); + Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque) && + P_HAS_FULLXID(opaque)); rightsib = opaque->btpo_next; @@ -2264,12 +2287,6 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) * containing leafbuf. (We always set *rightsib_empty for caller, just to be * consistent.) * - * We maintain *oldestBtpoXact for pages that are deleted by the current - * VACUUM operation here. This must be handled here because we conservatively - * assume that there needs to be a new call to ReadNewTransactionId() each - * time a page gets deleted. See comments about the underlying assumption - * below. - * * Must hold pin and lock on leafbuf at entry (read or write doesn't matter). * On success exit, we'll be holding pin and write lock. 
On failure exit, * we'll release both pin and lock before returning (we define it that way @@ -2277,8 +2294,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) */ static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, - bool *rightsib_empty, TransactionId *oldestBtpoXact, - uint32 *ndeleted) + bool *rightsib_empty, uint32 *ndeleted) { BlockNumber leafblkno = BufferGetBlockNumber(leafbuf); BlockNumber leafleftsib; @@ -2294,12 +2310,12 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, BTMetaPageData *metad = NULL; ItemId itemid; Page page; - PageHeader header; BTPageOpaque opaque; + FullTransactionId safexid; bool rightsib_is_rightmost; - int targetlevel; + uint32 targetlevel; IndexTuple leafhikey; - BlockNumber nextchild; + BlockNumber topparent_in_target; page = BufferGetPage(leafbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -2343,7 +2359,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); leftsib = opaque->btpo_prev; - targetlevel = opaque->btpo.level; + targetlevel = opaque->btpo_level; Assert(targetlevel > 0); /* @@ -2450,7 +2466,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque)) elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"", target, RelationGetRelationName(rel)); - nextchild = InvalidBlockNumber; + + /* Leaf page is also target page: don't set topparent */ + topparent_in_target = InvalidBlockNumber; } else { @@ -2459,11 +2477,13 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"", target, RelationGetRelationName(rel)); - /* Remember the next non-leaf child down in the subtree */ + /* Internal page is target: we'll set 
topparent in leaf page... */ itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque)); - nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid)); - if (nextchild == leafblkno) - nextchild = InvalidBlockNumber; + topparent_in_target = + BTreeTupleGetTopParent((IndexTuple) PageGetItem(page, itemid)); + /* ...except when it would be a redundant pointer-to-self */ + if (topparent_in_target == leafblkno) + topparent_in_target = InvalidBlockNumber; } /* @@ -2553,13 +2573,13 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, * no lock was held. */ if (target != leafblkno) - BTreeTupleSetTopParent(leafhikey, nextchild); + BTreeTupleSetTopParent(leafhikey, topparent_in_target); /* * Mark the page itself deleted. It can be recycled when all current * transactions are gone. Storing GetTopTransactionId() would work, but * we're in VACUUM and would not otherwise have an XID. Having already - * updated links to the target, ReadNewTransactionId() suffices as an + * updated links to the target, ReadNextFullTransactionId() suffices as an * upper bound. Any scan having retained a now-stale link is advertising * in its PGPROC an xmin less than or equal to the value we read here. It * will continue to do so, holding back the xmin horizon, for the duration @@ -2568,17 +2588,14 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque)); - opaque->btpo_flags &= ~BTP_HALF_DEAD; - opaque->btpo_flags |= BTP_DELETED; - opaque->btpo.xact = ReadNewTransactionId(); /* - * Remove the remaining tuples on the page. This keeps things simple for - * WAL consistency checking. 
+ * Store upper bound XID that's used to determine when deleted page is no + * longer needed as a tombstone */ - header = (PageHeader) page; - header->pd_lower = SizeOfPageHeaderData; - header->pd_upper = header->pd_special; + safexid = ReadNextFullTransactionId(); + BTPageSetDeleted(page, safexid); + opaque->btpo_cycleid = 0; /* And update the metapage, if needed */ if (BufferIsValid(metabuf)) @@ -2616,15 +2633,16 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, if (target != leafblkno) XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT); - /* information on the unlinked block */ + /* information stored on the target/to-be-unlinked block */ xlrec.leftsib = leftsib; xlrec.rightsib = rightsib; - xlrec.btpo_xact = opaque->btpo.xact; + xlrec.level = targetlevel; + xlrec.safexid = safexid; /* information needed to recreate the leaf block (if not the target) */ xlrec.leafleftsib = leafleftsib; xlrec.leafrightsib = leafrightsib; - xlrec.topparent = nextchild; + xlrec.topparent = topparent_in_target; XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage); @@ -2638,7 +2656,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; - xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; xlmeta.allequalimage = metad->btm_allequalimage; @@ -2681,9 +2699,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, _bt_relbuf(rel, lbuf); _bt_relbuf(rel, rbuf); - if (!TransactionIdIsValid(*oldestBtpoXact) || - TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact)) - *oldestBtpoXact = opaque->btpo.xact; + /* If the target is not leafbuf, we're done with it now -- release it */ + if (target != leafblkno) + _bt_relbuf(rel, buf); /* * If 
btvacuumscan won't revisit this page in a future btvacuumpage call @@ -2693,10 +2711,6 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, if (target <= scanblkno) (*ndeleted)++; - /* If the target is not leafbuf, we're done with it now -- release it */ - if (target != leafblkno) - _bt_relbuf(rel, buf); - return true; } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 289bd3c15d..a9dc9c48dc 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -47,7 +47,6 @@ typedef struct void *callback_state; BTCycleId cycleid; BlockNumber totFreePages; /* true total # of free pages */ - TransactionId oldestBtpoXact; MemoryContext pagedelcontext; } BTVacState; @@ -802,66 +801,69 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) Buffer metabuf; Page metapg; BTMetaPageData *metad; - bool result = false; + BTOptions *relopts; + float8 cleanup_scale_factor; + uint32 btm_version; + BlockNumber prev_pages_deleted_not_recycled; + float8 prev_num_heap_tuples; + /* + * Copy details from metapage to local variables quickly. + * + * Note that we deliberately avoid using cached version of metapage here. + */ metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); + btm_version = metad->btm_version; + + if (btm_version < BTREE_NOVAC_VERSION) + { + /* + * Metapage needs to be dynamically upgraded to store fields that are + * only present when btm_version >= BTREE_NOVAC_VERSION + */ + _bt_relbuf(info->index, metabuf); + return true; + } + + prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + prev_pages_deleted_not_recycled = metad->btm_last_cleanup_num_delpages; + _bt_relbuf(info->index, metabuf); /* - * XXX: If IndexVacuumInfo contained the heap relation, we could be more - * aggressive about vacuuming non catalog relations by passing the table - * to GlobalVisCheckRemovableXid(). 
+ * If table receives enough insertions and no cleanup was performed, + * then index would appear to have stale statistics. If scale factor is + * set, we avoid that by performing cleanup if the number of inserted + * tuples exceeds vacuum_cleanup_index_scale_factor fraction of + * original tuples count. */ + relopts = (BTOptions *) info->index->rd_options; + cleanup_scale_factor = (relopts && + relopts->vacuum_cleanup_index_scale_factor >= 0) + ? relopts->vacuum_cleanup_index_scale_factor + : vacuum_cleanup_index_scale_factor; - if (metad->btm_version < BTREE_NOVAC_VERSION) - { - /* - * Do cleanup if metapage needs upgrade, because we don't have - * cleanup-related meta-information yet. - */ - result = true; - } - else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) && - GlobalVisCheckRemovableXid(NULL, metad->btm_oldest_btpo_xact)) - { - /* - * If any oldest btpo.xact from a previously deleted page in the index - * is visible to everyone, then at least one deleted page can be - * recycled -- don't skip cleanup. - */ - result = true; - } - else - { - BTOptions *relopts; - float8 cleanup_scale_factor; - float8 prev_num_heap_tuples; + if (cleanup_scale_factor <= 0 || + info->num_heap_tuples < 0 || + prev_num_heap_tuples <= 0 || + (info->num_heap_tuples - prev_num_heap_tuples) / + prev_num_heap_tuples >= cleanup_scale_factor) + return true; - /* - * If table receives enough insertions and no cleanup was performed, - * then index would appear have stale statistics. If scale factor is - * set, we avoid that by performing cleanup if the number of inserted - * tuples exceeds vacuum_cleanup_index_scale_factor fraction of - * original tuples count. - */ - relopts = (BTOptions *) info->index->rd_options; - cleanup_scale_factor = (relopts && - relopts->vacuum_cleanup_index_scale_factor >= 0) - ?
relopts->vacuum_cleanup_index_scale_factor - : vacuum_cleanup_index_scale_factor; - prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + /* + * Trigger cleanup in rare cases where prev_pages_deleted_not_recycled + * exceeds a significant fraction of the total size of the index. We can + * reasonably expect (though are not guaranteed) to be able to recycle + * this many pages during cleanup-only btvacuumscan call. This alone + * might be reason enough to proceed with btvacuumscan call. + */ + Assert(!info->analyze_only); + if (prev_pages_deleted_not_recycled > + RelationGetNumberOfBlocks(info->index) / 20) + return true; - if (cleanup_scale_factor <= 0 || - info->num_heap_tuples < 0 || - prev_num_heap_tuples <= 0 || - (info->num_heap_tuples - prev_num_heap_tuples) / - prev_num_heap_tuples >= cleanup_scale_factor) - result = true; - } - - _bt_relbuf(info->index, metabuf); - return result; + return false; } /* @@ -973,6 +975,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BlockNumber num_pages; BlockNumber scanblkno; bool needLock; + BlockNumber pages_deleted_not_recycled; /* * Reset counts that will be incremented during the scan; needed in case @@ -989,7 +992,6 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.callback_state = callback_state; vstate.cycleid = cycleid; vstate.totFreePages = 0; - vstate.oldestBtpoXact = InvalidTransactionId; /* Create a temporary memory context to run _bt_pagedel in */ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, @@ -1066,18 +1068,16 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexFreeSpaceMapVacuum(rel); /* - * Maintain the oldest btpo.xact and a count of the current number of heap - * tuples in the metapage (for the benefit of _bt_vacuum_needs_cleanup). + * Maintain the count of the current number of heap tuples in the + * metapage. Also maintain the last pages_deleted_not_recycled. 
Both + * values are used within _bt_vacuum_needs_cleanup. * - * The page with the oldest btpo.xact is typically a page deleted by this - * VACUUM operation, since pages deleted by a previous VACUUM operation - * tend to be placed in the FSM (by the current VACUUM operation) -- such - * pages are not candidates to be the oldest btpo.xact. (Note that pages - * placed in the FSM are reported as deleted pages in the bulk delete - * statistics, despite not counting as deleted pages for the purposes of - * determining the oldest btpo.xact.) + * pages_deleted_not_recycled is the number of deleted pages now in the + * index that were not safe to place in the FSM to be recycled just yet. */ - _bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact, + pages_deleted_not_recycled = stats->pages_deleted - vstate.totFreePages; + Assert(stats->pages_deleted >= vstate.totFreePages); + _bt_update_meta_cleanup_info(rel, pages_deleted_not_recycled, info->num_heap_tuples); /* update statistics */ @@ -1203,17 +1203,12 @@ backtrack: * recycle yet. */ stats->pages_deleted++; - - /* Maintain the oldest btpo.xact */ - if (!TransactionIdIsValid(vstate->oldestBtpoXact) || - TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) - vstate->oldestBtpoXact = opaque->btpo.xact; } else if (P_ISHALFDEAD(opaque)) { /* * Half-dead leaf page. Try to delete now. Might update - * oldestBtpoXact and pages_deleted below. + * pages_deleted below. */ attempt_pagedel = true; } @@ -1430,7 +1425,7 @@ backtrack: * count. There will be no double-counting. 
*/ Assert(blkno == scanblkno); - stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact); + stats->pages_deleted += _bt_pagedel(rel, buf); MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 2e3bda8171..d1177d8772 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -169,7 +169,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, * we're on the level 1 and asked to lock leaf page in write mode, * then lock next page in write mode, because it must be a leaf. */ - if (opaque->btpo.level == 1 && access == BT_WRITE) + if (opaque->btpo_level == 1 && access == BT_WRITE) page_access = BT_WRITE; /* drop the read lock on the page, then acquire one on its child */ @@ -2341,9 +2341,9 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, } /* Done? */ - if (opaque->btpo.level == level) + if (opaque->btpo_level == level) break; - if (opaque->btpo.level < level) + if (opaque->btpo_level < level) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("btree level %u not found in index \"%s\"", diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 5683daa34d..2c4d7f6e25 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -620,7 +620,7 @@ _bt_blnewpage(uint32 level) /* Initialize BT opaque state */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_prev = opaque->btpo_next = P_NONE; - opaque->btpo.level = level; + opaque->btpo_level = level; opaque->btpo_flags = (level > 0) ? 
0 : BTP_LEAF; opaque->btpo_cycleid = 0; diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index c1d578cc01..b6afe9526e 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -112,7 +112,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) md->btm_fastlevel = xlrec->fastlevel; /* Cannot log BTREE_MIN_VERSION index metapage without upgrade */ Assert(md->btm_version >= BTREE_NOVAC_VERSION); - md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact; + md->btm_last_cleanup_num_delpages = xlrec->last_cleanup_num_delpages; md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples; md->btm_allequalimage = xlrec->allequalimage; @@ -297,7 +297,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) ropaque->btpo_prev = origpagenumber; ropaque->btpo_next = spagenumber; - ropaque->btpo.level = xlrec->level; + ropaque->btpo_level = xlrec->level; ropaque->btpo_flags = isleaf ? BTP_LEAF : 0; ropaque->btpo_cycleid = 0; @@ -773,7 +773,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) pageop->btpo_prev = xlrec->leftblk; pageop->btpo_next = xlrec->rightblk; - pageop->btpo.level = 0; + pageop->btpo_level = 0; pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; pageop->btpo_cycleid = 0; @@ -802,6 +802,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record); BlockNumber leftsib; BlockNumber rightsib; + uint32 level; + bool isleaf; + FullTransactionId safexid; Buffer leftbuf; Buffer target; Buffer rightbuf; @@ -810,6 +813,12 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) leftsib = xlrec->leftsib; rightsib = xlrec->rightsib; + level = xlrec->level; + isleaf = (level == 0); + safexid = xlrec->safexid; + + /* No topparent link for leaf page (level 0) or level 1 */ + Assert(xlrec->topparent == InvalidBlockNumber || level > 1); /* * In normal operation, we would lock 
all the pages this WAL record @@ -844,9 +853,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) pageop->btpo_prev = leftsib; pageop->btpo_next = rightsib; - pageop->btpo.xact = xlrec->btpo_xact; - pageop->btpo_flags = BTP_DELETED; - if (!BlockNumberIsValid(xlrec->topparent)) + pageop->btpo_level = level; + BTPageSetDeleted(page, safexid); + if (isleaf) pageop->btpo_flags |= BTP_LEAF; pageop->btpo_cycleid = 0; @@ -892,6 +901,8 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) Buffer leafbuf; IndexTupleData trunctuple; + Assert(!isleaf); + leafbuf = XLogInitBufferForRedo(record, 3); page = (Page) BufferGetPage(leafbuf); @@ -901,7 +912,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; pageop->btpo_prev = xlrec->leafleftsib; pageop->btpo_next = xlrec->leafrightsib; - pageop->btpo.level = 0; + pageop->btpo_level = 0; pageop->btpo_cycleid = 0; /* Add a dummy hikey item */ @@ -942,7 +953,7 @@ btree_xlog_newroot(XLogReaderState *record) pageop->btpo_flags = BTP_ROOT; pageop->btpo_prev = pageop->btpo_next = P_NONE; - pageop->btpo.level = xlrec->level; + pageop->btpo_level = xlrec->level; if (xlrec->level == 0) pageop->btpo_flags |= BTP_LEAF; pageop->btpo_cycleid = 0; @@ -972,17 +983,15 @@ btree_xlog_reuse_page(XLogReaderState *record) * Btree reuse_page records exist to provide a conflict point when we * reuse pages in the index via the FSM. That's all they do though. * - * latestRemovedXid was the page's btpo.xact. The - * GlobalVisCheckRemovableXid test in _bt_page_recyclable() conceptually - * mirrors the pgxact->xmin > limitXmin test in + * latestRemovedXid was the page's deleteXid. The + * GlobalVisCheckRemovableFullXid(deleteXid) test in _bt_page_recyclable() + * conceptually mirrors the PGPROC->xmin > limitXmin test in * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the * same exclusion effect on primary and standby. 
*/ if (InHotStandby) - { - ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, - xlrec->node); - } + ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid, + xlrec->node); } void diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 6e0d6a2b72..5cce10a5b6 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -80,9 +80,10 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) rec; - appendStringInfo(buf, "left %u; right %u; btpo_xact %u; ", - xlrec->leftsib, xlrec->rightsib, - xlrec->btpo_xact); + appendStringInfo(buf, "left %u; right %u; level %u; safexid %u:%u; ", + xlrec->leftsib, xlrec->rightsib, xlrec->level, + EpochFromFullTransactionId(xlrec->safexid), + XidFromFullTransactionId(xlrec->safexid)); appendStringInfo(buf, "leafleft %u; leafright %u; topparent %u", xlrec->leafleftsib, xlrec->leafrightsib, xlrec->topparent); @@ -99,9 +100,11 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec; - appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u", + appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u", xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->latestRemovedXid); + xlrec->node.relNode, + EpochFromFullTransactionId(xlrec->latestRemovedFullXid), + XidFromFullTransactionId(xlrec->latestRemovedFullXid)); break; } case XLOG_BTREE_META_CLEANUP: @@ -110,8 +113,8 @@ btree_desc(StringInfo buf, XLogReaderState *record) xlrec = (xl_btree_metadata *) XLogRecGetBlockData(record, 0, NULL); - appendStringInfo(buf, "oldest_btpo_xact %u; last_cleanup_num_heap_tuples: %f", - xlrec->oldest_btpo_xact, + appendStringInfo(buf, "last_cleanup_num_delpages %u; last_cleanup_num_heap_tuples: %f", + xlrec->last_cleanup_num_delpages, xlrec->last_cleanup_num_heap_tuples); break; } diff --git 
a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 39a30c00f7..0eeb766943 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -452,6 +452,34 @@ ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode true); } +/* + * Variant of ResolveRecoveryConflictWithSnapshot that works with + * FullTransactionId values + */ +void +ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid, + RelFileNode node) +{ + /* + * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds, + * so truncate the logged FullTransactionId. If the logged value is very + * old, so that XID wrap-around already happened on it, there can't be any + * snapshots that still see it. + */ + FullTransactionId nextXid = ReadNextFullTransactionId(); + uint64 diff; + + diff = U64FromFullTransactionId(nextXid) - + U64FromFullTransactionId(latestRemovedFullXid); + if (diff < MaxTransactionId / 2) + { + TransactionId latestRemovedXid; + + latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid); + ResolveRecoveryConflictWithSnapshot(latestRemovedXid, node); + } +} + void ResolveRecoveryConflictWithTablespace(Oid tsid) { diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index b8c7793d9e..c184ccb323 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -769,7 +769,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) P_FIRSTDATAKEY(opaque)); itup = (IndexTuple) PageGetItem(state->target, itemid); nextleveldown.leftmost = BTreeTupleGetDownLink(itup); - nextleveldown.level = opaque->btpo.level - 1; + nextleveldown.level = opaque->btpo_level - 1; } else { @@ -795,13 +795,13 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent); /* Check level, which must be valid for non-ignorable page */ - if (level.level != 
opaque->btpo.level) + if (level.level != opaque->btpo_level) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down", RelationGetRelationName(state->rel)), errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", - current, level.level, opaque->btpo.level))); + current, level.level, opaque->btpo_level))); /* Verify invariants for page */ bt_target_page_check(state); @@ -1167,7 +1167,7 @@ bt_target_page_check(BtreeCheckState *state) bt_child_highkey_check(state, offset, NULL, - topaque->btpo.level); + topaque->btpo_level); } continue; } @@ -1529,7 +1529,7 @@ bt_target_page_check(BtreeCheckState *state) if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly) { bt_child_highkey_check(state, InvalidOffsetNumber, - NULL, topaque->btpo.level); + NULL, topaque->btpo_level); } } @@ -1606,7 +1606,7 @@ bt_right_page_check_scankey(BtreeCheckState *state) ereport(DEBUG1, (errcode(ERRCODE_NO_DATA), errmsg("level %u leftmost page of index \"%s\" was found deleted or half dead", - opaque->btpo.level, RelationGetRelationName(state->rel)), + opaque->btpo_level, RelationGetRelationName(state->rel)), errdetail_internal("Deleted page found when building scankey from right sibling."))); /* Be slightly more pro-active in freeing this memory, just in case */ @@ -1911,13 +1911,13 @@ bt_child_highkey_check(BtreeCheckState *state, (uint32) state->targetlsn))); /* Check level for non-ignorable page */ - if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1) + if (!P_IGNORE(opaque) && opaque->btpo_level != target_level - 1) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("block found while following rightlinks from child of index \"%s\" has invalid level", RelationGetRelationName(state->rel)), errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", - blkno, target_level - 1, opaque->btpo.level))); + 
blkno, target_level - 1, opaque->btpo_level))); /* Try to detect circular links */ if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev) @@ -2145,7 +2145,7 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, * check for downlink connectivity. */ bt_child_highkey_check(state, downlinkoffnum, - child, topaque->btpo.level); + child, topaque->btpo_level); /* * Since there cannot be a concurrent VACUUM operation in readonly mode, @@ -2290,7 +2290,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, errmsg("harmless interrupted page split detected in index %s", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.", - blkno, opaque->btpo.level, + blkno, opaque->btpo_level, opaque->btpo_prev, (uint32) (pagelsn >> 32), (uint32) pagelsn))); @@ -2321,7 +2321,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"", RelationGetRelationName(state->rel)); - level = opaque->btpo.level; + level = opaque->btpo_level; itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque)); itup = (IndexTuple) PageGetItem(page, itemid); childblk = BTreeTupleGetDownLink(itup); @@ -2336,16 +2336,16 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, break; /* Do an extra sanity check in passing on internal pages */ - if (copaque->btpo.level != level - 1) + if (copaque->btpo_level != level - 1) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down", RelationGetRelationName(state->rel)), errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.", blkno, childblk, - level - 1, copaque->btpo.level))); + level - 1, copaque->btpo_level))); - level = copaque->btpo.level; + level = copaque->btpo_level; 
itemid = PageGetItemIdCareful(state, childblk, child, P_FIRSTDATAKEY(copaque)); itup = (IndexTuple) PageGetItem(child, itemid); @@ -2407,7 +2407,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, errmsg("internal index block lacks downlink in index \"%s\"", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u level=%u page lsn=%X/%X.", - blkno, opaque->btpo.level, + blkno, opaque->btpo_level, (uint32) (pagelsn >> 32), (uint32) pagelsn))); } @@ -3002,21 +3002,26 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) } /* - * Deleted pages have no sane "level" field, so can only check non-deleted - * page level + * Deleted pages that still use the old 32-bit XID representation have no + * sane "level" field because they type pun the field, but all other pages + * (including pages deleted on Postgres 14+) have a valid value. */ - if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("invalid leaf page level %u for block %u in index \"%s\"", - opaque->btpo.level, blocknum, RelationGetRelationName(state->rel)))); + if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) + { + /* Okay, no reason not to trust btpo_level field from page */ - if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) && - opaque->btpo.level == 0) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("invalid internal page level 0 for block %u in index \"%s\"", - blocknum, RelationGetRelationName(state->rel)))); + if (P_ISLEAF(opaque) && opaque->btpo_level != 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("invalid leaf page level %u for block %u in index \"%s\"", + opaque->btpo_level, blocknum, RelationGetRelationName(state->rel)))); + + if (!P_ISLEAF(opaque) && opaque->btpo_level == 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("invalid internal page level 0 for block %u in index \"%s\"", + blocknum, 
RelationGetRelationName(state->rel)))); + } /* * Sanity checks for number of items on page. @@ -3064,7 +3069,8 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) * from version 9.4 on, so do the same here. See _bt_pagedel() for full * details. * - * Internal pages should never have garbage items, either. + * Also check that internal pages have no garbage items, and that no page + * has an invalid combination of page deletion related page level flags. */ if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque)) ereport(ERROR, @@ -3079,6 +3085,18 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) errmsg("internal page block %u in index \"%s\" has garbage items", blocknum, RelationGetRelationName(state->rel)))); + if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("full transaction id page flag appears in non-deleted block %u in index \"%s\"", + blocknum, RelationGetRelationName(state->rel)))); + + if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("deleted page block %u in index \"%s\" is half-dead", + blocknum, RelationGetRelationName(state->rel)))); + return page; } diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c index 8bb180bbbe..dfac1a9716 100644 --- a/contrib/pageinspect/btreefuncs.c +++ b/contrib/pageinspect/btreefuncs.c @@ -75,11 +75,7 @@ typedef struct BTPageStat /* opaque data */ BlockNumber btpo_prev; BlockNumber btpo_next; - union - { - uint32 level; - TransactionId xact; - } btpo; + uint32 btpo_level; uint16 btpo_flags; BTCycleId btpo_cycleid; } BTPageStat; @@ -112,9 +108,33 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat) /* page type (flags) */ if (P_ISDELETED(opaque)) { - stat->type = 'd'; - stat->btpo.xact = opaque->btpo.xact; - return; + /* We divide deleted pages into leaf ('d') or internal ('D') */ + if (P_ISLEAF(opaque) || 
!P_HAS_FULLXID(opaque)) + stat->type = 'd'; + else + stat->type = 'D'; + + /* + * Report safexid in a deleted page. + * + * Handle pg_upgrade'd deleted pages that used the previous safexid + * representation in btpo_level field (this used to be a union type + * called "btpo"). + */ + if (P_HAS_FULLXID(opaque)) + { + FullTransactionId safexid = BTPageGetDeleteXid(page); + + elog(NOTICE, "deleted page from block %u has safexid %u:%u", + blkno, EpochFromFullTransactionId(safexid), + XidFromFullTransactionId(safexid)); + } + else + elog(NOTICE, "deleted page from block %u has safexid %u", + blkno, opaque->btpo_level); + + /* Don't interpret BTDeletedPageContents as index tuples */ + maxoff = InvalidOffsetNumber; } else if (P_IGNORE(opaque)) stat->type = 'e'; @@ -128,7 +148,7 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat) /* btpage opaque data */ stat->btpo_prev = opaque->btpo_prev; stat->btpo_next = opaque->btpo_next; - stat->btpo.level = opaque->btpo.level; + stat->btpo_level = opaque->btpo_level; stat->btpo_flags = opaque->btpo_flags; stat->btpo_cycleid = opaque->btpo_cycleid; @@ -237,7 +257,8 @@ bt_page_stats_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version) values[j++] = psprintf("%u", stat.free_size); values[j++] = psprintf("%u", stat.btpo_prev); values[j++] = psprintf("%u", stat.btpo_next); - values[j++] = psprintf("%u", (stat.type == 'd') ?
stat.btpo.xact : stat.btpo.level); + /* The "btpo" field now only stores btpo_level, never an xact */ + values[j++] = psprintf("%u", stat.btpo_level); values[j++] = psprintf("%d", stat.btpo_flags); tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc), @@ -503,10 +524,14 @@ bt_page_items_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version) opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page); - if (P_ISDELETED(opaque)) - elog(NOTICE, "page is deleted"); - - fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); + if (!P_ISDELETED(opaque)) + fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); + else + { + /* Don't interpret BTDeletedPageContents as index tuples */ + elog(NOTICE, "page from block " INT64_FORMAT " is deleted", blkno); + fctx->max_calls = 0; + } uargs->leafpage = P_ISLEAF(opaque); uargs->rightmost = P_RIGHTMOST(opaque); @@ -603,7 +628,12 @@ bt_page_items_bytea(PG_FUNCTION_ARGS) - if (P_ISDELETED(opaque)) - elog(NOTICE, "page is deleted"); - fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); + if (!P_ISDELETED(opaque)) + fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); + else + { + /* Don't interpret BTDeletedPageContents as index tuples */ + elog(NOTICE, "page is deleted"); + fctx->max_calls = 0; + } uargs->leafpage = P_ISLEAF(opaque); uargs->rightmost = P_RIGHTMOST(opaque); @@ -723,7 +753,8 @@ bt_metap(PG_FUNCTION_ARGS) */ if (metad->btm_version >= BTREE_NOVAC_VERSION) { - values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact); + /* XXX: btm_last_cleanup_num_delpages used to be btm_oldest_btpo_xact */ + values[j++] = psprintf("%u", metad->btm_last_cleanup_num_delpages); values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples); values[j++] = metad->btm_allequalimage ?
"t" : "f"; } diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c index b1ce0d77d7..5368bb30f0 100644 --- a/contrib/pgstattuple/pgstatindex.c +++ b/contrib/pgstattuple/pgstatindex.c @@ -283,8 +283,12 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo) page = BufferGetPage(buffer); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - /* Determine page type, and update totals */ - + /* + * Determine page type, and update totals. + * + * Note that we arbitrarily bucket deleted pages together without + * considering if they're leaf pages or internal pages. + */ if (P_ISDELETED(opaque)) indexStat.deleted_pages++; else if (P_IGNORE(opaque)) -- 2.27.0