From 39ef90d96d0c061b2e537c4cdc9899e4770c3023 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 27 Aug 2019 11:44:17 -0700 Subject: [PATCH v1 1/2] Use full 64-bit XID for nbtree page deletion. Otherwise, after a deleted page gets even older, it becomes unrecyclable again. This is the nbtree equivalent of commit 6655a729, which did the same thing within GiST. --- src/include/access/nbtree.h | 79 ++++++++++-- src/include/access/nbtxlog.h | 26 ++-- src/include/storage/standby.h | 2 + src/backend/access/gist/gistxlog.c | 24 +--- src/backend/access/nbtree/nbtinsert.c | 20 +-- src/backend/access/nbtree/nbtpage.c | 167 +++++++++++++++----------- src/backend/access/nbtree/nbtree.c | 47 +++++--- src/backend/access/nbtree/nbtsearch.c | 6 +- src/backend/access/nbtree/nbtsort.c | 2 +- src/backend/access/nbtree/nbtxlog.c | 37 +++--- src/backend/access/rmgrdesc/nbtdesc.c | 13 +- src/backend/storage/ipc/standby.c | 28 +++++ contrib/amcheck/verify_nbtree.c | 71 ++++++----- contrib/pageinspect/btreefuncs.c | 62 +++++++--- contrib/pgstattuple/pgstatindex.c | 8 +- 15 files changed, 384 insertions(+), 208 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index cad4f2bdeb..17083e9d76 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -37,8 +37,9 @@ typedef uint16 BTCycleId; * * In addition, we store the page's btree level (counting upwards from * zero at a leaf page) as well as some flag bits indicating the page type - * and status. If the page is deleted, we replace the level with the - * next-transaction-ID value indicating when it is safe to reclaim the page. + * and status. If the page is deleted, a BTDeletedPageContents struct is + * stored in the page's tuple area, while a standard BTPageOpaqueData struct + * is stored in the page special area. * * We also store a "vacuum cycle ID". 
When a page is split while VACUUM is * processing the index, a nonzero value associated with the VACUUM run is @@ -52,17 +53,24 @@ typedef uint16 BTCycleId; * * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested * instead. + * + * NOTE: the btpo_level field used to be a union type in order to allow + * deleted pages to store a 32-bit safexid is space now used only for page + * level. PostgreSQL 14+ consistently maintain the BTP_LEAF flag, as well as + * the btpo_level field, which can be useful during testing and analysis. + * + * (Actually, that's not quite true. It's still possible for a pg_upgraded'd + * database to have a BTP_DELETED page that's not marked BTP_HAS_FULLXID, in + * which case btpo_level will not in fact store the page level. This limited + * exception is inconsequential -- we simply assume that such a page is safe + * to recycle anyway.) */ typedef struct BTPageOpaqueData { BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */ BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */ - union - { - uint32 level; /* tree level --- zero for leaf pages */ - TransactionId xact; /* next transaction ID, if deleted */ - } btpo; + uint32 btpo_level; /* tree level --- zero for leaf pages */ uint16 btpo_flags; /* flag bits, see below */ BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */ } BTPageOpaqueData; @@ -78,6 +86,7 @@ typedef BTPageOpaqueData *BTPageOpaque; #define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */ #define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */ #define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */ +#define BTP_HAS_FULLXID (1 << 8) /* page has a BTDeletedPageContents */ /* * The max allowed value of a cycle ID is a bit less than 64K. 
This is @@ -105,8 +114,7 @@ typedef struct BTMetaPageData BlockNumber btm_fastroot; /* current "fast" root location */ uint32 btm_fastlevel; /* tree level of the "fast" root page */ /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */ - TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among all deleted - * pages */ + TransactionId btm_oldest_btpo_xact; /* oldest xid among deleted pages */ float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples * during last cleanup */ bool btm_allequalimage; /* are all columns "equalimage"? */ @@ -220,6 +228,55 @@ typedef struct BTMetaPageData #define P_IGNORE(opaque) (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0) #define P_HAS_GARBAGE(opaque) (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0) #define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0) +#define P_HAS_FULLXID(opaque) (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0) + +/* + * On a deleted page, we store this struct. A deleted page doesn't contain + * any tuples, so we don't use the normal page layout with line pointers. + * Instead, this struct is stored right after the standard page header. 
+ */ +typedef struct BTDeletedPageContents +{ + /* last xid which could see the page in a scan */ + FullTransactionId safexid; +} BTDeletedPageContents; + +static inline void +BTPageSetDeleted(Page page, FullTransactionId safexid) +{ + BTPageOpaque opaque; + PageHeader header; + BTDeletedPageContents *contents; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + header = ((PageHeader) page); + + opaque->btpo_flags &= ~BTP_HALF_DEAD; + opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID; + header->pd_lower = + MAXALIGN(SizeOfPageHeaderData) + sizeof(BTDeletedPageContents); + header->pd_upper = header->pd_special; + + /* Set safexid */ + contents = ((BTDeletedPageContents *) PageGetContents(page)); + contents->safexid = safexid; +} + +static inline FullTransactionId +BTPageGetDeleteXid(Page page) +{ + BTPageOpaque opaque PG_USED_FOR_ASSERTS_ONLY; + BTDeletedPageContents *contents; + + /* pg_upgrade'd indexes with old BTP_DELETED pages should not call here */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(P_ISDELETED(opaque) && !P_ISHALFDEAD(opaque) && + P_HAS_FULLXID(opaque)); + + /* Get safexid */ + contents = ((BTDeletedPageContents *) PageGetContents(page)); + return contents->safexid; +} /* * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost @@ -1067,7 +1124,7 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage); extern void _bt_update_meta_cleanup_info(Relation rel, - TransactionId oldestBtpoXact, float8 numHeapTuples); + FullTransactionId oldestSafeXid, float8 numHeapTuples); extern void _bt_upgrademetapage(Page page); extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); @@ -1092,7 +1149,7 @@ extern void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate); extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf, - 
TransactionId *oldestBtpoXact); + FullTransactionId *oldestSafeXid); /* * prototypes for functions in nbtsearch.c diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 7ae5c98c2b..1ae13dd2dd 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -13,6 +13,7 @@ #ifndef NBTXLOG_H #define NBTXLOG_H +#include "access/transam.h" #include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/off.h" @@ -187,7 +188,7 @@ typedef struct xl_btree_reuse_page { RelFileNode node; BlockNumber block; - TransactionId latestRemovedXid; + FullTransactionId latestRemovedFullXid; } xl_btree_reuse_page; #define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page)) @@ -282,9 +283,12 @@ typedef struct xl_btree_mark_page_halfdead #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber)) /* - * This is what we need to know about deletion of a btree page. Note we do - * not store any content for the deleted page --- it is just rewritten as empty - * during recovery, apart from resetting the btpo.xact. + * This is what we need to know about deletion of a btree page. Note that we + * only leave behind a small amount of bookkeeping information in deleted + * pages (deleted pages must be kept around as tombstones for a while). It is + * convenient for the REDO routine to regenerate its target page from scratch. + * This is why WAL record describes certain details that are actually directly + * available from the target page. * * Backup Blk 0: target block being deleted * Backup Blk 1: target block's left sibling, if any @@ -296,20 +300,24 @@ typedef struct xl_btree_unlink_page { BlockNumber leftsib; /* target block's left sibling, if any */ BlockNumber rightsib; /* target block's right sibling */ + uint32 level; /* target block's level */ /* - * Information needed to recreate the leaf page, when target is an - * internal page. 
+ * Information needed to recreate a half-dead leaf page with correct + * topparent link. The fields are only used when deletion operation's + * target page is an internal page. REDO routine creates half-dead page + * from scratch to keep things simple (this is the same convenient + * approach used for the target page itself). */ BlockNumber leafleftsib; BlockNumber leafrightsib; - BlockNumber topparent; /* next child down in the subtree */ + BlockNumber topparent; - TransactionId btpo_xact; /* value of btpo.xact for use in recovery */ + FullTransactionId safexid; /* BTPageSetDeleted() value */ /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */ } xl_btree_unlink_page; -#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId)) +#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, safexid) + sizeof(FullTransactionId)) /* * New root log record. There are zero tuples if this is to establish an diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index 94d33851d0..38fd85a431 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -31,6 +31,8 @@ extern void ShutdownRecoveryTransactionEnvironment(void); extern void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node); +extern void ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid, + RelFileNode node); extern void ResolveRecoveryConflictWithTablespace(Oid tsid); extern void ResolveRecoveryConflictWithDatabase(Oid dbid); diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index f2eda79bc1..1c80eae044 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -394,28 +394,8 @@ gistRedoPageReuse(XLogReaderState *record) * same exclusion effect on primary and standby. 
*/ if (InHotStandby) - { - FullTransactionId latestRemovedFullXid = xlrec->latestRemovedFullXid; - FullTransactionId nextXid = ReadNextFullTransactionId(); - uint64 diff; - - /* - * ResolveRecoveryConflictWithSnapshot operates on 32-bit - * TransactionIds, so truncate the logged FullTransactionId. If the - * logged value is very old, so that XID wrap-around already happened - * on it, there can't be any snapshots that still see it. - */ - diff = U64FromFullTransactionId(nextXid) - - U64FromFullTransactionId(latestRemovedFullXid); - if (diff < MaxTransactionId / 2) - { - TransactionId latestRemovedXid; - - latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid); - ResolveRecoveryConflictWithSnapshot(latestRemovedXid, - xlrec->node); - } - } + ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid, + xlrec->node); } void diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index e333603912..af96c09f46 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -1241,7 +1241,7 @@ _bt_insertonpg(Relation rel, metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); - if (metad->btm_fastlevel >= opaque->btpo.level) + if (metad->btm_fastlevel >= opaque->btpo_level) { /* no update wanted */ _bt_relbuf(rel, metabuf); @@ -1268,7 +1268,7 @@ _bt_insertonpg(Relation rel, if (metad->btm_version < BTREE_NOVAC_VERSION) _bt_upgrademetapage(metapg); metad->btm_fastroot = BufferGetBlockNumber(buf); - metad->btm_fastlevel = opaque->btpo.level; + metad->btm_fastlevel = opaque->btpo_level; MarkBufferDirty(metabuf); } @@ -1537,7 +1537,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; lopaque->btpo_prev = oopaque->btpo_prev; /* handle btpo_next after rightpage buffer acquired */ - lopaque->btpo.level = oopaque->btpo.level; + lopaque->btpo_level = oopaque->btpo_level; /* handle btpo_cycleid after rightpage 
buffer acquired */ /* @@ -1722,7 +1722,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); ropaque->btpo_prev = origpagenumber; ropaque->btpo_next = oopaque->btpo_next; - ropaque->btpo.level = oopaque->btpo.level; + ropaque->btpo_level = oopaque->btpo_level; ropaque->btpo_cycleid = lopaque->btpo_cycleid; /* @@ -1950,7 +1950,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, uint8 xlinfo; XLogRecPtr recptr; - xlrec.level = ropaque->btpo.level; + xlrec.level = ropaque->btpo_level; /* See comments below on newitem, orignewitem, and posting lists */ xlrec.firstrightoff = firstrightoff; xlrec.newitemoff = newitemoff; @@ -2142,7 +2142,7 @@ _bt_insert_parent(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* Find the leftmost page at the next level up */ - pbuf = _bt_get_endpoint(rel, opaque->btpo.level + 1, false, NULL); + pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL); /* Set up a phony stack entry pointing there */ stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); @@ -2480,15 +2480,15 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; rootopaque->btpo_flags = BTP_ROOT; - rootopaque->btpo.level = - ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1; + rootopaque->btpo_level = + ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_level + 1; rootopaque->btpo_cycleid = 0; /* update metapage data */ metad->btm_root = rootblknum; - metad->btm_level = rootopaque->btpo.level; + metad->btm_level = rootopaque->btpo_level; metad->btm_fastroot = rootblknum; - metad->btm_fastlevel = rootopaque->btpo.level; + metad->btm_fastlevel = rootopaque->btpo_level; /* * Insert the left page pointer into the new root page. 
The root page is diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index ac264a5952..86652fff29 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -37,7 +37,7 @@ static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, - TransactionId latestRemovedXid); + FullTransactionId latestRemovedFullXid); static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid, OffsetNumber *deletable, int ndeletable, @@ -50,7 +50,7 @@ static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, - TransactionId *oldestBtpoXact, + FullTransactionId *oldestSafeXid, uint32 *ndeleted); static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack, @@ -176,7 +176,7 @@ _bt_getmeta(Relation rel, Buffer metabuf) * to those written in the metapage. On mismatch, metapage is overwritten. 
*/ void -_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, +_bt_update_meta_cleanup_info(Relation rel, FullTransactionId oldestSafeXid, float8 numHeapTuples) { Buffer metabuf; @@ -184,6 +184,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, BTMetaPageData *metad; bool needsRewrite = false; XLogRecPtr recptr; + TransactionId oldestXid = XidFromFullTransactionId(oldestSafeXid); /* read the metapage and check if it needs rewrite */ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); @@ -193,7 +194,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, /* outdated version of metapage always needs rewrite */ if (metad->btm_version < BTREE_NOVAC_VERSION) needsRewrite = true; - else if (metad->btm_oldest_btpo_xact != oldestBtpoXact || + else if (metad->btm_oldest_btpo_xact != oldestXid || metad->btm_last_cleanup_num_heap_tuples != numHeapTuples) needsRewrite = true; @@ -214,7 +215,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, _bt_upgrademetapage(metapg); /* update cleanup-related information */ - metad->btm_oldest_btpo_xact = oldestBtpoXact; + metad->btm_oldest_btpo_xact = oldestXid; metad->btm_last_cleanup_num_heap_tuples = numHeapTuples; MarkBufferDirty(metabuf); @@ -232,7 +233,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, md.level = metad->btm_level; md.fastroot = metad->btm_fastroot; md.fastlevel = metad->btm_fastlevel; - md.oldest_btpo_xact = oldestBtpoXact; + md.oldest_btpo_xact = oldestXid; md.last_cleanup_num_heap_tuples = numHeapTuples; md.allequalimage = metad->btm_allequalimage; @@ -316,7 +317,7 @@ _bt_getroot(Relation rel, int access) * because that's not set in a "fast root". 
*/ if (!P_IGNORE(rootopaque) && - rootopaque->btpo.level == rootlevel && + rootopaque->btpo_level == rootlevel && P_LEFTMOST(rootopaque) && P_RIGHTMOST(rootopaque)) { @@ -377,7 +378,7 @@ _bt_getroot(Relation rel, int access) rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT); - rootopaque->btpo.level = 0; + rootopaque->btpo_level = 0; rootopaque->btpo_cycleid = 0; /* Get raw page pointer for metapage */ metapg = BufferGetPage(metabuf); @@ -481,11 +482,11 @@ _bt_getroot(Relation rel, int access) rootblkno = rootopaque->btpo_next; } - /* Note: can't check btpo.level on deleted pages */ - if (rootopaque->btpo.level != rootlevel) + /* Note: can't check btpo_level from !P_HAS_FULLXID() deleted page */ + if (rootopaque->btpo_level != rootlevel) elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", rootblkno, RelationGetRelationName(rel), - rootopaque->btpo.level, rootlevel); + rootopaque->btpo_level, rootlevel); } /* @@ -585,11 +586,11 @@ _bt_gettrueroot(Relation rel) rootblkno = rootopaque->btpo_next; } - /* Note: can't check btpo.level on deleted pages */ - if (rootopaque->btpo.level != rootlevel) + /* Note: can't check btpo_level from !P_HAS_FULLXID() deleted page */ + if (rootopaque->btpo_level != rootlevel) elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", rootblkno, RelationGetRelationName(rel), - rootopaque->btpo.level, rootlevel); + rootopaque->btpo_level, rootlevel); return rootbuf; } @@ -762,7 +763,8 @@ _bt_checkpage(Relation rel, Buffer buf) * Log the reuse of a page from the FSM. 
*/ static void -_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid) +_bt_log_reuse_page(Relation rel, BlockNumber blkno, + FullTransactionId latestRemovedFullXid) { xl_btree_reuse_page xlrec_reuse; @@ -775,7 +777,7 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX /* XLOG stuff */ xlrec_reuse.node = rel->rd_node; xlrec_reuse.block = blkno; - xlrec_reuse.latestRemovedXid = latestRemovedXid; + xlrec_reuse.latestRemovedFullXid = latestRemovedFullXid; XLogBeginInsert(); XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage); @@ -862,17 +864,18 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) * If we are generating WAL for Hot Standby then create a * WAL record that will allow us to conflict with queries * running on standby, in case they have snapshots older - * than btpo.xact. This can only apply if the page does - * have a valid btpo.xact value, ie not if it's new. (We - * must check that because an all-zero page has no special - * space.) + * than safexid value returned by BTPageGetDeleteXid(). + * This can only apply if the page does have a valid + * safexid value, ie not if it's new. (We must check that + * because an all-zero page has no special space.) */ if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) && !PageIsNew(page)) { - BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + FullTransactionId latestRemovedFullXid; - _bt_log_reuse_page(rel, blkno, opaque->btpo.xact); + latestRemovedFullXid = BTPageGetDeleteXid(page); + _bt_log_reuse_page(rel, blkno, latestRemovedFullXid); } /* Okay to use page. Re-initialize and return it */ @@ -1101,9 +1104,31 @@ _bt_page_recyclable(Page page) * interested in it. 
*/ opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (P_ISDELETED(opaque) && - GlobalVisCheckRemovableXid(NULL, opaque->btpo.xact)) - return true; + if (P_ISDELETED(opaque)) + { + /* + * If this is a pg_upgrade'd index, then this could be a deleted page + * whose XID (which is stored in special area's level field via type + * punning) is non-full 32-bit value. It's safe to just assume that + * we can recycle because the system must have been restarted since + * the time of deletion. + */ + if (!P_HAS_FULLXID(opaque)) + return true; + + /* + * The page was deleted, but when? If it was just deleted, a scan + * might have seen the downlink to it, and will read the page later. + * As long as that can happen, we must keep the deleted page around as + * a tombstone. + * + * For that check if the deletion XID could still be visible to + * anyone. If not, then no scan that's still in progress could have + * seen its downlink, and we can recycle it. + */ + return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page)); + } + return false; } @@ -1768,16 +1793,17 @@ _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib) * that the btvacuumscan scan has yet to reach; they'll get counted later * instead. * - * Maintains *oldestBtpoXact for any pages that get deleted. Caller is - * responsible for maintaining *oldestBtpoXact in the case of pages that were - * deleted by a previous VACUUM. + * Maintains *oldestSafeXid for any pages that get deleted. Caller is + * responsible for maintaining *oldestSafeXid in the case of pages that were + * deleted by a previous VACUUM but are nevertheless not yet safe to put in + * the FSM for recycling. * * NOTE: this leaks memory. Rather than trying to clean up everything * carefully, it's better to run it in a temp context that can be reset * frequently. 
*/ uint32 -_bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) +_bt_pagedel(Relation rel, Buffer leafbuf, FullTransactionId *oldestSafeXid) { uint32 ndeleted = 0; BlockNumber rightsib; @@ -1985,7 +2011,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) { /* Check for interrupts in _bt_unlink_halfdead_page */ if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno, - &rightsib_empty, oldestBtpoXact, + &rightsib_empty, oldestSafeXid, &ndeleted)) { /* @@ -2001,9 +2027,10 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) } } - Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque)); - Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact, - *oldestBtpoXact)); + Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque) && + P_HAS_FULLXID(opaque)); + Assert(FullTransactionIdFollowsOrEquals(BTPageGetDeleteXid(page), + *oldestSafeXid)); rightsib = opaque->btpo_next; @@ -2264,11 +2291,10 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) * containing leafbuf. (We always set *rightsib_empty for caller, just to be * consistent.) * - * We maintain *oldestBtpoXact for pages that are deleted by the current - * VACUUM operation here. This must be handled here because we conservatively - * assume that there needs to be a new call to ReadNewTransactionId() each - * time a page gets deleted. See comments about the underlying assumption - * below. + * We maintain *oldestSafeXid for pages that are deleted by the current VACUUM + * operation here. This must be handled here because we conservatively assume + * that there needs to be a new call to ReadNextFullTransactionId() each time + * a page gets deleted. See comments about the underlying assumption below. * * Must hold pin and lock on leafbuf at entry (read or write doesn't matter). * On success exit, we'll be holding pin and write lock. 
On failure exit, @@ -2277,8 +2303,8 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) */ static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, - bool *rightsib_empty, TransactionId *oldestBtpoXact, - uint32 *ndeleted) + bool *rightsib_empty, + FullTransactionId *oldestSafeXid, uint32 *ndeleted) { BlockNumber leafblkno = BufferGetBlockNumber(leafbuf); BlockNumber leafleftsib; @@ -2294,12 +2320,12 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, BTMetaPageData *metad = NULL; ItemId itemid; Page page; - PageHeader header; BTPageOpaque opaque; + FullTransactionId safexid; bool rightsib_is_rightmost; - int targetlevel; + uint32 targetlevel; IndexTuple leafhikey; - BlockNumber nextchild; + BlockNumber topparent_in_target; page = BufferGetPage(leafbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -2343,7 +2369,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); leftsib = opaque->btpo_prev; - targetlevel = opaque->btpo.level; + targetlevel = opaque->btpo_level; Assert(targetlevel > 0); /* @@ -2450,7 +2476,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque)) elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"", target, RelationGetRelationName(rel)); - nextchild = InvalidBlockNumber; + + /* Leaf page is also target page: don't set topparent */ + topparent_in_target = InvalidBlockNumber; } else { @@ -2459,11 +2487,13 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"", target, RelationGetRelationName(rel)); - /* Remember the next non-leaf child down in the subtree */ + /* Internal page is target: we'll set topparent in leaf page... 
*/ itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque)); - nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid)); - if (nextchild == leafblkno) - nextchild = InvalidBlockNumber; + topparent_in_target = + BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid)); + /* ...except when it would be a redundant pointer-to-self */ + if (topparent_in_target == leafblkno) + topparent_in_target = InvalidBlockNumber; } /* @@ -2553,13 +2583,13 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, * no lock was held. */ if (target != leafblkno) - BTreeTupleSetTopParent(leafhikey, nextchild); + BTreeTupleSetTopParent(leafhikey, topparent_in_target); /* * Mark the page itself deleted. It can be recycled when all current * transactions are gone. Storing GetTopTransactionId() would work, but * we're in VACUUM and would not otherwise have an XID. Having already - * updated links to the target, ReadNewTransactionId() suffices as an + * updated links to the target, ReadNextFullTransactionId() suffices as an * upper bound. Any scan having retained a now-stale link is advertising * in its PGPROC an xmin less than or equal to the value we read here. It * will continue to do so, holding back the xmin horizon, for the duration @@ -2568,17 +2598,14 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque)); - opaque->btpo_flags &= ~BTP_HALF_DEAD; - opaque->btpo_flags |= BTP_DELETED; - opaque->btpo.xact = ReadNewTransactionId(); /* - * Remove the remaining tuples on the page. This keeps things simple for - * WAL consistency checking. 
+ * Store upper bound XID that's used to determine when deleted page is no + * longer needed as a tombstone */ - header = (PageHeader) page; - header->pd_lower = SizeOfPageHeaderData; - header->pd_upper = header->pd_special; + safexid = ReadNextFullTransactionId(); + BTPageSetDeleted(page, safexid); + opaque->btpo_cycleid = 0; /* And update the metapage, if needed */ if (BufferIsValid(metabuf)) @@ -2616,15 +2643,16 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, if (target != leafblkno) XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT); - /* information on the unlinked block */ + /* information stored on the target/to-be-unlinked block */ xlrec.leftsib = leftsib; xlrec.rightsib = rightsib; - xlrec.btpo_xact = opaque->btpo.xact; + xlrec.level = targetlevel; + xlrec.safexid = safexid; /* information needed to recreate the leaf block (if not the target) */ xlrec.leafleftsib = leafleftsib; xlrec.leafrightsib = leafrightsib; - xlrec.topparent = nextchild; + xlrec.topparent = topparent_in_target; XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage); @@ -2681,9 +2709,14 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, _bt_relbuf(rel, lbuf); _bt_relbuf(rel, rbuf); - if (!TransactionIdIsValid(*oldestBtpoXact) || - TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact)) - *oldestBtpoXact = opaque->btpo.xact; + /* If the target is not leafbuf, we're done with it now -- release it */ + if (target != leafblkno) + _bt_relbuf(rel, buf); + + /* Maintain oldestSafeXid for whole VACUUM */ + if (!FullTransactionIdIsValid(*oldestSafeXid) || + FullTransactionIdPrecedes(safexid, *oldestSafeXid)) + *oldestSafeXid = safexid; /* * If btvacuumscan won't revisit this page in a future btvacuumpage call @@ -2693,10 +2726,6 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, if (target <= scanblkno) (*ndeleted)++; - /* If the target is not leafbuf, we're done with it now -- release it */ - if 
(target != leafblkno) - _bt_relbuf(rel, buf); - return true; } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 289bd3c15d..27b41a4979 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -47,7 +47,7 @@ typedef struct void *callback_state; BTCycleId cycleid; BlockNumber totFreePages; /* true total # of free pages */ - TransactionId oldestBtpoXact; + FullTransactionId oldestSafeXid; MemoryContext pagedelcontext; } BTVacState; @@ -826,9 +826,9 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) GlobalVisCheckRemovableXid(NULL, metad->btm_oldest_btpo_xact)) { /* - * If any oldest btpo.xact from a previously deleted page in the index - * is visible to everyone, then at least one deleted page can be - * recycled -- don't skip cleanup. + * If the oldest safexid/btpo_xact from a previously deleted page in + * the index is visible to everyone, then at least one deleted page + * can be recycled -- don't skip cleanup. */ result = true; } @@ -989,7 +989,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.callback_state = callback_state; vstate.cycleid = cycleid; vstate.totFreePages = 0; - vstate.oldestBtpoXact = InvalidTransactionId; + vstate.oldestSafeXid = InvalidFullTransactionId; /* Create a temporary memory context to run _bt_pagedel in */ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, @@ -1066,18 +1066,19 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexFreeSpaceMapVacuum(rel); /* - * Maintain the oldest btpo.xact and a count of the current number of heap + * Maintain the oldest safexid and a count of the current number of heap * tuples in the metapage (for the benefit of _bt_vacuum_needs_cleanup). 
* - * The page with the oldest btpo.xact is typically a page deleted by this + * The page with the oldest safexid is typically a page deleted by this * VACUUM operation, since pages deleted by a previous VACUUM operation * tend to be placed in the FSM (by the current VACUUM operation) -- such - * pages are not candidates to be the oldest btpo.xact. (Note that pages - * placed in the FSM are reported as deleted pages in the bulk delete - * statistics, despite not counting as deleted pages for the purposes of - * determining the oldest btpo.xact.) + * pages are not candidates to be the oldest safexid. + * + * Note that pages placed in the FSM are reported as deleted pages in the + * bulk delete statistics, despite not counting as deleted pages for the + * purposes of determining the oldest safexid. */ - _bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact, + _bt_update_meta_cleanup_info(rel, vstate.oldestSafeXid, info->num_heap_tuples); /* update statistics */ @@ -1198,22 +1199,32 @@ backtrack: } else if (P_ISDELETED(opaque)) { + FullTransactionId safexid; + /* * Already deleted page (which could be leaf or internal). Can't * recycle yet. */ stats->pages_deleted++; - /* Maintain the oldest btpo.xact */ - if (!TransactionIdIsValid(vstate->oldestBtpoXact) || - TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) - vstate->oldestBtpoXact = opaque->btpo.xact; + /* + * Maintain oldestSafeXid. We should only end up here with deleted + * pages that have the full transaction ID representation, since + * _bt_page_recyclable() always considers pg_upgrade'd deleted pages + * safe to recycle (the 32-bit XID must have been from before the + * upgrade). + */ + Assert(P_HAS_FULLXID(opaque)); + safexid = BTPageGetDeleteXid(page); + if (!FullTransactionIdIsValid(vstate->oldestSafeXid) || + FullTransactionIdPrecedes(safexid, vstate->oldestSafeXid)) + vstate->oldestSafeXid = safexid; } else if (P_ISHALFDEAD(opaque)) { /* * Half-dead leaf page. Try to delete now. 
Might update - * oldestBtpoXact and pages_deleted below. + * oldestSafeXid and pages_deleted below. */ attempt_pagedel = true; } @@ -1430,7 +1441,7 @@ backtrack: * count. There will be no double-counting. */ Assert(blkno == scanblkno); - stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact); + stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestSafeXid); MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 2e3bda8171..d1177d8772 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -169,7 +169,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, * we're on the level 1 and asked to lock leaf page in write mode, * then lock next page in write mode, because it must be a leaf. */ - if (opaque->btpo.level == 1 && access == BT_WRITE) + if (opaque->btpo_level == 1 && access == BT_WRITE) page_access = BT_WRITE; /* drop the read lock on the page, then acquire one on its child */ @@ -2341,9 +2341,9 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, } /* Done? */ - if (opaque->btpo.level == level) + if (opaque->btpo_level == level) break; - if (opaque->btpo.level < level) + if (opaque->btpo_level < level) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("btree level %u not found in index \"%s\"", diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 5683daa34d..2c4d7f6e25 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -620,7 +620,7 @@ _bt_blnewpage(uint32 level) /* Initialize BT opaque state */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_prev = opaque->btpo_next = P_NONE; - opaque->btpo.level = level; + opaque->btpo_level = level; opaque->btpo_flags = (level > 0) ? 
0 : BTP_LEAF; opaque->btpo_cycleid = 0; diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index c1d578cc01..b252d2e628 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -297,7 +297,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) ropaque->btpo_prev = origpagenumber; ropaque->btpo_next = spagenumber; - ropaque->btpo.level = xlrec->level; + ropaque->btpo_level = xlrec->level; ropaque->btpo_flags = isleaf ? BTP_LEAF : 0; ropaque->btpo_cycleid = 0; @@ -773,7 +773,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) pageop->btpo_prev = xlrec->leftblk; pageop->btpo_next = xlrec->rightblk; - pageop->btpo.level = 0; + pageop->btpo_level = 0; pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; pageop->btpo_cycleid = 0; @@ -802,6 +802,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record); BlockNumber leftsib; BlockNumber rightsib; + uint32 level; + bool isleaf; + FullTransactionId safexid; Buffer leftbuf; Buffer target; Buffer rightbuf; @@ -810,6 +813,12 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) leftsib = xlrec->leftsib; rightsib = xlrec->rightsib; + level = xlrec->level; + isleaf = (level == 0); + safexid = xlrec->safexid; + + /* No topparent link for leaf page (level 0) or level 1 */ + Assert(xlrec->topparent == InvalidBlockNumber || level > 1); /* * In normal operation, we would lock all the pages this WAL record @@ -844,9 +853,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) pageop->btpo_prev = leftsib; pageop->btpo_next = rightsib; - pageop->btpo.xact = xlrec->btpo_xact; - pageop->btpo_flags = BTP_DELETED; - if (!BlockNumberIsValid(xlrec->topparent)) + pageop->btpo_level = level; + BTPageSetDeleted(page, safexid); + if (isleaf) pageop->btpo_flags |= BTP_LEAF; pageop->btpo_cycleid = 0; @@ -892,6 +901,8 @@ btree_xlog_unlink_page(uint8 
info, XLogReaderState *record) Buffer leafbuf; IndexTupleData trunctuple; + Assert(!isleaf); + leafbuf = XLogInitBufferForRedo(record, 3); page = (Page) BufferGetPage(leafbuf); @@ -901,7 +912,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; pageop->btpo_prev = xlrec->leafleftsib; pageop->btpo_next = xlrec->leafrightsib; - pageop->btpo.level = 0; + pageop->btpo_level = 0; pageop->btpo_cycleid = 0; /* Add a dummy hikey item */ @@ -942,7 +953,7 @@ btree_xlog_newroot(XLogReaderState *record) pageop->btpo_flags = BTP_ROOT; pageop->btpo_prev = pageop->btpo_next = P_NONE; - pageop->btpo.level = xlrec->level; + pageop->btpo_level = xlrec->level; if (xlrec->level == 0) pageop->btpo_flags |= BTP_LEAF; pageop->btpo_cycleid = 0; @@ -972,17 +983,15 @@ btree_xlog_reuse_page(XLogReaderState *record) * Btree reuse_page records exist to provide a conflict point when we * reuse pages in the index via the FSM. That's all they do though. * - * latestRemovedXid was the page's btpo.xact. The - * GlobalVisCheckRemovableXid test in _bt_page_recyclable() conceptually - * mirrors the pgxact->xmin > limitXmin test in + * latestRemovedXid was the page's deleteXid. The + * GlobalVisCheckRemovableFullXid(deleteXid) test in _bt_page_recyclable() + * conceptually mirrors the PGPROC->xmin > limitXmin test in * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the * same exclusion effect on primary and standby. 
*/ if (InHotStandby) - { - ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, - xlrec->node); - } + ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid, + xlrec->node); } void diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 6e0d6a2b72..1a9bd36bc5 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -80,9 +80,10 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) rec; - appendStringInfo(buf, "left %u; right %u; btpo_xact %u; ", - xlrec->leftsib, xlrec->rightsib, - xlrec->btpo_xact); + appendStringInfo(buf, "left %u; right %u; level %u; safexid %u:%u; ", + xlrec->leftsib, xlrec->rightsib, xlrec->level, + EpochFromFullTransactionId(xlrec->safexid), + XidFromFullTransactionId(xlrec->safexid)); appendStringInfo(buf, "leafleft %u; leafright %u; topparent %u", xlrec->leafleftsib, xlrec->leafrightsib, xlrec->topparent); @@ -99,9 +100,11 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec; - appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u", + appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u", xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->latestRemovedXid); + xlrec->node.relNode, + EpochFromFullTransactionId(xlrec->latestRemovedFullXid), + XidFromFullTransactionId(xlrec->latestRemovedFullXid)); break; } case XLOG_BTREE_META_CLEANUP: diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 39a30c00f7..0eeb766943 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -452,6 +452,34 @@ ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode true); } +/* + * Variant of ResolveRecoveryConflictWithSnapshot that works with + * FullTransactionId values + */ +void 
+ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid, + RelFileNode node) +{ + /* + * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds, + * so truncate the logged FullTransactionId. If the logged value is very + * old, so that XID wrap-around already happened on it, there can't be any + * snapshots that still see it. + */ + FullTransactionId nextXid = ReadNextFullTransactionId(); + uint64 diff; + + diff = U64FromFullTransactionId(nextXid) - + U64FromFullTransactionId(latestRemovedFullXid); + if (diff < MaxTransactionId / 2) + { + TransactionId latestRemovedXid; + + latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid); + ResolveRecoveryConflictWithSnapshot(latestRemovedXid, node); + } +} + void ResolveRecoveryConflictWithTablespace(Oid tsid) { diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index b8c7793d9e..0032a2df67 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -769,7 +769,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) P_FIRSTDATAKEY(opaque)); itup = (IndexTuple) PageGetItem(state->target, itemid); nextleveldown.leftmost = BTreeTupleGetDownLink(itup); - nextleveldown.level = opaque->btpo.level - 1; + nextleveldown.level = opaque->btpo_level - 1; } else { @@ -795,13 +795,13 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent); /* Check level, which must be valid for non-ignorable page */ - if (level.level != opaque->btpo.level) + if (level.level != opaque->btpo_level) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down", RelationGetRelationName(state->rel)), errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", - current, level.level, opaque->btpo.level))); + current, level.level, 
opaque->btpo_level))); /* Verify invariants for page */ bt_target_page_check(state); @@ -1167,7 +1167,7 @@ bt_target_page_check(BtreeCheckState *state) bt_child_highkey_check(state, offset, NULL, - topaque->btpo.level); + topaque->btpo_level); } continue; } @@ -1529,7 +1529,7 @@ bt_target_page_check(BtreeCheckState *state) if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly) { bt_child_highkey_check(state, InvalidOffsetNumber, - NULL, topaque->btpo.level); + NULL, topaque->btpo_level); } } @@ -1606,7 +1606,7 @@ bt_right_page_check_scankey(BtreeCheckState *state) ereport(DEBUG1, (errcode(ERRCODE_NO_DATA), errmsg("level %u leftmost page of index \"%s\" was found deleted or half dead", - opaque->btpo.level, RelationGetRelationName(state->rel)), + opaque->btpo_level, RelationGetRelationName(state->rel)), errdetail_internal("Deleted page found when building scankey from right sibling."))); /* Be slightly more pro-active in freeing this memory, just in case */ @@ -1911,13 +1911,13 @@ bt_child_highkey_check(BtreeCheckState *state, (uint32) state->targetlsn))); /* Check level for non-ignorable page */ - if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1) + if (!P_IGNORE(opaque) && opaque->btpo_level != target_level - 1) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("block found while following rightlinks from child of index \"%s\" has invalid level", RelationGetRelationName(state->rel)), errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", - blkno, target_level - 1, opaque->btpo.level))); + blkno, target_level - 1, opaque->btpo_level))); /* Try to detect circular links */ if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev) @@ -2145,7 +2145,7 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, * check for downlink connectivity. 
*/ bt_child_highkey_check(state, downlinkoffnum, - child, topaque->btpo.level); + child, topaque->btpo_level); /* * Since there cannot be a concurrent VACUUM operation in readonly mode, @@ -2290,7 +2290,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, errmsg("harmless interrupted page split detected in index %s", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.", - blkno, opaque->btpo.level, + blkno, opaque->btpo_level, opaque->btpo_prev, (uint32) (pagelsn >> 32), (uint32) pagelsn))); @@ -2321,7 +2321,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"", RelationGetRelationName(state->rel)); - level = opaque->btpo.level; + level = opaque->btpo_level; itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque)); itup = (IndexTuple) PageGetItem(page, itemid); childblk = BTreeTupleGetDownLink(itup); @@ -2336,16 +2336,16 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, break; /* Do an extra sanity check in passing on internal pages */ - if (copaque->btpo.level != level - 1) + if (copaque->btpo_level != level - 1) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down", RelationGetRelationName(state->rel)), errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.", blkno, childblk, - level - 1, copaque->btpo.level))); + level - 1, copaque->btpo_level))); - level = copaque->btpo.level; + level = copaque->btpo_level; itemid = PageGetItemIdCareful(state, childblk, child, P_FIRSTDATAKEY(copaque)); itup = (IndexTuple) PageGetItem(child, itemid); @@ -2407,7 +2407,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, errmsg("internal index block lacks downlink in index \"%s\"", 
RelationGetRelationName(state->rel)), errdetail_internal("Block=%u level=%u page lsn=%X/%X.", - blkno, opaque->btpo.level, + blkno, opaque->btpo_level, (uint32) (pagelsn >> 32), (uint32) pagelsn))); } @@ -3002,21 +3002,26 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) } /* - * Deleted pages have no sane "level" field, so can only check non-deleted - * page level + * Deleted pages that still use the old 32-bit XID representation have no + * sane "level" field because they type pun the field, but all other pages + * (including pages deleted on Postgres 14+) have a valid value. */ - if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("invalid leaf page level %u for block %u in index \"%s\"", - opaque->btpo.level, blocknum, RelationGetRelationName(state->rel)))); + if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) + { + /* Okay, we can trust btpo_level field from page */ - if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) && - opaque->btpo.level == 0) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("invalid internal page level 0 for block %u in index \"%s\"", - blocknum, RelationGetRelationName(state->rel)))); + if (P_ISLEAF(opaque) && opaque->btpo_level != 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("invalid leaf page level %u for block %u in index \"%s\"", + opaque->btpo_level, blocknum, RelationGetRelationName(state->rel)))); + + if (!P_ISLEAF(opaque) && opaque->btpo_level == 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("invalid internal page level 0 for block %u in index \"%s\"", + blocknum, RelationGetRelationName(state->rel)))); + } /* * Sanity checks for number of items on page. @@ -3064,7 +3069,9 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) * from version 9.4 on, so do the same here. See _bt_pagedel() for full * details. * - * Internal pages should never have garbage items, either. 
+ * Also check that internal pages have no garbage items, and that no page + * has an invalid combination of page level flags relating to deleted + * pages. */ if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque)) ereport(ERROR, @@ -3079,6 +3086,12 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) errmsg("internal page block %u in index \"%s\" has garbage items", blocknum, RelationGetRelationName(state->rel)))); + if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("invalid page flag combination for block %u in index \"%s\"", + blocknum, RelationGetRelationName(state->rel)))); + return page; } diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c index 8bb180bbbe..bb81c699cd 100644 --- a/contrib/pageinspect/btreefuncs.c +++ b/contrib/pageinspect/btreefuncs.c @@ -75,11 +75,7 @@ typedef struct BTPageStat /* opaque data */ BlockNumber btpo_prev; BlockNumber btpo_next; - union - { - uint32 level; - TransactionId xact; - } btpo; + uint32 btpo_level; uint16 btpo_flags; BTCycleId btpo_cycleid; } BTPageStat; @@ -112,9 +108,33 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat) /* page type (flags) */ if (P_ISDELETED(opaque)) { - stat->type = 'd'; - stat->btpo.xact = opaque->btpo.xact; - return; + /* We divide deleted pages into leaf ('d') or internal ('D') */ + if (P_ISLEAF(opaque) || !P_HAS_FULLXID(opaque)) + stat->type = 'd'; + else + stat->type = 'D'; + + /* + * Report safexid in a deleted page. + * + * Handle pg_upgrade'd deleted pages that used the previous safexid + * representation in btpo_level field (this used to be a union type + * called "btpo"). 
+ */ + if (P_HAS_FULLXID(opaque)) + { + FullTransactionId safexid = BTPageGetDeleteXid(page); + + elog(NOTICE, "deleted page from block %u has safexid %u:%u", + blkno, EpochFromFullTransactionId(safexid), + XidFromFullTransactionId(safexid)); + } + else + elog(NOTICE, "deleted page from block %u has safexid %u", + blkno, opaque->btpo_level); + + /* Don't interpret BTDeletedPageContents as index tuples */ + maxoff = InvalidOffsetNumber; } else if (P_IGNORE(opaque)) stat->type = 'e'; @@ -128,7 +148,7 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat) /* btpage opaque data */ stat->btpo_prev = opaque->btpo_prev; stat->btpo_next = opaque->btpo_next; - stat->btpo.level = opaque->btpo.level; + stat->btpo_level = opaque->btpo_level; stat->btpo_flags = opaque->btpo_flags; stat->btpo_cycleid = opaque->btpo_cycleid; @@ -237,7 +257,8 @@ bt_page_stats_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version) values[j++] = psprintf("%u", stat.free_size); values[j++] = psprintf("%u", stat.btpo_prev); values[j++] = psprintf("%u", stat.btpo_next); - values[j++] = psprintf("%u", (stat.type == 'd') ? 
stat.btpo.xact : stat.btpo.level); + /* The "btpo" field now only stores btpo_level, never an xact */ + values[j++] = psprintf("%u", stat.btpo_level); values[j++] = psprintf("%d", stat.btpo_flags); tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc), @@ -503,10 +524,14 @@ bt_page_items_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version) opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page); - if (P_ISDELETED(opaque)) - elog(NOTICE, "page is deleted"); - - fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); + if (!P_ISDELETED(opaque)) + fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); + else + { + /* Don't interpret BTDeletedPageContents as index tuples */ + elog(NOTICE, "page from block " INT64_FORMAT " is deleted", blkno); + fctx->max_calls = 0; + } uargs->leafpage = P_ISLEAF(opaque); uargs->rightmost = P_RIGHTMOST(opaque); @@ -603,7 +628,14 @@ bt_page_items_bytea(PG_FUNCTION_ARGS) if (P_ISDELETED(opaque)) elog(NOTICE, "page is deleted"); - fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); + if (!P_ISDELETED(opaque)) + fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); + else + { + /* Don't interpret BTDeletedPageContents as index tuples */ + elog(NOTICE, "page from block is deleted"); + fctx->max_calls = 0; + } uargs->leafpage = P_ISLEAF(opaque); uargs->rightmost = P_RIGHTMOST(opaque); diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c index b1ce0d77d7..5368bb30f0 100644 --- a/contrib/pgstattuple/pgstatindex.c +++ b/contrib/pgstattuple/pgstatindex.c @@ -283,8 +283,12 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo) page = BufferGetPage(buffer); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - /* Determine page type, and update totals */ - + /* + * Determine page type, and update totals. + * + * Note that we arbitrarily bucket deleted pages together without + * considering if they're leaf pages or internal pages. 
+ */ if (P_ISDELETED(opaque)) indexStat.deleted_pages++; else if (P_IGNORE(opaque)) -- 2.27.0