From 86c815f96d3bb22a8f65486ef85bb3fbfd87b711 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sun, 18 Jan 2026 11:14:36 -0500 Subject: [PATCH v12 05/23] Use fake LSNs to improve nbtree dropPin behavior. Previously, scans of unlogged nbtree indexes needed to hold on to a leaf page buffer pin when stopped on that leaf page, purely so that _bt_killitems had a way to be sure that there wasn't any unsafe concurrent TID recycling by VACUUM. The dropPin strategy used by _bt_killitems couldn't be used with unlogged indexes before now, since it works by checking if the page LSN has changed in the period after _bt_readpage read the page's items, but before _bt_killitems was called, and nbtree only set page LSNs when it WAL-logged a page modification. We now set a fake LSN whenever a page of an index that isn't WAL-logged is modified, so the same LSN trick works with unlogged indexes, bringing the same benefits to these scans that commit 2ed5b87f brought to scans of logged relations. This is preparation for an upcoming commit that will add the amgetbatch interface and switch nbtree over to it (from amgettuple). That will go further by completely obviating the need for amgetbatch scans to hang on to buffer pins (barring scans involving a non-MVCC snapshot). 
Author: Peter Geoghegan Discussion: https://postgr.es/m/CAH2-WzkehuhxyuA8quc7rRN3EtNXpiKsjPfO8mhb+0Dr2K0Dtg@mail.gmail.com --- src/backend/access/nbtree/README | 5 +- src/backend/access/nbtree/nbtdedup.c | 8 ++- src/backend/access/nbtree/nbtinsert.c | 48 +++++++++------- src/backend/access/nbtree/nbtpage.c | 82 +++++++++++++++------------ src/backend/access/nbtree/nbtree.c | 8 --- src/backend/access/nbtree/nbtsearch.c | 1 - src/backend/access/nbtree/nbtutils.c | 1 - 7 files changed, 80 insertions(+), 73 deletions(-) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 53d4a61dc..cb921ca2e 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -485,9 +485,8 @@ We handle this kill_prior_tuple race condition by having affected index scans conservatively assume that any change to the leaf page at all implies that it was reached by btbulkdelete in the interim period when no buffer pin was held. This is implemented by not setting any LP_DEAD bits -on the leaf page at all when the page's LSN has changed. (That won't work -with an unlogged index, so for now we don't ever apply the "don't hold -onto pin" optimization there.) +on the leaf page at all when the page's LSN has changed. (This is why we +implement "fake" LSNs for unlogged index relations.) 
Fastpath For Index Insertion ---------------------------- diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index 95be0b179..af7affdf4 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -69,6 +69,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0; bool singlevalstrat = false; int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + XLogRecPtr recptr; /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ newitemsz += sizeof(ItemIdData); @@ -245,7 +246,6 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, /* XLOG stuff */ if (RelationNeedsWAL(rel)) { - XLogRecPtr recptr; xl_btree_dedup xlrec_dedup; xlrec_dedup.nintervals = state->nintervals; @@ -263,9 +263,11 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, state->nintervals * sizeof(BTDedupInterval)); recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP); - - PageSetLSN(page, recptr); } + else + recptr = XLogGetFakeLSN(rel); + + PageSetLSN(page, recptr); END_CRIT_SECTION(); diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 65dfb8dd4..a9fc07db4 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -1126,6 +1126,7 @@ _bt_insertonpg(Relation rel, IndexTuple oposting = NULL; IndexTuple origitup = NULL; IndexTuple nposting = NULL; + XLogRecPtr recptr; page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); @@ -1323,7 +1324,6 @@ _bt_insertonpg(Relation rel, xl_btree_insert xlrec; xl_btree_metadata xlmeta; uint8 xlinfo; - XLogRecPtr recptr; uint16 upostingoff; xlrec.offnum = newitemoff; @@ -1396,14 +1396,16 @@ _bt_insertonpg(Relation rel, } recptr = XLogInsert(RM_BTREE_ID, xlinfo); - - if (BufferIsValid(metabuf)) - PageSetLSN(metapg, recptr); - if (!isleaf) - PageSetLSN(BufferGetPage(cbuf), 
recptr); - - PageSetLSN(page, recptr); } + else + recptr = XLogGetFakeLSN(rel); + + if (BufferIsValid(metabuf)) + PageSetLSN(metapg, recptr); + if (!isleaf) + PageSetLSN(BufferGetPage(cbuf), recptr); + + PageSetLSN(page, recptr); END_CRIT_SECTION(); @@ -1505,6 +1507,7 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, bool newitemonleft, isleaf, isrightmost; + XLogRecPtr recptr; /* * origpage is the original page to be split. leftpage is a temporary @@ -1984,7 +1987,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, { xl_btree_split xlrec; uint8 xlinfo; - XLogRecPtr recptr; xlrec.level = ropaque->btpo_level; /* See comments below on newitem, orignewitem, and posting lists */ @@ -2068,14 +2070,16 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; recptr = XLogInsert(RM_BTREE_ID, xlinfo); - - PageSetLSN(origpage, recptr); - PageSetLSN(rightpage, recptr); - if (!isrightmost) - PageSetLSN(spage, recptr); - if (!isleaf) - PageSetLSN(BufferGetPage(cbuf), recptr); } + else + recptr = XLogGetFakeLSN(rel); + + PageSetLSN(origpage, recptr); + PageSetLSN(rightpage, recptr); + if (!isrightmost) + PageSetLSN(spage, recptr); + if (!isleaf) + PageSetLSN(BufferGetPage(cbuf), recptr); END_CRIT_SECTION(); @@ -2493,6 +2497,7 @@ _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf) Buffer metabuf; Page metapg; BTMetaPageData *metad; + XLogRecPtr recptr; lbkno = BufferGetBlockNumber(lbuf); rbkno = BufferGetBlockNumber(rbuf); @@ -2588,7 +2593,6 @@ _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf) if (RelationNeedsWAL(rel)) { xl_btree_newroot xlrec; - XLogRecPtr recptr; xl_btree_metadata md; xlrec.rootblk = rootblknum; @@ -2622,11 +2626,13 @@ _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf) ((PageHeader) rootpage)->pd_upper); recptr = XLogInsert(RM_BTREE_ID, 
XLOG_BTREE_NEWROOT); - - PageSetLSN(lpage, recptr); - PageSetLSN(rootpage, recptr); - PageSetLSN(metapg, recptr); } + else + recptr = XLogGetFakeLSN(rel); + + PageSetLSN(lpage, recptr); + PageSetLSN(rootpage, recptr); + PageSetLSN(metapg, recptr); END_CRIT_SECTION(); diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 9aa78068a..cc9c45dc4 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -235,6 +235,7 @@ _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages) Buffer metabuf; Page metapg; BTMetaPageData *metad; + XLogRecPtr recptr; /* * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage @@ -286,7 +287,6 @@ _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages) if (RelationNeedsWAL(rel)) { xl_btree_metadata md; - XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); @@ -303,9 +303,11 @@ _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages) XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata)); recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP); - - PageSetLSN(metapg, recptr); } + else + recptr = XLogGetFakeLSN(rel); + + PageSetLSN(metapg, recptr); END_CRIT_SECTION(); @@ -351,6 +353,7 @@ _bt_getroot(Relation rel, Relation heaprel, int access) BlockNumber rootblkno; uint32 rootlevel; BTMetaPageData *metad; + XLogRecPtr recptr; Assert(access == BT_READ || heaprel != NULL); @@ -473,7 +476,6 @@ _bt_getroot(Relation rel, Relation heaprel, int access) if (RelationNeedsWAL(rel)) { xl_btree_newroot xlrec; - XLogRecPtr recptr; xl_btree_metadata md; XLogBeginInsert(); @@ -497,10 +499,12 @@ _bt_getroot(Relation rel, Relation heaprel, int access) XLogRegisterData(&xlrec, SizeOfBtreeNewroot); recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT); - - PageSetLSN(rootpage, recptr); - PageSetLSN(metapg, recptr); } + else + recptr = XLogGetFakeLSN(rel); + + PageSetLSN(rootpage, recptr); + 
PageSetLSN(metapg, recptr); END_CRIT_SECTION(); @@ -1162,6 +1166,7 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, char *updatedbuf = NULL; Size updatedbuflen = 0; OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; + XLogRecPtr recptr; /* Shouldn't be called unless there's something to do */ Assert(ndeletable > 0 || nupdatable > 0); @@ -1226,7 +1231,6 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, /* XLOG stuff */ if (needswal) { - XLogRecPtr recptr; xl_btree_vacuum xlrec_vacuum; xlrec_vacuum.ndeleted = ndeletable; @@ -1248,9 +1252,11 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, } recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); - - PageSetLSN(page, recptr); } + else + recptr = XLogGetFakeLSN(rel); + + PageSetLSN(page, recptr); END_CRIT_SECTION(); @@ -1292,6 +1298,7 @@ _bt_delitems_delete(Relation rel, Buffer buf, char *updatedbuf = NULL; Size updatedbuflen = 0; OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; + XLogRecPtr recptr; /* Shouldn't be called unless there's something to do */ Assert(ndeletable > 0 || nupdatable > 0); @@ -1342,7 +1349,6 @@ _bt_delitems_delete(Relation rel, Buffer buf, /* XLOG stuff */ if (needswal) { - XLogRecPtr recptr; xl_btree_delete xlrec_delete; xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon; @@ -1366,9 +1372,11 @@ _bt_delitems_delete(Relation rel, Buffer buf, } recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE); - - PageSetLSN(page, recptr); } + else + recptr = XLogGetFakeLSN(rel); + + PageSetLSN(page, recptr); END_CRIT_SECTION(); @@ -2103,6 +2111,7 @@ _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, OffsetNumber nextoffset; IndexTuple itup; IndexTupleData trunctuple; + XLogRecPtr recptr; page = BufferGetPage(leafbuf); opaque = BTPageGetOpaque(page); @@ -2253,7 +2262,6 @@ _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, if (RelationNeedsWAL(rel)) { xl_btree_mark_page_halfdead xlrec; - XLogRecPtr recptr; xlrec.poffset = poffset; xlrec.leafblk = leafblkno; @@ 
-2274,12 +2282,14 @@ _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, XLogRegisterData(&xlrec, SizeOfBtreeMarkPageHalfDead); recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD); - - page = BufferGetPage(subtreeparent); - PageSetLSN(page, recptr); - page = BufferGetPage(leafbuf); - PageSetLSN(page, recptr); } + else + recptr = XLogGetFakeLSN(rel); + + page = BufferGetPage(subtreeparent); + PageSetLSN(page, recptr); + page = BufferGetPage(leafbuf); + PageSetLSN(page, recptr); END_CRIT_SECTION(); @@ -2337,6 +2347,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, uint32 targetlevel; IndexTuple leafhikey; BlockNumber leaftopparent; + XLogRecPtr recptr; page = BufferGetPage(leafbuf); opaque = BTPageGetOpaque(page); @@ -2676,7 +2687,6 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, xl_btree_unlink_page xlrec; xl_btree_metadata xlmeta; uint8 xlinfo; - XLogRecPtr recptr; XLogBeginInsert(); @@ -2720,25 +2730,25 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, xlinfo = XLOG_BTREE_UNLINK_PAGE; recptr = XLogInsert(RM_BTREE_ID, xlinfo); + } + else + recptr = XLogGetFakeLSN(rel); - if (BufferIsValid(metabuf)) - { - PageSetLSN(metapg, recptr); - } - page = BufferGetPage(rbuf); + if (BufferIsValid(metabuf)) + PageSetLSN(metapg, recptr); + page = BufferGetPage(rbuf); + PageSetLSN(page, recptr); + page = BufferGetPage(buf); + PageSetLSN(page, recptr); + if (BufferIsValid(lbuf)) + { + page = BufferGetPage(lbuf); PageSetLSN(page, recptr); - page = BufferGetPage(buf); + } + if (target != leafblkno) + { + page = BufferGetPage(leafbuf); PageSetLSN(page, recptr); - if (BufferIsValid(lbuf)) - { - page = BufferGetPage(lbuf); - PageSetLSN(page, recptr); - } - if (target != leafblkno) - { - page = BufferGetPage(leafbuf); - PageSetLSN(page, recptr); - } } END_CRIT_SECTION(); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 
6d0a6f27f..0da48b42a 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -407,13 +407,6 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, * race condition involving VACUUM setting pages all-visible in the VM. * It's also unsafe for plain index scans that use a non-MVCC snapshot. * - * When we drop pins eagerly, the mechanism that marks so->killedItems[] - * index tuples LP_DEAD has to deal with concurrent TID recycling races. - * The scheme used to detect unsafe TID recycling won't work when scanning - * unlogged relations (since it involves saving an affected page's LSN). - * Opt out of eager pin dropping during unlogged relation scans for now - * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting). - * * Also opt out of dropping leaf page pins eagerly during bitmap scans. * Pins cannot be held for more than an instant during bitmap scans either * way, so we might as well avoid wasting cycles on acquiring page LSNs. @@ -424,7 +417,6 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, */ so->dropPin = (!scan->xs_want_itup && IsMVCCSnapshot(scan->xs_snapshot) && - RelationNeedsWAL(scan->indexRelation) && scan->heapRelation != NULL); so->markItemIndex = -1; diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index bfaaf1f01..9be374e6d 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -67,7 +67,6 @@ _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so) * Have to set so->currPos.lsn so that _bt_killitems has a way to detect * when concurrent heap TID recycling by VACUUM might have taken place. 
*/ - Assert(RelationNeedsWAL(rel)); so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); _bt_relbuf(rel, so->currPos.buf); so->currPos.buf = InvalidBuffer; diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 1b9f2aa10..7a46d0249 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -236,7 +236,6 @@ _bt_killitems(IndexScanDesc scan) XLogRecPtr latestlsn; Assert(!BTScanPosIsPinned(so->currPos)); - Assert(RelationNeedsWAL(rel)); buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); latestlsn = BufferGetLSNAtomic(buf); -- 2.53.0