From 0474c4028e644fa99fa4dc1cf2479220782e7a30 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 20 Mar 2025 23:12:25 +0100 Subject: [PATCH v10 4/5] NBTree: Reduce Index-Only Scan pinning requirements Previously, we would keep a pin on every leaf page while we were returning tuples to the scan. With this patch, we utilize the newly introduced table_index_vischeck_tuples API to pre-check visibility of all TIDs, and thus unpin the page well ahead of when we'd usually be ready with returning and processing all index tuple results. This reduces the time VACUUM may have to wait for a pin, and can increase performance with reduced redundant VM checks. --- src/include/access/nbtree.h | 4 ++ src/backend/access/nbtree/nbtree.c | 5 ++ src/backend/access/nbtree/nbtsearch.c | 100 ++++++++++++++++++++++++-- 3 files changed, 103 insertions(+), 6 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 0c43767f8c3..2423ddf7bfd 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -951,6 +951,7 @@ typedef struct BTScanPosItem /* what we remember about each match */ ItemPointerData heapTid; /* TID of referenced heap item */ OffsetNumber indexOffset; /* index item's location within page */ LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */ + uint8 visrecheck; /* visibility recheck status, if any */ } BTScanPosItem; typedef struct BTScanPosData @@ -1053,6 +1054,9 @@ typedef struct BTScanOpaqueData int *killedItems; /* currPos.items indexes of killed items */ int numKilled; /* number of currently stored items */ + /* buffer used for index-only scan visibility checks */ + Buffer vmbuf; + /* * If we are doing an index-only scan, these are the tuple storage * workspaces for the currPos and markPos respectively. Each is of size diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index c0a8833e068..e7b7f7cfa4c 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -345,6 +345,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->killedItems = NULL; /* until needed */ so->numKilled = 0; + so->vmbuf = InvalidBuffer; + /* * We don't know yet whether the scan will be index-only, so we do not * allocate the tuple workspace arrays until btrescan. However, we set up @@ -436,6 +438,9 @@ btendscan(IndexScanDesc scan) so->markItemIndex = -1; BTScanPosUnpinIfPinned(so->markPos); + if (BufferIsValid(so->vmbuf)) + ReleaseBuffer(so->vmbuf); + /* No need to invalidate positions, the RAM is about to be freed. */ /* Release storage */ diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 22b27d01d00..7445ee9daeb 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -25,7 +25,7 @@ #include "utils/rel.h" -static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); +static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp, Buffer *vmbuf); static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access); @@ -64,13 +64,88 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); * See nbtree/README section on making concurrent TID recycling safe. */ static void -_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) +_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp, Buffer *vmbuf) { _bt_unlockbuf(scan->indexRelation, sp->buf); + /* + * Do some visibility checks if this is an index-only scan; allowing us to + * drop the pin on this page before we have returned all tuples from this + * IOS to the executor. + */ + if (scan->xs_want_itup) + { + TM_IndexVisibilityCheckOp visCheck; + int offset = sp->firstItem; + + visCheck.nchecktids = 1 + sp->lastItem - offset; + visCheck.checktids = palloc_array(TM_VisCheck, + visCheck.nchecktids); + visCheck.vmbuf = vmbuf; + + for (int i = 0; i < visCheck.nchecktids; i++) + { + int itemidx = offset + i; + + Assert(sp->items[itemidx].visrecheck == TMVC_Unchecked); + Assert(ItemPointerIsValid(&sp->items[itemidx].heapTid)); + + visCheck.checktids[i].tid = sp->items[itemidx].heapTid; + visCheck.checktids[i].idxoffnum = itemidx; + visCheck.checktids[i].vischeckresult = TMVC_Unchecked; + } + + table_index_vischeck_tuples(scan->heapRelation, &visCheck); + + for (int i = 0; i < visCheck.nchecktids; i++) + { + TM_VisCheck *check = &visCheck.checktids[i]; + BTScanPosItem *item = &sp->items[check->idxoffnum]; + + /* We must have a valid visibility check result */ + Assert(check->vischeckresult != TMVC_Unchecked); + /* The offset number should still indicate the right item */ + Assert(ItemPointerEquals(&check->tid, &item->heapTid)); + + /* Store the visibility check result */ + item->visrecheck = check->vischeckresult; + } + + /* release temporary resources */ + pfree(visCheck.checktids); + } + + /* + * We may need to hold a pin on the page for one of several reasons: + * + * 1.) To safely apply kill_prior_tuple, we need to know that the tuples + * were not removed from the page (and subsequently re-inserted). + * A page's LSN can also allow us to detect modifications on the page, + * which then allows us to bail out of setting the hint bits, but that + * requires the index to be WAL-logged; so unless the index is WAL-logged + * we must hold a pin on the page to apply the kill_prior_tuple + * optimization. + * + * 2.) Non-MVCC scans need pin coupling to make sure the scan covers + * exactly the whole index keyspace. + * + * 3.) For Index-Only Scans, the scan needs to check the visibility of the + * table tuple while the relevant index tuple is guaranteed to still be + * contained in the index (so that vacuum hasn't yet marked any pages that + * could contain the value as ALL_VISIBLE after reclaiming a dead tuple + * that might be buffered in the scan). A pin must therefore be held + * at least while the basic visibility of the page's tuples is being + * checked. + * + * For cases 1 and 2, we must hold the pin after we've finished processing + * the index page. + * + * For case 3, we can release the pin if we first do the visibility checks + * of to-be-returned tuples using table_index_vischeck_tuples, which we've + * done just above. + */ if (IsMVCCSnapshot(scan->xs_snapshot) && - RelationNeedsWAL(scan->indexRelation) && - !scan->xs_want_itup) + RelationNeedsWAL(scan->indexRelation)) { ReleaseBuffer(sp->buf); sp->buf = InvalidBuffer; @@ -1904,6 +1979,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; + currItem->visrecheck = TMVC_Unchecked; + if (so->currTuples) { Size itupsz = IndexTupleSize(itup); @@ -1934,6 +2011,8 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, currItem->heapTid = *heapTid; currItem->indexOffset = offnum; + currItem->visrecheck = TMVC_Unchecked; + if (so->currTuples) { /* Save base IndexTuple (truncate posting list) */ @@ -1970,6 +2049,7 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, currItem->heapTid = *heapTid; currItem->indexOffset = offnum; + currItem->visrecheck = TMVC_Unchecked; /* * Have index-only scans return the same base IndexTuple for every TID @@ -1995,6 +2075,14 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so) /* Return next item, per amgettuple contract */ scan->xs_heaptid = currItem->heapTid; + + if (scan->xs_want_itup) + { + scan->xs_visrecheck = currItem->visrecheck; + Assert(currItem->visrecheck != TMVC_Unchecked || + BufferIsValid(so->currPos.buf)); + } + if (so->currTuples) scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); } @@ -2153,7 +2241,7 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) * so->currPos.buf in preparation for btgettuple returning tuples. */ Assert(BTScanPosIsPinned(so->currPos)); - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + _bt_drop_lock_and_maybe_pin(scan, &so->currPos, &so->vmbuf); return true; } @@ -2310,7 +2398,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, */ Assert(so->currPos.currPage == blkno); Assert(BTScanPosIsPinned(so->currPos)); - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + _bt_drop_lock_and_maybe_pin(scan, &so->currPos, &so->vmbuf); return true; } -- 2.45.2