From c8aec56cfadcffac78a61568cf5903426c82b97d Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 9 Jun 2025 09:57:30 -0400 Subject: [PATCH v2] Fix issue with mark/restore nbtree pins. --- src/include/access/nbtree.h | 5 -- src/backend/access/nbtree/nbtree.c | 75 +++++++++++++++++---------- src/backend/access/nbtree/nbtsearch.c | 72 ++++++++++++++++++------- src/backend/access/nbtree/nbtutils.c | 25 +++++---- 4 files changed, 115 insertions(+), 62 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index e709d2e0a..fe0b805d4 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1012,11 +1012,6 @@ typedef BTScanPosData *BTScanPos; ReleaseBuffer((scanpos).buf); \ (scanpos).buf = InvalidBuffer; \ } while (0) -#define BTScanPosUnpinIfPinned(scanpos) \ - do { \ - if (BTScanPosIsPinned(scanpos)) \ - BTScanPosUnpin(scanpos); \ - } while (0) #define BTScanPosIsValid(scanpos) \ ( \ diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 03a1d7b02..04dab0703 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -387,16 +387,6 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* we aren't holding any read locks, but gotta drop the pins */ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - BTScanPosInvalidate(so->currPos); - } - /* * We prefer to eagerly drop leaf page pins before btgettuple returns. * This avoids making VACUUM wait to acquire a cleanup lock on the page. @@ -416,6 +406,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, * Pins cannot be held for more than an instant during bitmap scans either * way, so we might as well avoid wasting cycles on acquiring page LSNs. * + * Note: so->dropPin should never change across rescans. + * * See nbtree/README section on making concurrent TID recycling safe. */ so->dropPin = (!scan->xs_want_itup && @@ -423,12 +415,27 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, RelationNeedsWAL(scan->indexRelation) && scan->heapRelation != NULL); - so->markItemIndex = -1; + /* we aren't holding any read locks, but gotta drop the pins */ + if (BTScanPosIsValid(so->currPos)) + { + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + if (!so->dropPin) + BTScanPosUnpin(so->currPos); + BTScanPosInvalidate(so->currPos); + } + so->needPrimScan = false; so->scanBehind = false; so->oppositeDirCheck = false; - BTScanPosUnpinIfPinned(so->markPos); - BTScanPosInvalidate(so->markPos); + so->markItemIndex = -1; + if (BTScanPosIsValid(so->markPos)) + { + if (!so->dropPin) + BTScanPosUnpin(so->markPos); + BTScanPosInvalidate(so->markPos); + } /* * Allocate tuple workspace arrays, if needed for an index-only scan and @@ -475,11 +482,16 @@ btendscan(IndexScanDesc scan) /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); + if (!so->dropPin) + BTScanPosUnpin(so->currPos); } so->markItemIndex = -1; - BTScanPosUnpinIfPinned(so->markPos); + if (BTScanPosIsValid(so->markPos)) + { + if (!so->dropPin) + BTScanPosUnpin(so->markPos); + } /* No need to invalidate positions, the RAM is about to be freed. */ @@ -505,8 +517,14 @@ btmarkpos(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* There may be an old mark with a pin (but no lock). */ - BTScanPosUnpinIfPinned(so->markPos); + /* Always invalidate any existing mark */ + if (BTScanPosIsValid(so->markPos)) + { + if (!so->dropPin) + BTScanPosUnpin(so->markPos); + BTScanPosInvalidate(so->markPos); + } + so->markItemIndex = -1; /* * Just record the current itemIndex. If we later step to next page @@ -516,11 +534,6 @@ btmarkpos(IndexScanDesc scan) */ if (BTScanPosIsValid(so->currPos)) so->markItemIndex = so->currPos.itemIndex; - else - { - BTScanPosInvalidate(so->markPos); - so->markItemIndex = -1; - } } /* @@ -555,14 +568,13 @@ btrestrpos(IndexScanDesc scan) /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); + if (!so->dropPin) + BTScanPosUnpin(so->currPos); + BTScanPosInvalidate(so->currPos); } if (BTScanPosIsValid(so->markPos)) { - /* bump pin on mark buffer for assignment to current buffer */ - if (BTScanPosIsPinned(so->markPos)) - IncrBufferRefCount(so->markPos.buf); memcpy(&so->currPos, &so->markPos, offsetof(BTScanPosData, items[1]) + so->markPos.lastItem * sizeof(BTScanPosItem)); @@ -575,9 +587,16 @@ btrestrpos(IndexScanDesc scan) _bt_start_array_keys(scan, so->currPos.dir); so->needPrimScan = false; } + + /* + * We need to keep the mark around, in case we need to restore it + * again. But we don't need to use the so->markPos representation + * (the so->markItemIndex representation suffices, now that the + * page that so->currPos references is this mark's page). + */ + so->markItemIndex = so->currPos.itemIndex; + BTScanPosInvalidate(so->markPos); } - else - BTScanPosInvalidate(so->currPos); } } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 070f14c8b..d443b1abd 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -2109,6 +2109,7 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so) * _bt_steppage() -- Step to next page containing valid data for scan * * Wrapper on _bt_readnextpage that performs final steps for the current page. + * Only called when _bt_drop_lock_and_maybe_pin was called for so->currPos. * * On entry, so->currPos must be valid. Its buffer will be pinned, though * never locked. (Actually, when so->dropPin there won't even be a pin held, @@ -2122,6 +2123,10 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) lastcurrblkno; Assert(BTScanPosIsValid(so->currPos)); + if (!so->dropPin) + Assert(BTScanPosIsPinned(so->currPos)); + else + Assert(!BTScanPosIsPinned(so->currPos)); /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) @@ -2133,9 +2138,6 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) */ if (so->markItemIndex >= 0) { - /* bump pin on current buffer for assignment to mark buffer */ - if (BTScanPosIsPinned(so->currPos)) - IncrBufferRefCount(so->currPos.buf); memcpy(&so->markPos, &so->currPos, offsetof(BTScanPosData, items[1]) + so->currPos.lastItem * sizeof(BTScanPosItem)); @@ -2145,6 +2147,16 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) so->markPos.itemIndex = so->markItemIndex; so->markItemIndex = -1; + /* + * If there's a pin held on so->currPos, we transfer the pin over to + * so->markPos (without actually releasing it) + */ + Assert(BTScanPosIsValid(so->markPos)); + if (!so->dropPin) + Assert(BTScanPosIsPinned(so->markPos)); + else + Assert(!BTScanPosIsPinned(so->markPos)); + /* * If we're just about to start the next primitive index scan * (possible with a scan that has arrays keys, and needs to skip to @@ -2172,15 +2184,14 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) /* mark/restore not supported by parallel scans */ Assert(!scan->parallel_scan); } - - BTScanPosUnpinIfPinned(so->currPos); - - /* Walk to the next page with data */ - if (ScanDirectionIsForward(dir)) - blkno = so->currPos.nextPage; - else - blkno = so->currPos.prevPage; - lastcurrblkno = so->currPos.currPage; + else if (!so->dropPin) + { + /* + * Not saving so->currPos position in so->markPos, so release the + * position's pin for ourselves + */ + BTScanPosUnpin(so->currPos); + } /* * Cancel primitive index scans that were scheduled when the call to @@ -2191,6 +2202,19 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) if (so->currPos.dir != dir) so->needPrimScan = false; + /* + * so->currPos no longer holds a buffer pin (independent of whether we + * released the pin, or just transferred it over to so->markPos) + */ + so->currPos.buf = InvalidBuffer; + + /* Walk to the next page with data */ + if (ScanDirectionIsForward(dir)) + blkno = so->currPos.nextPage; + else + blkno = so->currPos.prevPage; + lastcurrblkno = so->currPos.currPage; + return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false); } @@ -2220,7 +2244,10 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; + BlockNumber blkno, + lastcurrblkno; so->numKilled = 0; /* just paranoia */ so->markItemIndex = -1; /* ditto */ @@ -2253,8 +2280,6 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) */ if (_bt_readpage(scan, dir, offnum, true)) { - Relation rel = scan->indexRelation; - /* * _bt_readpage succeeded. Drop the lock (and maybe the pin) on * so->currPos.buf in preparation for btgettuple returning tuples. @@ -2265,14 +2290,20 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) } /* There's no actually-matching data on the page in so->currPos.buf */ - _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + Assert(so->numKilled == 0); + Assert(so->markItemIndex < 0); - /* Call _bt_readnextpage using its _bt_steppage wrapper function */ - if (!_bt_steppage(scan, dir)) - return false; + _bt_relbuf(rel, so->currPos.buf); + so->currPos.buf = InvalidBuffer; - /* _bt_readpage for a later page (now in so->currPos) succeeded */ - return true; + /* Walk to the next page with data */ + if (ScanDirectionIsForward(dir)) + blkno = so->currPos.nextPage; + else + blkno = so->currPos.prevPage; + lastcurrblkno = so->currPos.currPage; + + return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false); } /* @@ -2408,6 +2439,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, /* no matching tuples on this page */ _bt_relbuf(rel, so->currPos.buf); + so->currPos.buf = InvalidBuffer; seized = false; /* released by _bt_readpage (or by us) */ } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 29f0dca1b..731916765 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -3323,9 +3323,9 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, * current page and killed tuples thereon (generally, this should only be * called if so->numKilled > 0). * - * The caller does not have a lock on the page and may or may not have the - * page pinned in a buffer. Note that read-lock is sufficient for setting - * LP_DEAD status (which is only a hint). + * Caller should not have a lock on the so->currPos page, but must hold a + * buffer pin when !so->dropPin. When we return, it still won't be locked. + * It'll continue to hold the same pins that were held when we were called. * * We match items by heap TID before assuming they are the right ones to * delete. We cope with cases where items have moved right due to insertions. @@ -3353,6 +3353,7 @@ _bt_killitems(IndexScanDesc scan) OffsetNumber maxoff; int numKilled = so->numKilled; bool killedsomething = false; + Buffer buf; Assert(numKilled > 0); Assert(BTScanPosIsValid(so->currPos)); @@ -3369,11 +3370,11 @@ _bt_killitems(IndexScanDesc scan) * concurrent VACUUMs from recycling any of the TIDs on the page. */ Assert(BTScanPosIsPinned(so->currPos)); - _bt_lockbuf(rel, so->currPos.buf, BT_READ); + buf = so->currPos.buf; + _bt_lockbuf(rel, buf, BT_READ); } else { - Buffer buf; XLogRecPtr latestlsn; Assert(!BTScanPosIsPinned(so->currPos)); @@ -3391,10 +3392,9 @@ _bt_killitems(IndexScanDesc scan) } /* Unmodified, hinting is safe */ - so->currPos.buf = buf; } - page = BufferGetPage(so->currPos.buf); + page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); @@ -3511,10 +3511,17 @@ _bt_killitems(IndexScanDesc scan) if (killedsomething) { opaque->btpo_flags |= BTP_HAS_GARBAGE; - MarkBufferDirtyHint(so->currPos.buf, true); + MarkBufferDirtyHint(buf, true); } - _bt_unlockbuf(rel, so->currPos.buf); + /* + * so->currPos retains no locks, but does retain whatever pins were held + * just before we were called + */ + if (!so->dropPin) + _bt_unlockbuf(rel, buf); + else + _bt_relbuf(rel, buf); } -- 2.49.0