From 6b495c6808f3bdfd13a5ac34951850643edc4456 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 25 Nov 2025 18:03:15 -0500 Subject: [PATCH v1 4/4] Add amgetbatch support to hash index AM. This patch should be considered a work in progress. It has only been lightly tested, and it's not clear if I've accounted for all of the intricacies with bucket splits (and with pins that are held by the scan's opaque state more generally). This automatically switches hash indexes over to using the dropPin optimization, since that is standard when using the new amgetbatch interface. This won't bring hash index scans benefits similar to those that nbtree index scans gained when commit 2ed5b87f went in. Hash index vacuuming acquires a cleanup lock on bucket pages, but conflicting pins on bucket pages are still held for the full duration of each index scan. However, there is still independent value in avoiding holding on to buffer pins during index scans: index prefetching tends to hold open as many as several dozen batches with certain workloads (workloads where the stream position has to get quite far ahead of the read position in order to maintain the appropriate prefetch distance on the heapam side). Guaranteeing that open batches won't hold buffer pins on index pages (at least during plain index scans that use an MVCC snapshot, per the rules established by commit 2ed5b87f) is likely to make life easier when the resource management rules for I/O prefetching are fully ironed-out. Note that the code path in _hash_kill_items that calls _hash_getbuf (though only with the LH_OVERFLOW_PAGE flag/overflow pages) is dead code/vestigial on master [1]. However, it's now useful again, since _hash_kill_items might now need to relock and repin an overflow page (not just relock it) during dropPin scans. 
[1] https://postgr.es/m/CAH2-Wz=8mefy8QUcsnKLTePuy4tE8pdO+gSRQ9yQwUHoaeFTFw@mail.gmail.com Author: Peter Geoghegan --- src/include/access/hash.h | 73 +------ src/backend/access/hash/hash.c | 126 +++++------- src/backend/access/hash/hashpage.c | 19 +- src/backend/access/hash/hashsearch.c | 282 ++++++++++++--------------- src/backend/access/hash/hashutil.c | 98 +++++----- src/tools/pgindent/typedefs.list | 2 - 6 files changed, 240 insertions(+), 360 deletions(-) diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 839c34312..03f8bc9b3 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -100,58 +100,6 @@ typedef HashPageOpaqueData *HashPageOpaque; */ #define HASHO_PAGE_ID 0xFF80 -typedef struct HashScanPosItem /* what we remember about each match */ -{ - ItemPointerData heapTid; /* TID of referenced heap item */ - OffsetNumber indexOffset; /* index item's location within page */ -} HashScanPosItem; - -typedef struct HashScanPosData -{ - Buffer buf; /* if valid, the buffer is pinned */ - BlockNumber currPage; /* current hash index page */ - BlockNumber nextPage; /* next overflow page */ - BlockNumber prevPage; /* prev overflow or bucket page */ - - /* - * The items array is always ordered in index order (ie, increasing - * indexoffset). When scanning backwards it is convenient to fill the - * array back-to-front, so we start at the last slot and fill downwards. - * Hence we need both a first-valid-entry and a last-valid-entry counter. - * itemIndex is a cursor showing which entry was last returned to caller. 
- */ - int firstItem; /* first valid index in items[] */ - int lastItem; /* last valid index in items[] */ - int itemIndex; /* current index in items[] */ - - HashScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ -} HashScanPosData; - -#define HashScanPosIsPinned(scanpos) \ -( \ - AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ - !BufferIsValid((scanpos).buf)), \ - BufferIsValid((scanpos).buf) \ -) - -#define HashScanPosIsValid(scanpos) \ -( \ - AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ - !BufferIsValid((scanpos).buf)), \ - BlockNumberIsValid((scanpos).currPage) \ -) - -#define HashScanPosInvalidate(scanpos) \ - do { \ - (scanpos).buf = InvalidBuffer; \ - (scanpos).currPage = InvalidBlockNumber; \ - (scanpos).nextPage = InvalidBlockNumber; \ - (scanpos).prevPage = InvalidBlockNumber; \ - (scanpos).firstItem = 0; \ - (scanpos).lastItem = 0; \ - (scanpos).itemIndex = 0; \ - } while (0) - /* * HashScanOpaqueData is private state for a hash index scan. */ @@ -178,15 +126,6 @@ typedef struct HashScanOpaqueData * referred only when hashso_buc_populated is true. 
*/ bool hashso_buc_split; - /* info about killed items if any (killedItems is NULL if never used) */ - int *killedItems; /* currPos.items indexes of killed items */ - int numKilled; /* number of currently stored items */ - - /* - * Identify all the matching items on a page and save them in - * HashScanPosData - */ - HashScanPosData currPos; /* current position data */ } HashScanOpaqueData; typedef HashScanOpaqueData *HashScanOpaque; @@ -368,11 +307,14 @@ extern bool hashinsert(Relation rel, Datum *values, bool *isnull, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); -extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir); +extern BatchIndexScan hashgetbatch(IndexScanDesc scan, + BatchIndexScan priorbatch, + ScanDirection dir); extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys); extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys); +extern void hashfreebatch(IndexScanDesc scan, BatchIndexScan batch); extern void hashendscan(IndexScanDesc scan); extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, @@ -445,8 +387,9 @@ extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, uint32 lowmask); /* hashsearch.c */ -extern bool _hash_next(IndexScanDesc scan, ScanDirection dir); -extern bool _hash_first(IndexScanDesc scan, ScanDirection dir); +extern BatchIndexScan _hash_next(IndexScanDesc scan, ScanDirection dir, + BatchIndexScan priorbatch); +extern BatchIndexScan _hash_first(IndexScanDesc scan, ScanDirection dir); /* hashsort.c */ typedef struct HSpool HSpool; /* opaque struct in hashsort.c */ @@ -476,7 +419,7 @@ extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bu extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket); extern Bucket 
_hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, uint32 lowmask, uint32 maxbucket); -extern void _hash_kill_items(IndexScanDesc scan); +extern void _hash_kill_items(IndexScanDesc scan, BatchIndexScan batch); /* hash.c */ extern void hashbucketcleanup(Relation rel, Bucket cur_bucket, diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index b4b460724..2f796aeac 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -101,9 +101,9 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->amadjustmembers = hashadjustmembers; amroutine->ambeginscan = hashbeginscan; amroutine->amrescan = hashrescan; - amroutine->amgettuple = hashgettuple; - amroutine->amgetbatch = NULL; - amroutine->amfreebatch = NULL; + amroutine->amgettuple = NULL; + amroutine->amgetbatch = hashgetbatch; + amroutine->amfreebatch = hashfreebatch; amroutine->amgetbitmap = hashgetbitmap; amroutine->amendscan = hashendscan; amroutine->amposreset = NULL; @@ -285,54 +285,22 @@ hashinsert(Relation rel, Datum *values, bool *isnull, /* - * hashgettuple() -- Get the next tuple in the scan. + * hashgetbatch() -- Get the first or next batch of tuples in the scan */ -bool -hashgettuple(IndexScanDesc scan, ScanDirection dir) +BatchIndexScan +hashgetbatch(IndexScanDesc scan, BatchIndexScan priorbatch, ScanDirection dir) { - HashScanOpaque so = (HashScanOpaque) scan->opaque; - bool res; - /* Hash indexes are always lossy since we store only the hash code */ scan->xs_recheck = true; - /* - * If we've already initialized this scan, we can just advance it in the - * appropriate direction. If we haven't done so yet, we call a routine to - * get the first item in the scan. - */ - if (!HashScanPosIsValid(so->currPos)) - res = _hash_first(scan, dir); - else + if (priorbatch == NULL) { - /* - * Check to see if we should kill the previously-fetched tuple. - */ - if (scan->kill_prior_tuple) - { - /* - * Yes, so remember it for later. 
(We'll deal with all such tuples - * at once right after leaving the index page or at end of scan.) - * In case if caller reverses the indexscan direction it is quite - * possible that the same item might get entered multiple times. - * But, we don't detect that; instead, we just forget any excess - * entries. - */ - if (so->killedItems == NULL) - so->killedItems = (int *) - palloc(MaxIndexTuplesPerPage * sizeof(int)); - - if (so->numKilled < MaxIndexTuplesPerPage) - so->killedItems[so->numKilled++] = so->currPos.itemIndex; - } - - /* - * Now continue the scan. - */ - res = _hash_next(scan, dir); + /* Initialize the scan, and return first batch of matching items */ + return _hash_first(scan, dir); } - return res; + /* Return batch positioned after caller's batch (in direction 'dir') */ + return _hash_next(scan, dir, priorbatch); } @@ -342,26 +310,23 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { - HashScanOpaque so = (HashScanOpaque) scan->opaque; - bool res; + BatchIndexScan batch; int64 ntids = 0; - HashScanPosItem *currItem; + int itemIndex; - res = _hash_first(scan, ForwardScanDirection); + batch = _hash_first(scan, ForwardScanDirection); - while (res) + while (batch != NULL) { - currItem = &so->currPos.items[so->currPos.itemIndex]; + for (itemIndex = batch->firstItem; + itemIndex <= batch->lastItem; + itemIndex++) + { + tbm_add_tuples(tbm, &(batch->items[itemIndex].heapTid), 1, true); + ntids++; + } - /* - * _hash_first and _hash_next handle eliminate dead index entries - * whenever scan->ignore_killed_tuples is true. Therefore, there's - * nothing to do here except add the results to the TIDBitmap. 
- */ - tbm_add_tuples(tbm, &(currItem->heapTid), 1, true); - ntids++; - - res = _hash_next(scan, ForwardScanDirection); + batch = _hash_next(scan, ForwardScanDirection, batch); } return ntids; @@ -383,16 +348,12 @@ hashbeginscan(Relation rel, int nkeys, int norderbys) scan = RelationGetIndexScan(rel, nkeys, norderbys); so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData)); - HashScanPosInvalidate(so->currPos); so->hashso_bucket_buf = InvalidBuffer; so->hashso_split_bucket_buf = InvalidBuffer; so->hashso_buc_populated = false; so->hashso_buc_split = false; - so->killedItems = NULL; - so->numKilled = 0; - scan->opaque = so; return scan; @@ -408,18 +369,8 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, HashScanOpaque so = (HashScanOpaque) scan->opaque; Relation rel = scan->indexRelation; - if (HashScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _hash_kill_items(scan); - } - _hash_dropscanbuf(rel, so); - /* set position invalid (this will cause _hash_first call) */ - HashScanPosInvalidate(so->currPos); - /* Update scan key, if a new one is given */ if (scankey && scan->numberOfKeys > 0) memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); @@ -428,6 +379,28 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, so->hashso_buc_split = false; } +/* + * hashfreebatch() -- Free batch resources, including its buffer pin + */ +void +hashfreebatch(IndexScanDesc scan, BatchIndexScan batch) +{ + /* + * Check if there are tuples to kill from this batch (that weren't already + * killed earlier on) + */ + if (batch->numKilled > 0) + _hash_kill_items(scan, batch); + + if (!scan->batchqueue->dropPin) + { + ReleaseBuffer(batch->buf); + batch->buf = InvalidBuffer; + } + + indexam_util_batch_release(scan, batch); +} + /* * hashendscan() -- close down a scan */ @@ -437,17 +410,8 @@ hashendscan(IndexScanDesc scan) HashScanOpaque so = (HashScanOpaque) 
scan->opaque; Relation rel = scan->indexRelation; - if (HashScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _hash_kill_items(scan); - } - _hash_dropscanbuf(rel, so); - if (so->killedItems != NULL) - pfree(so->killedItems); pfree(so); scan->opaque = NULL; } diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index b8e5bd005..b0b0c530c 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -280,31 +280,24 @@ _hash_dropbuf(Relation rel, Buffer buf) } /* - * _hash_dropscanbuf() -- release buffers used in scan. + * _hash_dropscanbuf() -- release buffers owned by scan. * - * This routine unpins the buffers used during scan on which we - * hold no lock. + * This routine unpins the buffers for the primary bucket page and for the + * bucket page of a bucket being split as needed. */ void _hash_dropscanbuf(Relation rel, HashScanOpaque so) { /* release pin we hold on primary bucket page */ - if (BufferIsValid(so->hashso_bucket_buf) && - so->hashso_bucket_buf != so->currPos.buf) + if (BufferIsValid(so->hashso_bucket_buf)) _hash_dropbuf(rel, so->hashso_bucket_buf); so->hashso_bucket_buf = InvalidBuffer; - /* release pin we hold on primary bucket page of bucket being split */ - if (BufferIsValid(so->hashso_split_bucket_buf) && - so->hashso_split_bucket_buf != so->currPos.buf) + /* release pin held on primary bucket page of bucket being split */ + if (BufferIsValid(so->hashso_split_bucket_buf)) _hash_dropbuf(rel, so->hashso_split_bucket_buf); so->hashso_split_bucket_buf = InvalidBuffer; - /* release any pin we still hold */ - if (BufferIsValid(so->currPos.buf)) - _hash_dropbuf(rel, so->currPos.buf); - so->currPos.buf = InvalidBuffer; - /* reset split scan */ so->hashso_buc_populated = false; so->hashso_buc_split = false; diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 92c15a65b..ecfcf801f 
100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -22,104 +22,75 @@ #include "utils/rel.h" static bool _hash_readpage(IndexScanDesc scan, Buffer *bufP, - ScanDirection dir); + ScanDirection dir, BatchIndexScan batch); static int _hash_load_qualified_items(IndexScanDesc scan, Page page, - OffsetNumber offnum, ScanDirection dir); -static inline void _hash_saveitem(HashScanOpaque so, int itemIndex, + OffsetNumber offnum, ScanDirection dir, + BatchIndexScan batch); +static inline void _hash_saveitem(BatchIndexScan batch, int itemIndex, OffsetNumber offnum, IndexTuple itup); static void _hash_readnext(IndexScanDesc scan, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep); /* - * _hash_next() -- Get the next item in a scan. + * _hash_next() -- Get the next batch of items in a scan. * - * On entry, so->currPos describes the current page, which may - * be pinned but not locked, and so->currPos.itemIndex identifies - * which item was previously returned. + * On entry, priorbatch describes the current page batch with items + * already returned. * - * On successful exit, scan->xs_heaptid is set to the TID of the next - * heap tuple. so->currPos is updated as needed. + * On successful exit, returns a new batch containing the next items from + * the next page (determined by priorbatch navigation info). * - * On failure exit (no more tuples), we return false with pin + * On failure exit (no more tuples), we return NULL with pin * held on bucket page but no pins or locks held on overflow * page. 
*/ -bool -_hash_next(IndexScanDesc scan, ScanDirection dir) +BatchIndexScan +_hash_next(IndexScanDesc scan, ScanDirection dir, BatchIndexScan priorbatch) { Relation rel = scan->indexRelation; HashScanOpaque so = (HashScanOpaque) scan->opaque; - HashScanPosItem *currItem; BlockNumber blkno; Buffer buf; - bool end_of_scan = false; + BatchIndexScan batch; /* - * Advance to the next tuple on the current page; or if done, try to read - * data from the next or previous page based on the scan direction. Before - * moving to the next or previous page make sure that we deal with all the - * killed items. + * Determine which page to read next based on scan direction and details + * taken from the prior batch */ if (ScanDirectionIsForward(dir)) { - if (++so->currPos.itemIndex > so->currPos.lastItem) + blkno = priorbatch->nextPage; + if (!BlockNumberIsValid(blkno) || !priorbatch->moreRight) { - if (so->numKilled > 0) - _hash_kill_items(scan); - - blkno = so->currPos.nextPage; - if (BlockNumberIsValid(blkno)) - { - buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); - if (!_hash_readpage(scan, &buf, dir)) - end_of_scan = true; - } - else - end_of_scan = true; + _hash_dropscanbuf(rel, so); + return NULL; } } else { - if (--so->currPos.itemIndex < so->currPos.firstItem) + blkno = priorbatch->prevPage; + if (!BlockNumberIsValid(blkno) || !priorbatch->moreLeft) { - if (so->numKilled > 0) - _hash_kill_items(scan); - - blkno = so->currPos.prevPage; - if (BlockNumberIsValid(blkno)) - { - buf = _hash_getbuf(rel, blkno, HASH_READ, - LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); - - /* - * We always maintain the pin on bucket page for whole scan - * operation, so releasing the additional pin we have acquired - * here. 
- */ - if (buf == so->hashso_bucket_buf || - buf == so->hashso_split_bucket_buf) - _hash_dropbuf(rel, buf); - - if (!_hash_readpage(scan, &buf, dir)) - end_of_scan = true; - } - else - end_of_scan = true; + _hash_dropscanbuf(rel, so); + return NULL; } } - if (end_of_scan) + /* Allocate space for next batch */ + batch = indexam_util_batch_alloc(scan, MaxIndexTuplesPerPage, false); + + /* Get the buffer for next batch */ + buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); + + /* Read the next page and load items into allocated batch */ + if (!_hash_readpage(scan, &buf, dir, batch)) { - _hash_dropscanbuf(rel, so); - HashScanPosInvalidate(so->currPos); - return false; + indexam_util_batch_release(scan, batch); + return NULL; } - /* OK, itemIndex says what to return */ - currItem = &so->currPos.items[so->currPos.itemIndex]; - scan->xs_heaptid = currItem->heapTid; - - return true; + /* Return the batch containing matched items from next page */ + return batch; } /* @@ -269,22 +240,21 @@ _hash_readprev(IndexScanDesc scan, } /* - * _hash_first() -- Find the first item in a scan. + * _hash_first() -- Find the first batch of items in a scan. * - * We find the first item (or, if backward scan, the last item) in the - * index that satisfies the qualification associated with the scan - * descriptor. + * We find the first batch of items (or, if backward scan, the last + * batch) in the index that satisfies the qualification associated with + * the scan descriptor. * - * On successful exit, if the page containing current index tuple is an - * overflow page, both pin and lock are released whereas if it is a bucket - * page then it is pinned but not locked and data about the matching - * tuple(s) on the page has been loaded into so->currPos, - * scan->xs_heaptid is set to the heap TID of the current tuple. + * On successful exit, returns a batch containing matching items. If the + * page is a bucket page, it remains pinned but not locked. 
If it is an + * overflow page, both pin and lock are released and batch->buf is + * InvalidBuffer. * - * On failure exit (no more tuples), we return false, with pin held on + * On failure exit (no matching items), we return NULL, with pin held on * bucket page but no pins or locks held on overflow page. */ -bool +BatchIndexScan _hash_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; @@ -295,7 +265,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) Buffer buf; Page page; HashPageOpaque opaque; - HashScanPosItem *currItem; + BatchIndexScan batch; pgstat_count_index_scan(rel); if (scan->instrument) @@ -325,7 +295,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) * items in the index. */ if (cur->sk_flags & SK_ISNULL) - return false; + return NULL; /* * Okay to compute the hash key. We want to do this before acquiring any @@ -418,34 +388,33 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) _hash_readnext(scan, &buf, &page, &opaque); } - /* remember which buffer we have pinned, if any */ - Assert(BufferIsInvalid(so->currPos.buf)); - so->currPos.buf = buf; + /* Allocate space for first batch */ + batch = indexam_util_batch_alloc(scan, MaxIndexTuplesPerPage, false); - /* Now find all the tuples satisfying the qualification from a page */ - if (!_hash_readpage(scan, &buf, dir)) - return false; + /* Read the first page and load items into allocated batch */ + if (!_hash_readpage(scan, &buf, dir, batch)) + { + indexam_util_batch_release(scan, batch); + return NULL; + } - /* OK, itemIndex says what to return */ - currItem = &so->currPos.items[so->currPos.itemIndex]; - scan->xs_heaptid = currItem->heapTid; - - /* if we're here, _hash_readpage found a valid tuples */ - return true; + /* Return the batch containing matched items */ + return batch; } /* - * _hash_readpage() -- Load data from current index page into so->currPos + * _hash_readpage() -- Load data from current index page into batch * * We scan all the items in the 
current index page and save them into - * so->currPos if it satisfies the qualification. If no matching items + * the batch if they satisfy the qualification. If no matching items * are found in the current page, we move to the next or previous page * in a bucket chain as indicated by the direction. * * Return true if any matching items are found else return false. */ static bool -_hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) +_hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, + BatchIndexScan batch) { Relation rel = scan->indexRelation; HashScanOpaque so = (HashScanOpaque) scan->opaque; @@ -461,8 +430,8 @@ _hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) page = BufferGetPage(buf); opaque = HashPageGetOpaque(page); - so->currPos.buf = buf; - so->currPos.currPage = BufferGetBlockNumber(buf); + batch->buf = buf; + batch->currPage = BufferGetBlockNumber(buf); if (ScanDirectionIsForward(dir)) { @@ -473,25 +442,21 @@ _hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* new page, locate starting position by binary search */ offnum = _hash_binsearch(page, so->hashso_sk_hash); - itemIndex = _hash_load_qualified_items(scan, page, offnum, dir); + itemIndex = _hash_load_qualified_items(scan, page, offnum, dir, + batch); if (itemIndex != 0) break; /* * Could not find any matching tuples in the current page, move to - * the next page. Before leaving the current page, deal with any - * killed items. - */ - if (so->numKilled > 0) - _hash_kill_items(scan); - - /* + * the next page. + * * If this is a primary bucket page, hasho_prevblkno is not a real * block number. 
*/ - if (so->currPos.buf == so->hashso_bucket_buf || - so->currPos.buf == so->hashso_split_bucket_buf) + if (batch->buf == so->hashso_bucket_buf || + batch->buf == so->hashso_split_bucket_buf) prev_blkno = InvalidBlockNumber; else prev_blkno = opaque->hasho_prevblkno; @@ -499,29 +464,25 @@ _hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) _hash_readnext(scan, &buf, &page, &opaque); if (BufferIsValid(buf)) { - so->currPos.buf = buf; - so->currPos.currPage = BufferGetBlockNumber(buf); + batch->buf = buf; + batch->currPage = BufferGetBlockNumber(buf); } else { /* * Remember next and previous block numbers for scrollable * cursors to know the start position and return false - * indicating that no more matching tuples were found. Also, - * don't reset currPage or lsn, because we expect - * _hash_kill_items to be called for the old page after this - * function returns. + * indicating that no more matching tuples were found */ - so->currPos.prevPage = prev_blkno; - so->currPos.nextPage = InvalidBlockNumber; - so->currPos.buf = buf; + batch->prevPage = prev_blkno; + batch->nextPage = InvalidBlockNumber; + batch->buf = buf; return false; } } - so->currPos.firstItem = 0; - so->currPos.lastItem = itemIndex - 1; - so->currPos.itemIndex = 0; + batch->firstItem = 0; + batch->lastItem = itemIndex - 1; } else { @@ -532,77 +493,91 @@ _hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* new page, locate starting position by binary search */ offnum = _hash_binsearch_last(page, so->hashso_sk_hash); - itemIndex = _hash_load_qualified_items(scan, page, offnum, dir); + itemIndex = _hash_load_qualified_items(scan, page, offnum, dir, + batch); if (itemIndex != MaxIndexTuplesPerPage) break; /* * Could not find any matching tuples in the current page, move to - * the previous page. Before leaving the current page, deal with - * any killed items. 
+ * the previous page */ - if (so->numKilled > 0) - _hash_kill_items(scan); - - if (so->currPos.buf == so->hashso_bucket_buf || - so->currPos.buf == so->hashso_split_bucket_buf) + if (batch->buf == so->hashso_bucket_buf || + batch->buf == so->hashso_split_bucket_buf) next_blkno = opaque->hasho_nextblkno; _hash_readprev(scan, &buf, &page, &opaque); if (BufferIsValid(buf)) { - so->currPos.buf = buf; - so->currPos.currPage = BufferGetBlockNumber(buf); + batch->buf = buf; + batch->currPage = BufferGetBlockNumber(buf); } else { /* * Remember next and previous block numbers for scrollable * cursors to know the start position and return false - * indicating that no more matching tuples were found. Also, - * don't reset currPage or lsn, because we expect - * _hash_kill_items to be called for the old page after this - * function returns. + * indicating that no more matching tuples were found */ - so->currPos.prevPage = InvalidBlockNumber; - so->currPos.nextPage = next_blkno; - so->currPos.buf = buf; + batch->prevPage = InvalidBlockNumber; + batch->nextPage = next_blkno; + batch->buf = buf; return false; } } - so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxIndexTuplesPerPage - 1; - so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; + batch->firstItem = itemIndex; + batch->lastItem = MaxIndexTuplesPerPage - 1; } - if (so->currPos.buf == so->hashso_bucket_buf || - so->currPos.buf == so->hashso_split_bucket_buf) + /* Saved at least one match in batch.items[], so prepare to return it */ + if (batch->buf == so->hashso_bucket_buf || + batch->buf == so->hashso_split_bucket_buf) { - so->currPos.prevPage = InvalidBlockNumber; - so->currPos.nextPage = opaque->hasho_nextblkno; - LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + /* + * Batch's buffer is either the primary bucket, or a bucket being + * populated due to a split + */ + batch->prevPage = InvalidBlockNumber; + batch->nextPage = opaque->hasho_nextblkno; + batch->moreLeft = (ScanDirectionIsBackward(dir) && + 
BlockNumberIsValid(batch->prevPage)); + batch->moreRight = (ScanDirectionIsForward(dir) && + BlockNumberIsValid(batch->nextPage)); + + /* + * Increment local reference count so that batch gets an independent + * buffer reference that can be released (by hashfreebatch) before the + * hashso_bucket_buf/hashso_split_bucket_buf references are released + */ + IncrBufferRefCount(batch->buf); } else { - so->currPos.prevPage = opaque->hasho_prevblkno; - so->currPos.nextPage = opaque->hasho_nextblkno; - _hash_relbuf(rel, so->currPos.buf); - so->currPos.buf = InvalidBuffer; + batch->prevPage = opaque->hasho_prevblkno; + batch->nextPage = opaque->hasho_nextblkno; + batch->moreLeft = (ScanDirectionIsBackward(dir) && + BlockNumberIsValid(batch->prevPage)); + batch->moreRight = (ScanDirectionIsForward(dir) && + BlockNumberIsValid(batch->nextPage)); } - Assert(so->currPos.firstItem <= so->currPos.lastItem); + /* Unlock (and likely unpin) buffer, per amgetbatch contract */ + indexam_util_batch_unlock(scan, batch); + + Assert(batch->firstItem <= batch->lastItem); return true; } /* * Load all the qualified items from a current index page - * into so->currPos. Helper function for _hash_readpage. + * into batch. Helper function for _hash_readpage. 
*/ static int _hash_load_qualified_items(IndexScanDesc scan, Page page, - OffsetNumber offnum, ScanDirection dir) + OffsetNumber offnum, ScanDirection dir, + BatchIndexScan batch) { HashScanOpaque so = (HashScanOpaque) scan->opaque; IndexTuple itup; @@ -639,7 +614,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page, _hash_checkqual(scan, itup)) { /* tuple is qualified, so remember it */ - _hash_saveitem(so, itemIndex, offnum, itup); + _hash_saveitem(batch, itemIndex, offnum, itup); itemIndex++; } else @@ -686,7 +661,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page, { itemIndex--; /* tuple is qualified, so remember it */ - _hash_saveitem(so, itemIndex, offnum, itup); + _hash_saveitem(batch, itemIndex, offnum, itup); } else { @@ -705,13 +680,14 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page, } } -/* Save an index item into so->currPos.items[itemIndex] */ +/* Save an index item into batch->items[itemIndex] */ static inline void -_hash_saveitem(HashScanOpaque so, int itemIndex, +_hash_saveitem(BatchIndexScan batch, int itemIndex, OffsetNumber offnum, IndexTuple itup) { - HashScanPosItem *currItem = &so->currPos.items[itemIndex]; + BatchMatchingItem *currItem = &batch->items[itemIndex]; currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; + currItem->tupleOffset = 0; /* Hash doesn't support index-only scans */ } diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index f41233fcd..d01d3b7d2 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -510,67 +510,74 @@ _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, * _hash_kill_items - set LP_DEAD state for items an indexscan caller has * told us were killed. * - * scan->opaque, referenced locally through so, contains information about the - * current page and killed tuples thereon (generally, this should only be - * called if so->numKilled > 0). 
+ * The batch parameter contains information about the current page and killed + * tuples thereon (this should only be called if batch->numKilled > 0). * - * The caller does not have a lock on the page and may or may not have the - * page pinned in a buffer. Note that read-lock is sufficient for setting - * LP_DEAD status (which is only a hint). + * Caller should not have a lock on the batch position's page, but must hold a + * buffer pin when !dropPin. When we return, it still won't be locked. It'll + * continue to hold whatever pins were held before calling here. * - * The caller must have pin on bucket buffer, but may or may not have pin - * on overflow buffer, as indicated by HashScanPosIsPinned(so->currPos). - * - * We match items by heap TID before assuming they are the right ones to - * delete. - * - * There are never any scans active in a bucket at the time VACUUM begins, - * because VACUUM takes a cleanup lock on the primary bucket page and scans - * hold a pin. A scan can begin after VACUUM leaves the primary bucket page - * but before it finishes the entire bucket, but it can never pass VACUUM, - * because VACUUM always locks the next page before releasing the lock on - * the previous one. Therefore, we don't have to worry about accidentally - * killing a TID that has been reused for an unrelated tuple. + * We match items by heap TID before assuming they are the right ones to set + * LP_DEAD. If the scan is one that holds a buffer pin on the target page + * continuously from initially reading the items until applying this function + * (if it is a !dropPin scan), VACUUM cannot have deleted any items on the + * page, so the page's TIDs can't have been recycled by now. There's no risk + * that we'll confuse a new index tuple that happens to use a recycled TID + * with a now-removed tuple with the same TID (that used to be on this same + * page). We can't rely on that during scans that drop buffer pins eagerly + * (i.e. 
dropPin scans), though, so we must condition setting LP_DEAD bits on + * the page LSN having not changed since back when _hash_readpage saw the page. + * We totally give up on setting LP_DEAD bits when the page LSN changed. */ void -_hash_kill_items(IndexScanDesc scan) +_hash_kill_items(IndexScanDesc scan, BatchIndexScan batch) { - HashScanOpaque so = (HashScanOpaque) scan->opaque; Relation rel = scan->indexRelation; - BlockNumber blkno; Buffer buf; Page page; HashPageOpaque opaque; OffsetNumber offnum, maxoff; - int numKilled = so->numKilled; + int numKilled = batch->numKilled; int i; bool killedsomething = false; - bool havePin = false; - Assert(so->numKilled > 0); - Assert(so->killedItems != NULL); - Assert(HashScanPosIsValid(so->currPos)); + Assert(numKilled > 0); + Assert(batch->killedItems != NULL); + Assert(BlockNumberIsValid(batch->currPage)); - /* - * Always reset the scan state, so we don't look for same items on other - * pages. - */ - so->numKilled = 0; + /* Always invalidate batch->killedItems[] before freeing batch */ + batch->numKilled = 0; - blkno = so->currPos.currPage; - if (HashScanPosIsPinned(so->currPos)) + if (!scan->batchqueue->dropPin) { /* - * We already have pin on this buffer, so, all we need to do is - * acquire lock on it. + * We have held the pin on this page since we read the index tuples, + * so all we need to do is lock it. The pin will have prevented + * concurrent VACUUMs from recycling any of the TIDs on the page. 
*/ - havePin = true; - buf = so->currPos.buf; + buf = batch->buf; LockBuffer(buf, BUFFER_LOCK_SHARE); } else - buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); + { + XLogRecPtr latestlsn; + + Assert(RelationNeedsWAL(rel)); + buf = _hash_getbuf(rel, batch->currPage, HASH_READ, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + + latestlsn = BufferGetLSNAtomic(buf); + Assert(batch->lsn <= latestlsn); + if (batch->lsn != latestlsn) + { + /* Modified, give up on hinting */ + _hash_relbuf(rel, buf); + return; + } + + /* Unmodified, hinting is safe */ + } page = BufferGetPage(buf); opaque = HashPageGetOpaque(page); @@ -578,13 +585,13 @@ _hash_kill_items(IndexScanDesc scan) for (i = 0; i < numKilled; i++) { - int itemIndex = so->killedItems[i]; - HashScanPosItem *currItem = &so->currPos.items[itemIndex]; + int itemIndex = batch->killedItems[i]; + BatchMatchingItem *currItem = &batch->items[itemIndex]; offnum = currItem->indexOffset; - Assert(itemIndex >= so->currPos.firstItem && - itemIndex <= so->currPos.lastItem); + Assert(itemIndex >= batch->firstItem && + itemIndex <= batch->lastItem); while (offnum <= maxoff) { @@ -613,9 +620,8 @@ _hash_kill_items(IndexScanDesc scan) MarkBufferDirtyHint(buf, true); } - if (so->hashso_bucket_buf == so->currPos.buf || - havePin) - LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + if (!scan->batchqueue->dropPin) + LockBuffer(buf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, buf); } diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 6f1ebb955..253f08b40 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1183,8 +1183,6 @@ HashPageStat HashPath HashScanOpaque HashScanOpaqueData -HashScanPosData -HashScanPosItem HashSkewBucket HashState HashValueFunc -- 2.51.0