From 49ab6c51288983e2e125292f006a76d247ccc400 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Tue, 25 Nov 2025 18:03:15 -0500
Subject: [PATCH v6 4/4] Make hash index AM use amgetbatch interface.

Replace hashgettuple with hashgetbatch, a function that implements the
new amgetbatch interface.  Plain index scans of hash indexes now return
matching items in batches consisting of all of the matches from a given
bucket or overflow page.  This gives the core executor the ability to
perform optimizations like index prefetching during hash index scans.

Note that this makes hash index scans use the dropPin optimization,
since that is required to use the amgetbatch interface.  This won't
avoid making hash index vacuuming wait for a cleanup lock when an index
scan holds onto a conflicting pin, since such index scans still need to
hold onto a conflicting bucket page pin for the duration of the scan.
In other words, use of the dropPin optimization won't benefit hash
index scans in the same way that it benefits nbtree index scans
following commit 2ed5b87f.  However, there is still some value in
keeping the number of buffer pins held at any one time to a minimum.
Index prefetching tends to hold open as many as several dozen batches
with certain workloads (workloads where the stream position has to get
quite far ahead of the read position in order to maintain the
appropriate prefetch distance on the heapam side).  Guaranteeing that
open batches won't hold buffer pins on index pages (at least in the
common case where the dropPin optimization is safe to use) thereby
simplifies resource management during index prefetching.

Also add Valgrind buffer lock instrumentation to hash, bringing it in
line with nbtree following commit 4a70f829.  This is another
requirement when using the amgetbatch interface.

Author: Peter Geoghegan
Discussion: https://postgr.es/m/CAH2-WzmYqhacBH161peAWb5eF=Ja7CFAQ+0jSEMq=qnfLVTOOg@mail.gmail.com
---
 src/include/access/hash.h            |  73 +-----
 src/backend/access/hash/hash.c       | 123 ++++------
 src/backend/access/hash/hashpage.c   |  26 +--
 src/backend/access/hash/hashsearch.c | 328 ++++++++++++---------------
 src/backend/access/hash/hashutil.c   | 100 ++++----
 src/tools/pgindent/typedefs.list     |   2 -
 6 files changed, 255 insertions(+), 397 deletions(-)

diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index a8702f0e5..1b80fd7ed 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -100,58 +100,6 @@ typedef HashPageOpaqueData *HashPageOpaque;
  */
 #define HASHO_PAGE_ID		0xFF80
 
-typedef struct HashScanPosItem	/* what we remember about each match */
-{
-	ItemPointerData heapTid;	/* TID of referenced heap item */
-	OffsetNumber indexOffset;	/* index item's location within page */
-} HashScanPosItem;
-
-typedef struct HashScanPosData
-{
-	Buffer		buf;			/* if valid, the buffer is pinned */
-	BlockNumber currPage;		/* current hash index page */
-	BlockNumber nextPage;		/* next overflow page */
-	BlockNumber prevPage;		/* prev overflow or bucket page */
-
-	/*
-	 * The items array is always ordered in index order (ie, increasing
-	 * indexoffset).  When scanning backwards it is convenient to fill the
-	 * array back-to-front, so we start at the last slot and fill downwards.
-	 * Hence we need both a first-valid-entry and a last-valid-entry counter.
-	 * itemIndex is a cursor showing which entry was last returned to caller.
-	 */
-	int			firstItem;		/* first valid index in items[] */
-	int			lastItem;		/* last valid index in items[] */
-	int			itemIndex;		/* current index in items[] */
-
-	HashScanPosItem items[MaxIndexTuplesPerPage];	/* MUST BE LAST */
-} HashScanPosData;
-
-#define HashScanPosIsPinned(scanpos) \
-( \
-	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
-				!BufferIsValid((scanpos).buf)), \
-	BufferIsValid((scanpos).buf) \
-)
-
-#define HashScanPosIsValid(scanpos) \
-( \
-	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
-				!BufferIsValid((scanpos).buf)), \
-	BlockNumberIsValid((scanpos).currPage) \
-)
-
-#define HashScanPosInvalidate(scanpos) \
-	do { \
-		(scanpos).buf = InvalidBuffer; \
-		(scanpos).currPage = InvalidBlockNumber; \
-		(scanpos).nextPage = InvalidBlockNumber; \
-		(scanpos).prevPage = InvalidBlockNumber; \
-		(scanpos).firstItem = 0; \
-		(scanpos).lastItem = 0; \
-		(scanpos).itemIndex = 0; \
-	} while (0)
-
 /*
  *	HashScanOpaqueData is private state for a hash index scan.
  */
@@ -178,15 +126,6 @@ typedef struct HashScanOpaqueData
 	 * referred only when hashso_buc_populated is true.
 	 */
 	bool		hashso_buc_split;
-
-	/* info about killed items if any (killedItems is NULL if never used) */
-	int		   *killedItems;	/* currPos.items indexes of killed items */
-	int			numKilled;		/* number of currently stored items */
-
-	/*
-	 * Identify all the matching items on a page and save them in
-	 * HashScanPosData
-	 */
-	HashScanPosData currPos;	/* current position data */
 } HashScanOpaqueData;
 
 typedef HashScanOpaqueData *HashScanOpaque;
@@ -368,11 +307,14 @@ extern bool hashinsert(Relation rel, Datum *values, bool *isnull,
 					   IndexUniqueCheck checkUnique,
 					   bool indexUnchanged,
 					   struct IndexInfo *indexInfo);
-extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir);
+extern BatchIndexScan hashgetbatch(IndexScanDesc scan,
+								   BatchIndexScan priorbatch,
+								   ScanDirection dir);
 extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
 extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys);
 extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 					   ScanKey orderbys, int norderbys);
+extern void hashfreebatch(IndexScanDesc scan, BatchIndexScan batch);
 extern void hashendscan(IndexScanDesc scan);
 extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info,
 											 IndexBulkDeleteResult *stats,
@@ -445,8 +387,9 @@ extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf,
 							   uint32 lowmask);
 
 /* hashsearch.c */
-extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
-extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);
+extern BatchIndexScan _hash_next(IndexScanDesc scan, ScanDirection dir,
+								 BatchIndexScan priorbatch);
+extern BatchIndexScan _hash_first(IndexScanDesc scan, ScanDirection dir);
 
 /* hashsort.c */
 typedef struct HSpool HSpool;	/* opaque struct in hashsort.c */
@@ -476,7 +419,7 @@ extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bu
 extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
 extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
 												 uint32 lowmask, uint32 maxbucket);
-extern void _hash_kill_items(IndexScanDesc scan);
+extern void _hash_kill_items(IndexScanDesc scan, BatchIndexScan batch);
 
 /* hash.c */
 extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 6a20b67f6..d5f1c4f8c 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -101,9 +101,9 @@ hashhandler(PG_FUNCTION_ARGS)
 	.amadjustmembers = hashadjustmembers,
 	.ambeginscan = hashbeginscan,
 	.amrescan = hashrescan,
-	.amgettuple = hashgettuple,
-	.amgetbatch = NULL,
-	.amfreebatch = NULL,
+	.amgettuple = NULL,
+	.amgetbatch = hashgetbatch,
+	.amfreebatch = hashfreebatch,
 	.amgetbitmap = hashgetbitmap,
 	.amendscan = hashendscan,
 	.amposreset = NULL,
@@ -286,53 +286,22 @@ hashinsert(Relation rel, Datum *values, bool *isnull,
 
 /*
- *	hashgettuple() -- Get the next tuple in the scan.
+ *	hashgetbatch() -- Get the first or next batch of tuples in the scan
  */
-bool
-hashgettuple(IndexScanDesc scan, ScanDirection dir)
+BatchIndexScan
+hashgetbatch(IndexScanDesc scan, BatchIndexScan priorbatch, ScanDirection dir)
 {
-	HashScanOpaque so = (HashScanOpaque) scan->opaque;
-	bool		res;
-
 	/* Hash indexes are always lossy since we store only the hash code */
 	scan->xs_recheck = true;
 
-	/*
-	 * If we've already initialized this scan, we can just advance it in the
-	 * appropriate direction.  If we haven't done so yet, we call a routine to
-	 * get the first item in the scan.
-	 */
-	if (!HashScanPosIsValid(so->currPos))
-		res = _hash_first(scan, dir);
-	else
+	if (priorbatch == NULL)
 	{
-		/*
-		 * Check to see if we should kill the previously-fetched tuple.
-		 */
-		if (scan->kill_prior_tuple)
-		{
-			/*
-			 * Yes, so remember it for later. (We'll deal with all such tuples
-			 * at once right after leaving the index page or at end of scan.)
-			 * In case if caller reverses the indexscan direction it is quite
-			 * possible that the same item might get entered multiple times.
-			 * But, we don't detect that; instead, we just forget any excess
-			 * entries.
-			 */
-			if (so->killedItems == NULL)
-				so->killedItems = palloc_array(int, MaxIndexTuplesPerPage);
-
-			if (so->numKilled < MaxIndexTuplesPerPage)
-				so->killedItems[so->numKilled++] = so->currPos.itemIndex;
-		}
-
-		/*
-		 * Now continue the scan.
-		 */
-		res = _hash_next(scan, dir);
+		/* Initialize the scan, and return first batch of matching items */
+		return _hash_first(scan, dir);
 	}
 
-	return res;
+	/* Return batch positioned after caller's batch (in direction 'dir') */
+	return _hash_next(scan, dir, priorbatch);
 }
 
 
@@ -342,26 +311,23 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
 int64
 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 {
-	HashScanOpaque so = (HashScanOpaque) scan->opaque;
-	bool		res;
+	BatchIndexScan batch;
 	int64		ntids = 0;
-	HashScanPosItem *currItem;
+	int			itemIndex;
 
-	res = _hash_first(scan, ForwardScanDirection);
+	batch = _hash_first(scan, ForwardScanDirection);
 
-	while (res)
+	while (batch != NULL)
 	{
-		currItem = &so->currPos.items[so->currPos.itemIndex];
+		for (itemIndex = batch->firstItem;
+			 itemIndex <= batch->lastItem;
+			 itemIndex++)
+		{
+			tbm_add_tuples(tbm, &batch->items[itemIndex].heapTid, 1, true);
+			ntids++;
+		}
 
-		/*
-		 * _hash_first and _hash_next handle eliminate dead index entries
-		 * whenever scan->ignore_killed_tuples is true.  Therefore, there's
-		 * nothing to do here except add the results to the TIDBitmap.
-		 */
-		tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
-		ntids++;
-
-		res = _hash_next(scan, ForwardScanDirection);
+		batch = _hash_next(scan, ForwardScanDirection, batch);
 	}
 
 	return ntids;
@@ -383,17 +349,14 @@ hashbeginscan(Relation rel, int nkeys, int norderbys)
 	scan = RelationGetIndexScan(rel, nkeys, norderbys);
 
 	so = (HashScanOpaque) palloc_object(HashScanOpaqueData);
-	HashScanPosInvalidate(so->currPos);
 	so->hashso_bucket_buf = InvalidBuffer;
 	so->hashso_split_bucket_buf = InvalidBuffer;
 
 	so->hashso_buc_populated = false;
 	so->hashso_buc_split = false;
 
-	so->killedItems = NULL;
-	so->numKilled = 0;
-
 	scan->opaque = so;
+	scan->maxitemsbatch = MaxIndexTuplesPerPage;
 
 	return scan;
 }
@@ -408,18 +371,8 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	HashScanOpaque so = (HashScanOpaque) scan->opaque;
 	Relation	rel = scan->indexRelation;
 
-	if (HashScanPosIsValid(so->currPos))
-	{
-		/* Before leaving current page, deal with any killed items */
-		if (so->numKilled > 0)
-			_hash_kill_items(scan);
-	}
-
 	_hash_dropscanbuf(rel, so);
 
-	/* set position invalid (this will cause _hash_first call) */
-	HashScanPosInvalidate(so->currPos);
-
 	/* Update scan key, if a new one is given */
 	if (scankey && scan->numberOfKeys > 0)
 		memcpy(scan->keyData, scankey,
			   scan->numberOfKeys * sizeof(ScanKeyData));
@@ -428,6 +381,25 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	so->hashso_buc_populated = false;
 	so->hashso_buc_split = false;
 }
 
+/*
+ *	hashfreebatch() -- Free batch resources, including its buffer pin
+ */
+void
+hashfreebatch(IndexScanDesc scan, BatchIndexScan batch)
+{
+	if (batch->numKilled > 0)
+		_hash_kill_items(scan, batch);
+
+	if (!scan->dropPin)
+	{
+		/* indexam_util_batch_unlock didn't unpin page earlier, do it now */
+		ReleaseBuffer(batch->buf);
+		batch->buf = InvalidBuffer;
+	}
+
+	indexam_util_batch_release(scan, batch);
+}
+
 /*
  *	hashendscan() -- close down a scan
  */
@@ -437,17 +409,8 @@ hashendscan(IndexScanDesc scan)
 	HashScanOpaque so = (HashScanOpaque) scan->opaque;
 	Relation	rel = scan->indexRelation;
 
-	if (HashScanPosIsValid(so->currPos))
-	{
-		/* Before leaving current page, deal with any killed items */
-		if (so->numKilled > 0)
-			_hash_kill_items(scan);
-	}
-
 	_hash_dropscanbuf(rel, so);
 
-	if (so->killedItems != NULL)
-		pfree(so->killedItems);
 	pfree(so);
 	scan->opaque = NULL;
 }
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 8e220a3ae..9b6911905 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -35,6 +35,7 @@
 #include "port/pg_bitutils.h"
 #include "storage/predicate.h"
 #include "storage/smgr.h"
+#include "utils/memdebug.h"
 #include "utils/rel.h"
 
 static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
@@ -79,6 +80,9 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
 	if (access != HASH_NOLOCK)
 		LockBuffer(buf, access);
 
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+
 	/* ref count and lock type are correct */
 
 	_hash_checkpage(rel, buf, flags);
@@ -108,6 +112,9 @@ _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)
 		return InvalidBuffer;
 	}
 
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+
 	/* ref count and lock type are correct */
 
 	_hash_checkpage(rel, buf, flags);
@@ -280,31 +287,24 @@ _hash_dropbuf(Relation rel, Buffer buf)
 }
 
 /*
- *	_hash_dropscanbuf() -- release buffers used in scan.
+ *	_hash_dropscanbuf() -- release buffers owned by scan.
  *
- *		This routine unpins the buffers used during scan on which we
- *		hold no lock.
+ *		This routine unpins the buffer for the primary bucket page and, as
+ *		needed, the buffer for the bucket page of a bucket being split.
  */
 void
 _hash_dropscanbuf(Relation rel, HashScanOpaque so)
 {
 	/* release pin we hold on primary bucket page */
-	if (BufferIsValid(so->hashso_bucket_buf) &&
-		so->hashso_bucket_buf != so->currPos.buf)
+	if (BufferIsValid(so->hashso_bucket_buf))
 		_hash_dropbuf(rel, so->hashso_bucket_buf);
 	so->hashso_bucket_buf = InvalidBuffer;
 
-	/* release pin we hold on primary bucket page of bucket being split */
-	if (BufferIsValid(so->hashso_split_bucket_buf) &&
-		so->hashso_split_bucket_buf != so->currPos.buf)
+	/* release pin held on primary bucket page of bucket being split */
+	if (BufferIsValid(so->hashso_split_bucket_buf))
 		_hash_dropbuf(rel, so->hashso_split_bucket_buf);
 	so->hashso_split_bucket_buf = InvalidBuffer;
 
-	/* release any pin we still hold */
-	if (BufferIsValid(so->currPos.buf))
-		_hash_dropbuf(rel, so->currPos.buf);
-	so->currPos.buf = InvalidBuffer;
-
 	/* reset split scan */
 	so->hashso_buc_populated = false;
 	so->hashso_buc_split = false;
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index dfb0517b3..2602f3e64 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -21,105 +21,89 @@
 #include "storage/predicate.h"
 #include "utils/rel.h"
 
-static bool _hash_readpage(IndexScanDesc scan, Buffer *bufP,
-						   ScanDirection dir);
+static bool _hash_readpage(IndexScanDesc scan, Buffer buf, ScanDirection dir,
+						   BatchIndexScan batch);
 static int	_hash_load_qualified_items(IndexScanDesc scan, Page page,
-									   OffsetNumber offnum, ScanDirection dir);
-static inline void _hash_saveitem(HashScanOpaque so, int itemIndex,
+									   OffsetNumber offnum, ScanDirection dir,
+									   BatchIndexScan batch);
+static inline void _hash_saveitem(BatchIndexScan batch, int itemIndex,
								  OffsetNumber offnum, IndexTuple itup);
 static void _hash_readnext(IndexScanDesc scan, Buffer *bufp,
						   Page *pagep, HashPageOpaque *opaquep);
 
 /*
- *	_hash_next() -- Get the next item in a scan.
+ *	_hash_next() -- Get the next batch of items in a scan.
  *
- *		On entry, so->currPos describes the current page, which may
- *		be pinned but not locked, and so->currPos.itemIndex identifies
- *		which item was previously returned.
+ *		On entry, priorbatch describes the current page: a batch whose
+ *		items have already been returned.
  *
- *		On successful exit, scan->xs_heaptid is set to the TID of the next
- *		heap tuple.  so->currPos is updated as needed.
+ *		On successful exit, returns a batch containing matching items from
+ *		the next page.  Otherwise returns NULL, indicating that there are
+ *		no further matches.  No locks are ever held when we return.
  *
- *		On failure exit (no more tuples), we return false with pin
- *		held on bucket page but no pins or locks held on overflow
- *		page.
+ *		Retains pins according to the same rules as _hash_first.
  */
-bool
-_hash_next(IndexScanDesc scan, ScanDirection dir)
+BatchIndexScan
+_hash_next(IndexScanDesc scan, ScanDirection dir, BatchIndexScan priorbatch)
 {
 	Relation	rel = scan->indexRelation;
 	HashScanOpaque so = (HashScanOpaque) scan->opaque;
-	HashScanPosItem *currItem;
 	BlockNumber blkno;
 	Buffer		buf;
-	bool		end_of_scan = false;
+	BatchIndexScan batch;
 
 	/*
-	 * Advance to the next tuple on the current page; or if done, try to read
-	 * data from the next or previous page based on the scan direction.  Before
-	 * moving to the next or previous page make sure that we deal with all the
-	 * killed items.
+	 * Determine which page to read next based on scan direction and details
+	 * taken from the prior batch.
 	 */
 	if (ScanDirectionIsForward(dir))
 	{
-		if (++so->currPos.itemIndex > so->currPos.lastItem)
+		blkno = priorbatch->nextPage;
+		if (!BlockNumberIsValid(blkno) || !priorbatch->moreRight)
 		{
-			if (so->numKilled > 0)
-				_hash_kill_items(scan);
-
-			blkno = so->currPos.nextPage;
-			if (BlockNumberIsValid(blkno))
-			{
-				buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
-				if (!_hash_readpage(scan, &buf, dir))
-					end_of_scan = true;
-			}
-			else
-				end_of_scan = true;
+			_hash_dropscanbuf(rel, so);
+			return NULL;
 		}
 	}
 	else
 	{
-		if (--so->currPos.itemIndex < so->currPos.firstItem)
+		blkno = priorbatch->prevPage;
+		if (!BlockNumberIsValid(blkno) || !priorbatch->moreLeft)
 		{
-			if (so->numKilled > 0)
-				_hash_kill_items(scan);
-
-			blkno = so->currPos.prevPage;
-			if (BlockNumberIsValid(blkno))
-			{
-				buf = _hash_getbuf(rel, blkno, HASH_READ,
-								   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
-
-				/*
-				 * We always maintain the pin on bucket page for whole scan
-				 * operation, so releasing the additional pin we have acquired
-				 * here.
-				 */
-				if (buf == so->hashso_bucket_buf ||
-					buf == so->hashso_split_bucket_buf)
-					_hash_dropbuf(rel, buf);
-
-				if (!_hash_readpage(scan, &buf, dir))
-					end_of_scan = true;
-			}
-			else
-				end_of_scan = true;
+			_hash_dropscanbuf(rel, so);
+			return NULL;
 		}
 	}
 
-	if (end_of_scan)
+	/* Allocate space for next batch */
+	batch = indexam_util_batch_alloc(scan);
+
+	/* Get the buffer for next batch */
+	if (ScanDirectionIsForward(dir))
+		buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
+	else
 	{
-		_hash_dropscanbuf(rel, so);
-		HashScanPosInvalidate(so->currPos);
-		return false;
+		buf = _hash_getbuf(rel, blkno, HASH_READ,
+						   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+
+		/*
+		 * We always maintain the pin on the bucket page for the whole scan
+		 * operation, so release the additional pin acquired here.
+		 */
+		if (buf == so->hashso_bucket_buf ||
+			buf == so->hashso_split_bucket_buf)
+			_hash_dropbuf(rel, buf);
 	}
 
-	/* OK, itemIndex says what to return */
-	currItem = &so->currPos.items[so->currPos.itemIndex];
-	scan->xs_heaptid = currItem->heapTid;
+	/* Read the next page and load items into allocated batch */
+	if (!_hash_readpage(scan, buf, dir, batch))
+	{
+		indexam_util_batch_release(scan, batch);
+		return NULL;
+	}
 
-	return true;
+	/* Return the batch containing matched items from the next page */
+	return batch;
 }
 
 /*
@@ -269,22 +253,21 @@ _hash_readprev(IndexScanDesc scan,
 }
 
 /*
- *	_hash_first() -- Find the first item in a scan.
+ *	_hash_first() -- Find the first batch of items in a scan.
  *
- *		We find the first item (or, if backward scan, the last item) in the
- *		index that satisfies the qualification associated with the scan
- *		descriptor.
+ *		We find the first batch of items (or, if backward scan, the last
+ *		batch) in the index that satisfies the qualification associated with
+ *		the scan descriptor.
  *
- *		On successful exit, if the page containing current index tuple is an
- *		overflow page, both pin and lock are released whereas if it is a bucket
- *		page then it is pinned but not locked and data about the matching
- *		tuple(s) on the page has been loaded into so->currPos,
- *		scan->xs_heaptid is set to the heap TID of the current tuple.
+ *		On successful exit, returns a batch containing matching items.
+ *		Otherwise returns NULL, indicating that there are no further matches.
+ *		No locks are ever held when we return.
  *
- *		On failure exit (no more tuples), we return false, with pin held on
- *		bucket page but no pins or locks held on overflow page.
+ *		We always retain our own pin on the bucket page.  When we return a
+ *		batch with a bucket page, the batch retains its own reference pin
+ *		iff the table AM requires one (i.e. iff this is a !dropPin scan).
  */
-bool
+BatchIndexScan
 _hash_first(IndexScanDesc scan, ScanDirection dir)
 {
 	Relation	rel = scan->indexRelation;
@@ -295,7 +278,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 	Buffer		buf;
 	Page		page;
 	HashPageOpaque opaque;
-	HashScanPosItem *currItem;
+	BatchIndexScan batch;
 
 	pgstat_count_index_scan(rel);
 	if (scan->instrument)
@@ -325,7 +308,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 	 * items in the index.
 	 */
 	if (cur->sk_flags & SK_ISNULL)
-		return false;
+		return NULL;
 
 	/*
 	 * Okay to compute the hash key.  We want to do this before acquiring any
@@ -418,191 +401,159 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 		_hash_readnext(scan, &buf, &page, &opaque);
 	}
 
-	/* remember which buffer we have pinned, if any */
-	Assert(BufferIsInvalid(so->currPos.buf));
-	so->currPos.buf = buf;
+	/* Allocate space for first batch */
+	batch = indexam_util_batch_alloc(scan);
 
-	/* Now find all the tuples satisfying the qualification from a page */
-	if (!_hash_readpage(scan, &buf, dir))
-		return false;
+	/* Read the first page and load items into allocated batch */
+	if (!_hash_readpage(scan, buf, dir, batch))
+	{
+		indexam_util_batch_release(scan, batch);
+		return NULL;
+	}
 
-	/* OK, itemIndex says what to return */
-	currItem = &so->currPos.items[so->currPos.itemIndex];
-	scan->xs_heaptid = currItem->heapTid;
-
-	/* if we're here, _hash_readpage found a valid tuples */
-	return true;
+	/* Return the batch containing matched items */
+	return batch;
 }
 
 /*
- *	_hash_readpage() -- Load data from current index page into so->currPos
+ *	_hash_readpage() -- Load data from current index page into batch
  *
  *	We scan all the items in the current index page and save them into
- *	so->currPos if it satisfies the qualification.  If no matching items
+ *	the batch if they satisfy the qualification.  If no matching items
  *	are found in the current page, we move to the next or previous page
  *	in a bucket chain as indicated by the direction.
  *
  *	Return true if any matching items are found else return false.
  */
 static bool
-_hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+_hash_readpage(IndexScanDesc scan, Buffer buf, ScanDirection dir,
+			   BatchIndexScan batch)
 {
 	Relation	rel = scan->indexRelation;
 	HashScanOpaque so = (HashScanOpaque) scan->opaque;
-	Buffer		buf;
 	Page		page;
 	HashPageOpaque opaque;
 	OffsetNumber offnum;
 	uint16		itemIndex;
 
-	buf = *bufP;
 	Assert(BufferIsValid(buf));
 	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
 	page = BufferGetPage(buf);
 	opaque = HashPageGetOpaque(page);
 
-	so->currPos.buf = buf;
-	so->currPos.currPage = BufferGetBlockNumber(buf);
+	batch->buf = buf;
+	batch->currPage = BufferGetBlockNumber(buf);
+	batch->dir = dir;
 
 	if (ScanDirectionIsForward(dir))
 	{
-		BlockNumber prev_blkno = InvalidBlockNumber;
-
 		for (;;)
 		{
 			/* new page, locate starting position by binary search */
 			offnum = _hash_binsearch(page, so->hashso_sk_hash);
 
-			itemIndex = _hash_load_qualified_items(scan, page, offnum, dir);
+			itemIndex = _hash_load_qualified_items(scan, page, offnum, dir,
												   batch);
 
 			if (itemIndex != 0)
 				break;
 
 			/*
-			 * Could not find any matching tuples in the current page, move to
-			 * the next page.  Before leaving the current page, deal with any
-			 * killed items.
+			 * Could not find any matching tuples in the current page, try to
+			 * move to the next page
 			 */
-			if (so->numKilled > 0)
-				_hash_kill_items(scan);
-
-			/*
-			 * If this is a primary bucket page, hasho_prevblkno is not a real
-			 * block number.
-			 */
-			if (so->currPos.buf == so->hashso_bucket_buf ||
-				so->currPos.buf == so->hashso_split_bucket_buf)
-				prev_blkno = InvalidBlockNumber;
-			else
-				prev_blkno = opaque->hasho_prevblkno;
-
 			_hash_readnext(scan, &buf, &page, &opaque);
-			if (BufferIsValid(buf))
+			if (!BufferIsValid(buf))
 			{
-				so->currPos.buf = buf;
-				so->currPos.currPage = BufferGetBlockNumber(buf);
-			}
-			else
-			{
-				/*
-				 * Remember next and previous block numbers for scrollable
-				 * cursors to know the start position and return false
-				 * indicating that no more matching tuples were found.  Also,
-				 * don't reset currPage or lsn, because we expect
-				 * _hash_kill_items to be called for the old page after this
-				 * function returns.
-				 */
-				so->currPos.prevPage = prev_blkno;
-				so->currPos.nextPage = InvalidBlockNumber;
-				so->currPos.buf = buf;
+				batch->buf = InvalidBuffer;
 				return false;
 			}
+
+			batch->buf = buf;
+			batch->currPage = BufferGetBlockNumber(buf);
 		}
 
-		so->currPos.firstItem = 0;
-		so->currPos.lastItem = itemIndex - 1;
-		so->currPos.itemIndex = 0;
+		batch->firstItem = 0;
+		batch->lastItem = itemIndex - 1;
 	}
 	else
 	{
-		BlockNumber next_blkno = InvalidBlockNumber;
-
 		for (;;)
 		{
			/* new page, locate starting position by binary search */
 			offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
 
-			itemIndex = _hash_load_qualified_items(scan, page, offnum, dir);
+			itemIndex = _hash_load_qualified_items(scan, page, offnum, dir,
												   batch);
 
 			if (itemIndex != MaxIndexTuplesPerPage)
 				break;
 
 			/*
-			 * Could not find any matching tuples in the current page, move to
-			 * the previous page.  Before leaving the current page, deal with
-			 * any killed items.
+			 * Could not find any matching tuples in the current page, try to
+			 * move to the previous page
 			 */
-			if (so->numKilled > 0)
-				_hash_kill_items(scan);
-
-			if (so->currPos.buf == so->hashso_bucket_buf ||
-				so->currPos.buf == so->hashso_split_bucket_buf)
-				next_blkno = opaque->hasho_nextblkno;
-
 			_hash_readprev(scan, &buf, &page, &opaque);
-			if (BufferIsValid(buf))
+			if (!BufferIsValid(buf))
 			{
-				so->currPos.buf = buf;
-				so->currPos.currPage = BufferGetBlockNumber(buf);
-			}
-			else
-			{
-				/*
-				 * Remember next and previous block numbers for scrollable
-				 * cursors to know the start position and return false
-				 * indicating that no more matching tuples were found.  Also,
-				 * don't reset currPage or lsn, because we expect
-				 * _hash_kill_items to be called for the old page after this
-				 * function returns.
-				 */
-				so->currPos.prevPage = InvalidBlockNumber;
-				so->currPos.nextPage = next_blkno;
-				so->currPos.buf = buf;
+				batch->buf = InvalidBuffer;
 				return false;
 			}
+
+			batch->buf = buf;
+			batch->currPage = BufferGetBlockNumber(buf);
 		}
 
-		so->currPos.firstItem = itemIndex;
-		so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
-		so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
+		batch->firstItem = itemIndex;
+		batch->lastItem = MaxIndexTuplesPerPage - 1;
 	}
 
-	if (so->currPos.buf == so->hashso_bucket_buf ||
-		so->currPos.buf == so->hashso_split_bucket_buf)
+	/*
+	 * Saved at least one match in batch->items[].  Prepare for hashgetbatch
+	 * to return it by initializing remaining uninitialized fields.
+	 */
+	if (batch->buf == so->hashso_bucket_buf ||
+		batch->buf == so->hashso_split_bucket_buf)
 	{
-		so->currPos.prevPage = InvalidBlockNumber;
-		so->currPos.nextPage = opaque->hasho_nextblkno;
-		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+		/*
+		 * Batch's buffer is either the primary bucket, or a bucket being
+		 * populated due to a split.
+		 *
		 * Increment local reference count so that batch gets an independent
		 * buffer reference that can be released (by hashfreebatch) before the
		 * hashso_bucket_buf/hashso_split_bucket_buf references are released.
		 */
+		IncrBufferRefCount(batch->buf);
+
+		/* Can only use opaque->hasho_nextblkno */
+		batch->prevPage = InvalidBlockNumber;
+		batch->nextPage = opaque->hasho_nextblkno;
 	}
 	else
 	{
-		so->currPos.prevPage = opaque->hasho_prevblkno;
-		so->currPos.nextPage = opaque->hasho_nextblkno;
-		_hash_relbuf(rel, so->currPos.buf);
-		so->currPos.buf = InvalidBuffer;
+		/* Can use opaque->hasho_prevblkno and opaque->hasho_nextblkno */
+		batch->prevPage = opaque->hasho_prevblkno;
+		batch->nextPage = opaque->hasho_nextblkno;
 	}
 
-	Assert(so->currPos.firstItem <= so->currPos.lastItem);
+	batch->moreLeft = BlockNumberIsValid(batch->prevPage);
+	batch->moreRight = BlockNumberIsValid(batch->nextPage);
+
+	/* Unlock (and likely unpin) buffer, per amgetbatch contract */
+	indexam_util_batch_unlock(scan, batch);
+
+	Assert(batch->firstItem <= batch->lastItem);
 	return true;
 }
 
 /*
  * Load all the qualified items from a current index page
- * into so->currPos.  Helper function for _hash_readpage.
+ * into batch.  Helper function for _hash_readpage.
 */
 static int
 _hash_load_qualified_items(IndexScanDesc scan, Page page,
-						   OffsetNumber offnum, ScanDirection dir)
+						   OffsetNumber offnum, ScanDirection dir,
+						   BatchIndexScan batch)
 {
 	HashScanOpaque so = (HashScanOpaque) scan->opaque;
 	IndexTuple	itup;
@@ -639,7 +590,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 			_hash_checkqual(scan, itup))
 		{
 			/* tuple is qualified, so remember it */
-			_hash_saveitem(so, itemIndex, offnum, itup);
+			_hash_saveitem(batch, itemIndex, offnum, itup);
 			itemIndex++;
 		}
 		else
@@ -686,7 +637,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 		{
 			itemIndex--;
 			/* tuple is qualified, so remember it */
-			_hash_saveitem(so, itemIndex, offnum, itup);
+			_hash_saveitem(batch, itemIndex, offnum, itup);
 		}
 		else
 		{
@@ -705,13 +656,14 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 	}
 }
 
-/* Save an index item into so->currPos.items[itemIndex] */
+/* Save an index item into batch->items[itemIndex] */
 static inline void
-_hash_saveitem(HashScanOpaque so, int itemIndex,
+_hash_saveitem(BatchIndexScan batch, int itemIndex,
			   OffsetNumber offnum, IndexTuple itup)
 {
-	HashScanPosItem *currItem = &so->currPos.items[itemIndex];
+	BatchMatchingItem *currItem = &batch->items[itemIndex];
 
 	currItem->heapTid = itup->t_tid;
 	currItem->indexOffset = offnum;
+	currItem->tupleOffset = 0;
 }
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index cf7f0b901..f99105d3b 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -510,81 +510,84 @@ _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
 * _hash_kill_items - set LP_DEAD state for items an indexscan caller has
 * told us were killed.
 *
- * scan->opaque, referenced locally through so, contains information about the
- * current page and killed tuples thereon (generally, this should only be
- * called if so->numKilled > 0).
+ * The batch parameter contains information about the current page and killed
+ * tuples thereon (this should only be called if batch->numKilled > 0).
 *
- * The caller does not have a lock on the page and may or may not have the
- * page pinned in a buffer.  Note that read-lock is sufficient for setting
- * LP_DEAD status (which is only a hint).
+ * Caller should not have a lock on the batch position's page, but must hold
+ * a buffer pin when !dropPin.  When we return, the page still won't be
+ * locked; the caller keeps whatever pins it held before calling here.
 *
- * The caller must have pin on bucket buffer, but may or may not have pin
- * on overflow buffer, as indicated by HashScanPosIsPinned(so->currPos).
- *
- * We match items by heap TID before assuming they are the right ones to
- * delete.
- *
- * There are never any scans active in a bucket at the time VACUUM begins,
- * because VACUUM takes a cleanup lock on the primary bucket page and scans
- * hold a pin.  A scan can begin after VACUUM leaves the primary bucket page
- * but before it finishes the entire bucket, but it can never pass VACUUM,
- * because VACUUM always locks the next page before releasing the lock on
- * the previous one.  Therefore, we don't have to worry about accidentally
- * killing a TID that has been reused for an unrelated tuple.
+ * We match items by heap TID before assuming they are the right ones to set
+ * LP_DEAD.  If the scan held a buffer pin on the target page continuously,
+ * from initially reading the items until applying this function (i.e. if it
+ * is a !dropPin scan), VACUUM cannot have deleted any items on the page, so
+ * the page's TIDs can't have been recycled by now.  There's no risk that
+ * we'll confuse a new index tuple that happens to use a recycled TID with a
+ * now-removed tuple with the same TID (that used to be on this same page).
+ * We can't rely on that during scans that drop buffer pins eagerly (i.e.
+ * dropPin scans), though, so for those we must condition setting LP_DEAD
+ * bits on the page LSN having not changed since _hash_readpage read the
+ * page.  We give up on setting LP_DEAD bits entirely if the LSN has changed.
 */
void
-_hash_kill_items(IndexScanDesc scan)
+_hash_kill_items(IndexScanDesc scan, BatchIndexScan batch)
 {
-	HashScanOpaque so = (HashScanOpaque) scan->opaque;
 	Relation	rel = scan->indexRelation;
-	BlockNumber blkno;
 	Buffer		buf;
 	Page		page;
 	HashPageOpaque opaque;
 	OffsetNumber offnum,
				maxoff;
-	int			numKilled = so->numKilled;
-	int			i;
 	bool		killedsomething = false;
-	bool		havePin = false;
 
-	Assert(so->numKilled > 0);
-	Assert(so->killedItems != NULL);
-	Assert(HashScanPosIsValid(so->currPos));
+	Assert(batch->numKilled > 0);
+	Assert(batch->killedItems != NULL);
+	Assert(BlockNumberIsValid(batch->currPage));
 
-	/*
-	 * Always reset the scan state, so we don't look for same items on other
-	 * pages.
-	 */
-	so->numKilled = 0;
-
-	blkno = so->currPos.currPage;
-	if (HashScanPosIsPinned(so->currPos))
+	if (!scan->dropPin)
 	{
 		/*
-		 * We already have pin on this buffer, so, all we need to do is
-		 * acquire lock on it.
+		 * We have held the pin on this page since we read the index tuples,
+		 * so all we need to do is lock it.  The pin will have prevented
+		 * concurrent VACUUMs from recycling any of the TIDs on the page.
 		 */
-		havePin = true;
-		buf = so->currPos.buf;
+		buf = batch->buf;
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 	}
 	else
-		buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
+	{
+		XLogRecPtr	latestlsn;
+
+		Assert(RelationNeedsWAL(rel));
+		buf = _hash_getbuf(rel, batch->currPage, HASH_READ,
+						   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+
+		latestlsn = BufferGetLSNAtomic(buf);
+		Assert(batch->lsn <= latestlsn);
+		if (batch->lsn != latestlsn)
+		{
+			/* Modified, give up on hinting */
+			_hash_relbuf(rel, buf);
+			return;
+		}
+
+		/* Unmodified, hinting is safe */
+	}
 
 	page = BufferGetPage(buf);
 	opaque = HashPageGetOpaque(page);
 	maxoff = PageGetMaxOffsetNumber(page);
 
-	for (i = 0; i < numKilled; i++)
+	/* Iterate through batch->killedItems[] in index page order */
+	for (int i = 0; i < batch->numKilled; i++)
 	{
-		int			itemIndex = so->killedItems[i];
-		HashScanPosItem *currItem = &so->currPos.items[itemIndex];
+		int			itemIndex = batch->killedItems[i];
+		BatchMatchingItem *currItem = &batch->items[itemIndex];
 
 		offnum = currItem->indexOffset;
 
-		Assert(itemIndex >= so->currPos.firstItem &&
-			   itemIndex <= so->currPos.lastItem);
+		Assert(itemIndex >= batch->firstItem &&
+			   itemIndex <= batch->lastItem);
 
 		while (offnum <= maxoff)
 		{
@@ -613,9 +616,8 @@ _hash_kill_items(IndexScanDesc scan)
 		MarkBufferDirtyHint(buf, true);
 	}
 
-	if (so->hashso_bucket_buf == so->currPos.buf ||
-		havePin)
-		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+	if (!scan->dropPin)
+		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 	else
 		_hash_relbuf(rel, buf);
 }
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index a22e6fc4b..7dc2ae9af 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1194,8 +1194,6 @@ HashPageStat
 HashPath
 HashScanOpaque
 HashScanOpaqueData
-HashScanPosData
-HashScanPosItem
 HashSkewBucket
 HashState
 HashValueFunc
-- 
2.51.0
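
For reference, the consumption pattern that amgetbatch-style callers follow
is the one hashgetbitmap uses internally after this patch.  A minimal sketch
(illustrative only -- walk_batches is hypothetical, not part of the patch,
and batch cleanup via hashfreebatch/amfreebatch is elided here):

static int64
walk_batches(IndexScanDesc scan)
{
	BatchIndexScan batch;
	int64		ntids = 0;

	/* no prior batch: _hash_first positions the scan (cf. hashgetbatch) */
	batch = _hash_first(scan, ForwardScanDirection);
	while (batch != NULL)
	{
		/* items[firstItem..lastItem] hold the page's matching entries */
		for (int i = batch->firstItem; i <= batch->lastItem; i++)
			ntids++;			/* batch->items[i].heapTid is one match */

		/* fetch the batch positioned after this one, in the same direction */
		batch = _hash_next(scan, ForwardScanDirection, batch);
	}
	return ntids;
}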