From dc6d30723cefbe065e14812cb6368b58428ac24e Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 25 Nov 2025 18:03:15 -0500 Subject: [PATCH v12 23/23] Make hash index AM use amgetbatch interface. Replace hashgettuple with hashgetbatch, a function that implements the new amgetbatch interface. Plain index scans of hash indexes now return matching items in batches consisting of all of the matches from a given bucket or overflow page. This gives the core executor the ability to perform optimizations like index prefetching during hash index scans. Note that hash index scans will now drop index page buffer pins eagerly (actually, the table AM will do so on behalf of the hash index AM). This is a hard requirement for any index AM that adopts the new amgetbatch interface. Guaranteeing that open batches won't hold buffer pins on index pages greatly simplifies resource management during index prefetching, where the read stream is expected to hold many pins on heap pages (that's why amgetbatch makes this a hard requirement). Also add Valgrind buffer lock instrumentation to hash, bringing it in line with nbtree following commit 4a70f829. This is another requirement when using the amgetbatch interface. Author: Peter Geoghegan Reviewed-By: Tomas Vondra Discussion: https://postgr.es/m/CAH2-WzmYqhacBH161peAWb5eF=Ja7CFAQ+0jSEMq=qnfLVTOOg@mail.gmail.com --- src/include/access/hash.h | 84 ++----- src/backend/access/hash/README | 31 +-- src/backend/access/hash/hash.c | 178 ++++++++------ src/backend/access/hash/hash_xlog.c | 4 +- src/backend/access/hash/hashpage.c | 26 +-- src/backend/access/hash/hashsearch.c | 336 +++++++++++---------------- src/backend/access/hash/hashutil.c | 117 +--------- src/tools/pgindent/typedefs.list | 2 - 8 files changed, 301 insertions(+), 477 deletions(-) diff --git a/src/include/access/hash.h b/src/include/access/hash.h index a8702f0e5..5ea6ebd9f 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -100,57 +100,25 @@ typedef HashPageOpaqueData *HashPageOpaque; */ #define HASHO_PAGE_ID 0xFF80 -typedef struct HashScanPosItem /* what we remember about each match */ +/* + * Per-batch data private to the hash index AM. + * + * Stored at a negative offset from the IndexScanBatch pointer, in the + * index AM opaque area of each batch allocation. + */ +typedef struct HashBatchData { - ItemPointerData heapTid; /* TID of referenced heap item */ - OffsetNumber indexOffset; /* index item's location within page */ -} HashScanPosItem; + BlockNumber currPage; /* index page with matching items */ + BlockNumber prevPage; /* currPage's left link */ + BlockNumber nextPage; /* currPage's right link */ +} HashBatchData; -typedef struct HashScanPosData +/* Access the hash-private per-batch data from an IndexScanBatch pointer */ +static inline HashBatchData * +hash_batch_data(IndexScanBatch batch) { - Buffer buf; /* if valid, the buffer is pinned */ - BlockNumber currPage; /* current hash index page */ - BlockNumber nextPage; /* next overflow page */ - BlockNumber prevPage; /* prev overflow or bucket page */ - - /* - * The items array is always ordered in index order (ie, increasing - * indexoffset). When scanning backwards it is convenient to fill the - * array back-to-front, so we start at the last slot and fill downwards. - * Hence we need both a first-valid-entry and a last-valid-entry counter. - * itemIndex is a cursor showing which entry was last returned to caller. 
- */ - int firstItem; /* first valid index in items[] */ - int lastItem; /* last valid index in items[] */ - int itemIndex; /* current index in items[] */ - - HashScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ -} HashScanPosData; - -#define HashScanPosIsPinned(scanpos) \ -( \ - AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ - !BufferIsValid((scanpos).buf)), \ - BufferIsValid((scanpos).buf) \ -) - -#define HashScanPosIsValid(scanpos) \ -( \ - AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ - !BufferIsValid((scanpos).buf)), \ - BlockNumberIsValid((scanpos).currPage) \ -) - -#define HashScanPosInvalidate(scanpos) \ - do { \ - (scanpos).buf = InvalidBuffer; \ - (scanpos).currPage = InvalidBlockNumber; \ - (scanpos).nextPage = InvalidBlockNumber; \ - (scanpos).prevPage = InvalidBlockNumber; \ - (scanpos).firstItem = 0; \ - (scanpos).lastItem = 0; \ - (scanpos).itemIndex = 0; \ - } while (0) + return (HashBatchData *) ((char *) batch - MAXALIGN(sizeof(HashBatchData))); +} /* * HashScanOpaqueData is private state for a hash index scan. @@ -178,15 +146,6 @@ typedef struct HashScanOpaqueData * referred only when hashso_buc_populated is true. */ bool hashso_buc_split; - /* info about killed items if any (killedItems is NULL if never used) */ - int *killedItems; /* currPos.items indexes of killed items */ - int numKilled; /* number of currently stored items */ - - /* - * Identify all the matching items on a page and save them in - * HashScanPosData - */ - HashScanPosData currPos; /* current position data */ } HashScanOpaqueData; typedef HashScanOpaqueData *HashScanOpaque; @@ -368,11 +327,14 @@ extern bool hashinsert(Relation rel, Datum *values, bool *isnull, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); -extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir); +extern IndexScanBatch hashgetbatch(IndexScanDesc scan, + IndexScanBatch priorbatch, + ScanDirection dir); extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys); extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys); +extern void hashkillitemsbatch(IndexScanDesc scan, IndexScanBatch batch); extern void hashendscan(IndexScanDesc scan); extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, @@ -445,8 +407,9 @@ extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, uint32 lowmask); /* hashsearch.c */ -extern bool _hash_next(IndexScanDesc scan, ScanDirection dir); -extern bool _hash_first(IndexScanDesc scan, ScanDirection dir); +extern IndexScanBatch _hash_next(IndexScanDesc scan, ScanDirection dir, + IndexScanBatch priorbatch); +extern IndexScanBatch _hash_first(IndexScanDesc scan, ScanDirection dir); /* hashsort.c */ typedef struct HSpool HSpool; /* opaque struct in hashsort.c */ @@ -476,7 +439,6 @@ extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bu extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket); extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, uint32 lowmask, uint32 maxbucket); -extern void _hash_kill_items(IndexScanDesc scan); /* hash.c */ extern void hashbucketcleanup(Relation rel, Bucket cur_bucket, diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README index fc9031117..972bb666b 100644 --- a/src/backend/access/hash/README +++ 
b/src/backend/access/hash/README @@ -255,28 +255,29 @@ The reader algorithm is: retake the buffer content lock on new bucket arrange to scan the old bucket normally and the new bucket for tuples which are not moved-by-split --- then, per read request: +-- then, per batch (page) request: reacquire content lock on current page step to next page if necessary (no chaining of content locks, but keep the pin on the primary bucket throughout the scan) - save all the matching tuples from current index page into an items array - release pin and content lock (but if it is primary bucket page retain - its pin till the end of the scan) - get tuple from an item array + save all the matching tuples from current index page into a batch + release content lock on current page return batch to table AM (table AM + will drop batch's buffer pin, though primary bucket page pin is kept + until the end of the scan) -- at scan shutdown: - release all pins still held + release scan-owned pins (e.g., primary bucket page pin) as needed Holding the buffer pin on the primary bucket page for the whole scan prevents -the reader's current-tuple pointer from being invalidated by splits or -compactions. (Of course, other buckets can still be split or compacted.) +the bucket from being reorganized by splits or compactions while the scan is +in progress. (Of course, other buckets can still be split or compacted.) -To minimize lock/unlock traffic, hash index scan always searches the entire -hash page to identify all the matching items at once, copying their heap tuple -IDs into backend-local storage. The heap tuple IDs are then processed while not -holding any page lock within the index thereby, allowing concurrent insertion -to happen on the same index page without any requirement of re-finding the -current scan position for the reader. We do continue to hold a pin on the -bucket page, to protect against concurrent deletions and bucket split. +To minimize lock/unlock traffic, hash index scans always search the entire +hash page to identify all the matching items at once, returning them in +batches to the table AM. The table AM processes batches while no page lock +is held within the index, allowing concurrent insertion to happen on the +same index page without any requirement of re-finding the current scan +position for the reader. The table AM controls when batch buffer pins are +dropped. We do continue to hold a pin on the primary bucket page, to +protect against concurrent bucket splits. To allow for scans during a bucket split, if at the start of the scan, the bucket is marked as bucket-being-populated, it scan all the tuples in that diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 92824aa5d..10524a22a 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -101,9 +101,9 @@ hashhandler(PG_FUNCTION_ARGS) .amadjustmembers = hashadjustmembers, .ambeginscan = hashbeginscan, .amrescan = hashrescan, - .amgettuple = hashgettuple, - .amgetbatch = NULL, - .amkillitemsbatch = NULL, + .amgettuple = NULL, + .amgetbatch = hashgetbatch, + .amkillitemsbatch = hashkillitemsbatch, .amgetbitmap = hashgetbitmap, .amendscan = hashendscan, .amposreset = NULL, @@ -286,53 +286,28 @@ hashinsert(Relation rel, Datum *values, bool *isnull, /* - * hashgettuple() -- Get the next tuple in the scan. 
+ * hashgetbatch() -- Get the first or next batch of tuples in the scan */ -bool -hashgettuple(IndexScanDesc scan, ScanDirection dir) +IndexScanBatch +hashgetbatch(IndexScanDesc scan, IndexScanBatch priorbatch, ScanDirection dir) { HashScanOpaque so = (HashScanOpaque) scan->opaque; - bool res; /* Hash indexes are always lossy since we store only the hash code */ scan->xs_recheck = true; - /* - * If we've already initialized this scan, we can just advance it in the - * appropriate direction. If we haven't done so yet, we call a routine to - * get the first item in the scan. - */ - if (!HashScanPosIsValid(so->currPos)) - res = _hash_first(scan, dir); - else + if (priorbatch == NULL) { - /* - * Check to see if we should kill the previously-fetched tuple. - */ - if (scan->kill_prior_tuple) - { - /* - * Yes, so remember it for later. (We'll deal with all such tuples - * at once right after leaving the index page or at end of scan.) - * In case if caller reverses the indexscan direction it is quite - * possible that the same item might get entered multiple times. - * But, we don't detect that; instead, we just forget any excess - * entries. - */ - if (so->killedItems == NULL) - so->killedItems = palloc_array(int, MaxIndexTuplesPerPage); + Relation rel = scan->indexRelation; - if (so->numKilled < MaxIndexTuplesPerPage) - so->killedItems[so->numKilled++] = so->currPos.itemIndex; - } + _hash_dropscanbuf(rel, so); - /* - * Now continue the scan. - */ - res = _hash_next(scan, dir); + /* Initialize the scan, and return first batch of matching items */ + return _hash_first(scan, dir); } - return res; + /* Return batch positioned after caller's batch (in direction 'dir') */ + return _hash_next(scan, dir, priorbatch); } @@ -342,26 +317,26 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { - HashScanOpaque so = (HashScanOpaque) scan->opaque; - bool res; + IndexScanBatch batch; int64 ntids = 0; - HashScanPosItem *currItem; - res = _hash_first(scan, ForwardScanDirection); + batch = _hash_first(scan, ForwardScanDirection); - while (res) + while (batch != NULL) { - currItem = &so->currPos.items[so->currPos.itemIndex]; + for (int itemIndex = batch->firstItem; + itemIndex <= batch->lastItem; + itemIndex++) + { + tbm_add_tuples(tbm, &batch->items[itemIndex].tableTid, 1, true); + ntids++; + } /* - * _hash_first and _hash_next handle eliminate dead index entries - * whenever scan->ignore_killed_tuples is true. Therefore, there's - * nothing to do here except add the results to the TIDBitmap. 
+ * _hash_next releases the prior batch for bitmap callers before + * allocating the next one, so only one batch is ever used at a time */ - tbm_add_tuples(tbm, &(currItem->heapTid), 1, true); - ntids++; - - res = _hash_next(scan, ForwardScanDirection); + batch = _hash_next(scan, ForwardScanDirection, batch); } return ntids; @@ -383,17 +358,16 @@ hashbeginscan(Relation rel, int nkeys, int norderbys) scan = RelationGetIndexScan(rel, nkeys, norderbys); so = (HashScanOpaque) palloc_object(HashScanOpaqueData); - HashScanPosInvalidate(so->currPos); so->hashso_bucket_buf = InvalidBuffer; so->hashso_split_bucket_buf = InvalidBuffer; so->hashso_buc_populated = false; so->hashso_buc_split = false; - so->killedItems = NULL; - so->numKilled = 0; - scan->opaque = so; + scan->maxitemsbatch = MaxIndexTuplesPerPage; + scan->batch_index_opaque_size = MAXALIGN(sizeof(HashBatchData)); + scan->batch_tuples_workspace = 0; return scan; } @@ -408,18 +382,8 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, HashScanOpaque so = (HashScanOpaque) scan->opaque; Relation rel = scan->indexRelation; - if (HashScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _hash_kill_items(scan); - } - _hash_dropscanbuf(rel, so); - /* set position invalid (this will cause _hash_first call) */ - HashScanPosInvalidate(so->currPos); - /* Update scan key, if a new one is given */ if (scankey && scan->numberOfKeys > 0) memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); @@ -428,6 +392,81 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, so->hashso_buc_split = false; } +/* + * hashkillitemsbatch() -- Mark dead items' index tuples LP_DEAD + */ +void +hashkillitemsbatch(IndexScanDesc scan, IndexScanBatch batch) +{ + Relation rel = scan->indexRelation; + HashBatchData *hbatch = hash_batch_data(batch); + Buffer buf; + Page page; + HashPageOpaque opaque; + OffsetNumber offnum, + maxoff; + bool killedsomething = false; + XLogRecPtr latestlsn; + + Assert(batch->numDead > 0); + + buf = _hash_getbuf(rel, hbatch->currPage, HASH_READ, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + + latestlsn = BufferGetLSNAtomic(buf); + Assert(batch->lsn <= latestlsn); + if (batch->lsn != latestlsn) + { + /* Modified, give up on hinting */ + _hash_relbuf(rel, buf); + return; + } + + page = BufferGetPage(buf); + opaque = HashPageGetOpaque(page); + maxoff = PageGetMaxOffsetNumber(page); + + /* Iterate through batch->deadItems[] in index page order */ + for (int i = 0; i < batch->numDead; i++) + { + int itemIndex = batch->deadItems[i]; + BatchMatchingItem *currItem = &batch->items[itemIndex]; + + offnum = currItem->indexOffset; + + Assert(itemIndex >= batch->firstItem && + itemIndex <= batch->lastItem); + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + + if (ItemPointerEquals(&ituple->t_tid, &currItem->tableTid)) + { + /* found the item */ + ItemIdMarkDead(iid); + killedsomething = true; + break; /* out of inner search loop */ + } + offnum = OffsetNumberNext(offnum); + } + } + + /* + * Since this can be redone later if needed, mark as dirty hint. Whenever + * we mark anything LP_DEAD, we also set the page's + * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint. 
+ */ + if (killedsomething) + { + opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES; + MarkBufferDirtyHint(buf, true); + } + + _hash_relbuf(rel, buf); +} + /* * hashendscan() -- close down a scan */ @@ -437,17 +476,8 @@ hashendscan(IndexScanDesc scan) { HashScanOpaque so = (HashScanOpaque) scan->opaque; Relation rel = scan->indexRelation; - if (HashScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _hash_kill_items(scan); - } - _hash_dropscanbuf(rel, so); - if (so->killedItems != NULL) - pfree(so->killedItems); pfree(so); scan->opaque = NULL; } diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c index 2060620c7..e26ee8bb9 100644 --- a/src/backend/access/hash/hash_xlog.c +++ b/src/backend/access/hash/hash_xlog.c @@ -1141,14 +1141,14 @@ hash_mask(char *pagedata, BlockNumber blkno) /* * In hash bucket and overflow pages, it is possible to modify the * LP_FLAGS without emitting any WAL record. Hence, mask the line - * pointer flags. See hashgettuple(), _hash_kill_items() for details. + * pointer flags. See hashkillitemsbatch() for details. */ mask_lp_flags(page); } /* * It is possible that the hint bit LH_PAGE_HAS_DEAD_TUPLES may remain - * unlogged. So, mask it. See _hash_kill_items() for details. + * unlogged. So, mask it. See hashkillitemsbatch() for details. */ opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; } diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 263bc73f1..2bc8bea22 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -35,6 +35,7 @@ #include "port/pg_bitutils.h" #include "storage/predicate.h" #include "storage/smgr.h" +#include "utils/memdebug.h" #include "utils/rel.h" static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, @@ -79,6 +80,9 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags) if (access != HASH_NOLOCK) LockBuffer(buf, access); + if (!RelationUsesLocalBuffers(rel)) + VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ); + /* ref count and lock type are correct */ _hash_checkpage(rel, buf, flags); @@ -108,6 +112,9 @@ _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags) return InvalidBuffer; } + if (!RelationUsesLocalBuffers(rel)) + VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ); + /* ref count and lock type are correct */ _hash_checkpage(rel, buf, flags); @@ -280,31 +287,24 @@ _hash_dropbuf(Relation rel, Buffer buf) } /* - * _hash_dropscanbuf() -- release buffers used in scan. + * _hash_dropscanbuf() -- release buffers owned by scan. * - * This routine unpins the buffers used during scan on which we - * hold no lock. + * This routine unpins the buffer for the primary bucket page and, as + * needed, the buffer for the bucket page of a bucket being split.
*/ void _hash_dropscanbuf(Relation rel, HashScanOpaque so) { /* release pin we hold on primary bucket page */ - if (BufferIsValid(so->hashso_bucket_buf) && - so->hashso_bucket_buf != so->currPos.buf) + if (BufferIsValid(so->hashso_bucket_buf)) _hash_dropbuf(rel, so->hashso_bucket_buf); so->hashso_bucket_buf = InvalidBuffer; - /* release pin we hold on primary bucket page of bucket being split */ - if (BufferIsValid(so->hashso_split_bucket_buf) && - so->hashso_split_bucket_buf != so->currPos.buf) + /* release pin held on primary bucket page of bucket being split */ + if (BufferIsValid(so->hashso_split_bucket_buf)) _hash_dropbuf(rel, so->hashso_split_bucket_buf); so->hashso_split_bucket_buf = InvalidBuffer; - /* release any pin we still hold */ - if (BufferIsValid(so->currPos.buf)) - _hash_dropbuf(rel, so->currPos.buf); - so->currPos.buf = InvalidBuffer; - /* reset split scan */ so->hashso_buc_populated = false; so->hashso_buc_split = false; diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 89d1c5bc6..a878372f9 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -22,105 +22,87 @@ #include "storage/predicate.h" #include "utils/rel.h" -static bool _hash_readpage(IndexScanDesc scan, Buffer *bufP, - ScanDirection dir); +static bool _hash_readpage(IndexScanDesc scan, Buffer buf, ScanDirection dir, + IndexScanBatch batch); static int _hash_load_qualified_items(IndexScanDesc scan, Page page, - OffsetNumber offnum, ScanDirection dir); -static inline void _hash_saveitem(HashScanOpaque so, int itemIndex, + OffsetNumber offnum, ScanDirection dir, + IndexScanBatch batch); +static inline void _hash_saveitem(IndexScanBatch batch, int itemIndex, OffsetNumber offnum, IndexTuple itup); static void _hash_readnext(IndexScanDesc scan, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep); /* - * _hash_next() -- Get the next item in a scan. + * _hash_next() -- Get the next batch of items in a scan. * - * On entry, so->currPos describes the current page, which may - * be pinned but not locked, and so->currPos.itemIndex identifies - * which item was previously returned. + * On entry, priorbatch is the batch for the current page, whose items + * have already been returned. * - * On successful exit, scan->xs_heaptid is set to the TID of the next - * heap tuple. so->currPos is updated as needed. + * On successful exit, returns a batch containing matching items from + * the next page. Otherwise returns NULL, indicating that there are no + * further matches. No locks are ever held when we return. * - * On failure exit (no more tuples), we return false with pin - * held on bucket page but no pins or locks held on overflow - * page. + * Retains pins according to the same rules as _hash_first. */ -bool -_hash_next(IndexScanDesc scan, ScanDirection dir) +IndexScanBatch +_hash_next(IndexScanDesc scan, ScanDirection dir, IndexScanBatch priorbatch) { Relation rel = scan->indexRelation; HashScanOpaque so = (HashScanOpaque) scan->opaque; - HashScanPosItem *currItem; + HashBatchData *hpriorbatch = hash_batch_data(priorbatch); BlockNumber blkno; Buffer buf; - bool end_of_scan = false; + IndexScanBatch batch; /* - * Advance to the next tuple on the current page; or if done, try to read - * data from the next or previous page based on the scan direction. Before - * moving to the next or previous page make sure that we deal with all the - * killed items.
+ * Determine which page to read next based on scan direction and details + * taken from the prior batch */ if (ScanDirectionIsForward(dir)) - { - if (++so->currPos.itemIndex > so->currPos.lastItem) - { - if (so->numKilled > 0) - _hash_kill_items(scan); + blkno = hpriorbatch->nextPage; + else + blkno = hpriorbatch->prevPage; - blkno = so->currPos.nextPage; - if (BlockNumberIsValid(blkno)) - { - buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); - if (!_hash_readpage(scan, &buf, dir)) - end_of_scan = true; - } - else - end_of_scan = true; - } - } + /* + * For bitmap scan callers, release the prior batch now so that the + * allocation below can reuse its memory. This way bitmap scans never + * need more than one batch allocation. + */ + if (!scan->usebatchring) + indexam_util_batch_release(scan, priorbatch); + + if (!BlockNumberIsValid(blkno)) + return NULL; + + /* Allocate space for the next batch */ + batch = indexam_util_batch_alloc(scan); + + /* Get the buffer for the next batch */ + if (ScanDirectionIsForward(dir)) + buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); else { - if (--so->currPos.itemIndex < so->currPos.firstItem) - { - if (so->numKilled > 0) - _hash_kill_items(scan); + buf = _hash_getbuf(rel, blkno, HASH_READ, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); - blkno = so->currPos.prevPage; - if (BlockNumberIsValid(blkno)) - { - buf = _hash_getbuf(rel, blkno, HASH_READ, - LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); - - /* - * We always maintain the pin on bucket page for whole scan - * operation, so releasing the additional pin we have acquired - * here. - */ - if (buf == so->hashso_bucket_buf || - buf == so->hashso_split_bucket_buf) - _hash_dropbuf(rel, buf); - - if (!_hash_readpage(scan, &buf, dir)) - end_of_scan = true; - } - else - end_of_scan = true; - } + /* + * We always maintain the pin on the bucket page for the whole scan, + * so release the additional pin acquired here. + */ + if (buf == so->hashso_bucket_buf || + buf == so->hashso_split_bucket_buf) + _hash_dropbuf(rel, buf); } - if (end_of_scan) + /* Read the next page and load items into the allocated batch */ + if (!_hash_readpage(scan, buf, dir, batch)) { - _hash_dropscanbuf(rel, so); - HashScanPosInvalidate(so->currPos); - return false; + indexam_util_batch_release(scan, batch); + return NULL; } - /* OK, itemIndex says what to return */ - currItem = &so->currPos.items[so->currPos.itemIndex]; - scan->xs_heaptid = currItem->heapTid; - - return true; + /* Return the batch containing matched items from the next page */ + return batch; } /* @@ -270,22 +252,20 @@ _hash_readprev(IndexScanDesc scan, } /* - * _hash_first() -- Find the first item in a scan. + * _hash_first() -- Find the first batch of items in a scan. * - * We find the first item (or, if backward scan, the last item) in the - * index that satisfies the qualification associated with the scan - * descriptor. + * We find the first batch of items (or, if backward scan, the last + * batch) in the index that satisfy the qualification associated with + * the scan descriptor. * - * On successful exit, if the page containing current index tuple is an - * overflow page, both pin and lock are released whereas if it is a bucket - * page then it is pinned but not locked and data about the matching - * tuple(s) on the page has been loaded into so->currPos, - * scan->xs_heaptid is set to the heap TID of the current tuple. + * On successful exit, returns a batch containing matching items. + * Otherwise returns NULL, indicating that there are no matches.
+ * No locks are ever held when we return. * - * On failure exit (no more tuples), we return false, with pin held on - * bucket page but no pins or locks held on overflow page. + * We always retain our own pin on the bucket page. When we return a + * batch with a bucket page, it will retain its own reference pin. */ -bool +IndexScanBatch _hash_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; @@ -296,7 +276,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) Buffer buf; Page page; HashPageOpaque opaque; - HashScanPosItem *currItem; + IndexScanBatch batch; pgstat_count_index_scan(rel); if (scan->instrument) @@ -326,7 +306,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) * items in the index. */ if (cur->sk_flags & SK_ISNULL) - return false; + return NULL; /* * Okay to compute the hash key. We want to do this before acquiring any @@ -419,191 +399,158 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) _hash_readnext(scan, &buf, &page, &opaque); } - /* remember which buffer we have pinned, if any */ - Assert(BufferIsInvalid(so->currPos.buf)); - so->currPos.buf = buf; + /* Allocate space for first batch */ + batch = indexam_util_batch_alloc(scan); - /* Now find all the tuples satisfying the qualification from a page */ - if (!_hash_readpage(scan, &buf, dir)) - return false; + /* Read the first page and load items into allocated batch */ + if (!_hash_readpage(scan, buf, dir, batch)) + { + indexam_util_batch_release(scan, batch); + return NULL; + } - /* OK, itemIndex says what to return */ - currItem = &so->currPos.items[so->currPos.itemIndex]; - scan->xs_heaptid = currItem->heapTid; - - /* if we're here, _hash_readpage found a valid tuples */ - return true; + /* Return the batch containing matched items */ + return batch; } /* - * _hash_readpage() -- Load data from current index page into so->currPos + * _hash_readpage() -- Load data from current index page into batch * * We scan all the items in the current index page and save them into - * so->currPos if it satisfies the qualification. If no matching items + * the batch if they satisfy the qualification. If no matching items * are found in the current page, we move to the next or previous page * in a bucket chain as indicated by the direction. * * Return true if any matching items are found else return false. */ static bool -_hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) +_hash_readpage(IndexScanDesc scan, Buffer buf, ScanDirection dir, + IndexScanBatch batch) { Relation rel = scan->indexRelation; HashScanOpaque so = (HashScanOpaque) scan->opaque; - Buffer buf; + HashBatchData *hbatch = hash_batch_data(batch); Page page; HashPageOpaque opaque; OffsetNumber offnum; uint16 itemIndex; - buf = *bufP; Assert(BufferIsValid(buf)); _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); page = BufferGetPage(buf); opaque = HashPageGetOpaque(page); - so->currPos.buf = buf; - so->currPos.currPage = BufferGetBlockNumber(buf); + batch->buf = buf; + hbatch->currPage = BufferGetBlockNumber(buf); + batch->dir = dir; if (ScanDirectionIsForward(dir)) { - BlockNumber prev_blkno = InvalidBlockNumber; - for (;;) { /* new page, locate starting position by binary search */ offnum = _hash_binsearch(page, so->hashso_sk_hash); - itemIndex = _hash_load_qualified_items(scan, page, offnum, dir); + itemIndex = _hash_load_qualified_items(scan, page, offnum, dir, + batch); if (itemIndex != 0) break; /* - * Could not find any matching tuples in the current page, move to - * the next page. 
Before leaving the current page, deal with any - * killed items. + * Could not find any matching tuples in the current page, try to + * move to the next page */ - if (so->numKilled > 0) - _hash_kill_items(scan); - - /* - * If this is a primary bucket page, hasho_prevblkno is not a real - * block number. - */ - if (so->currPos.buf == so->hashso_bucket_buf || - so->currPos.buf == so->hashso_split_bucket_buf) - prev_blkno = InvalidBlockNumber; - else - prev_blkno = opaque->hasho_prevblkno; - _hash_readnext(scan, &buf, &page, &opaque); - if (BufferIsValid(buf)) + if (!BufferIsValid(buf)) { - so->currPos.buf = buf; - so->currPos.currPage = BufferGetBlockNumber(buf); - } - else - { - /* - * Remember next and previous block numbers for scrollable - * cursors to know the start position and return false - * indicating that no more matching tuples were found. Also, - * don't reset currPage or lsn, because we expect - * _hash_kill_items to be called for the old page after this - * function returns. - */ - so->currPos.prevPage = prev_blkno; - so->currPos.nextPage = InvalidBlockNumber; - so->currPos.buf = buf; + batch->buf = InvalidBuffer; return false; } + + batch->buf = buf; + hbatch->currPage = BufferGetBlockNumber(buf); } - so->currPos.firstItem = 0; - so->currPos.lastItem = itemIndex - 1; - so->currPos.itemIndex = 0; + batch->firstItem = 0; + batch->lastItem = itemIndex - 1; } else { - BlockNumber next_blkno = InvalidBlockNumber; - for (;;) { /* new page, locate starting position by binary search */ offnum = _hash_binsearch_last(page, so->hashso_sk_hash); - itemIndex = _hash_load_qualified_items(scan, page, offnum, dir); + itemIndex = _hash_load_qualified_items(scan, page, offnum, dir, + batch); if (itemIndex != MaxIndexTuplesPerPage) break; /* - * Could not find any matching tuples in the current page, move to - * the previous page. Before leaving the current page, deal with - * any killed items. + * Could not find any matching tuples in the current page, try to + * move to the previous page */ - if (so->numKilled > 0) - _hash_kill_items(scan); - - if (so->currPos.buf == so->hashso_bucket_buf || - so->currPos.buf == so->hashso_split_bucket_buf) - next_blkno = opaque->hasho_nextblkno; - _hash_readprev(scan, &buf, &page, &opaque); - if (BufferIsValid(buf)) + if (!BufferIsValid(buf)) { - so->currPos.buf = buf; - so->currPos.currPage = BufferGetBlockNumber(buf); - } - else - { - /* - * Remember next and previous block numbers for scrollable - * cursors to know the start position and return false - * indicating that no more matching tuples were found. Also, - * don't reset currPage or lsn, because we expect - * _hash_kill_items to be called for the old page after this - * function returns. - */ - so->currPos.prevPage = InvalidBlockNumber; - so->currPos.nextPage = next_blkno; - so->currPos.buf = buf; + batch->buf = InvalidBuffer; return false; } + + batch->buf = buf; + hbatch->currPage = BufferGetBlockNumber(buf); } - so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxIndexTuplesPerPage - 1; - so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; + batch->firstItem = itemIndex; + batch->lastItem = MaxIndexTuplesPerPage - 1; } - if (so->currPos.buf == so->hashso_bucket_buf || - so->currPos.buf == so->hashso_split_bucket_buf) + /* + * Saved at least one match in batch.items[]. Prepare for hashgetbatch to + * return it by initializing remaining uninitialized fields. 
+ */ + if (batch->buf == so->hashso_bucket_buf || + batch->buf == so->hashso_split_bucket_buf) { - so->currPos.prevPage = InvalidBlockNumber; - so->currPos.nextPage = opaque->hasho_nextblkno; - LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + /* + * Batch's buffer is either the primary bucket, or a bucket being + * populated due to a split. + * + * Increment local reference count so that batch gets an independent + * buffer reference that can be released (by the core code/table AM) + * before the hashso_bucket_buf/hashso_split_bucket_buf references are + * released. + */ + IncrBufferRefCount(batch->buf); + + /* Can only use opaque->hasho_nextblkno */ + hbatch->prevPage = InvalidBlockNumber; + hbatch->nextPage = opaque->hasho_nextblkno; } else { - so->currPos.prevPage = opaque->hasho_prevblkno; - so->currPos.nextPage = opaque->hasho_nextblkno; - _hash_relbuf(rel, so->currPos.buf); - so->currPos.buf = InvalidBuffer; + /* Can use opaque->hasho_prevblkno and opaque->hasho_nextblkno */ + hbatch->prevPage = opaque->hasho_prevblkno; + hbatch->nextPage = opaque->hasho_nextblkno; } - Assert(so->currPos.firstItem <= so->currPos.lastItem); + /* we saved one or more matches in batch.items[] */ + indexam_util_batch_unlock(scan, batch); + + Assert(batch->firstItem <= batch->lastItem); return true; } /* * Load all the qualified items from a current index page - * into so->currPos. Helper function for _hash_readpage. + * into batch. Helper function for _hash_readpage. */ static int _hash_load_qualified_items(IndexScanDesc scan, Page page, - OffsetNumber offnum, ScanDirection dir) + OffsetNumber offnum, ScanDirection dir, + IndexScanBatch batch) { HashScanOpaque so = (HashScanOpaque) scan->opaque; IndexTuple itup; @@ -640,7 +587,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page, _hash_checkqual(scan, itup)) { /* tuple is qualified, so remember it */ - _hash_saveitem(so, itemIndex, offnum, itup); + _hash_saveitem(batch, itemIndex, offnum, itup); itemIndex++; } else @@ -687,7 +634,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page, { itemIndex--; /* tuple is qualified, so remember it */ - _hash_saveitem(so, itemIndex, offnum, itup); + _hash_saveitem(batch, itemIndex, offnum, itup); } else { @@ -706,13 +653,14 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page, } } -/* Save an index item into so->currPos.items[itemIndex] */ +/* Save an index item into batch->items[itemIndex] */ static inline void -_hash_saveitem(HashScanOpaque so, int itemIndex, +_hash_saveitem(IndexScanBatch batch, int itemIndex, OffsetNumber offnum, IndexTuple itup) { - HashScanPosItem *currItem = &so->currPos.items[itemIndex]; + BatchMatchingItem *currItem = &batch->items[itemIndex]; - currItem->heapTid = itup->t_tid; + currItem->tableTid = itup->t_tid; currItem->indexOffset = offnum; + currItem->tupleOffset = 0; } diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index cf7f0b901..331d5f4da 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -16,7 +16,6 @@ #include "access/hash.h" #include "access/reloptions.h" -#include "access/relscan.h" #include "port/pg_bitutils.h" #include "utils/lsyscache.h" #include "utils/rel.h" @@ -33,7 +32,7 @@ _hash_checkqual(IndexScanDesc scan, IndexTuple itup) /* * Currently, we can't check any of the scan conditions since we do not * have the original index entry value to supply to the sk_func. 
Always - * return true; we expect that hashgettuple already set the recheck flag + * return true; we expect that hashgetbatch already set the recheck flag * to make the main indexscan code do it. */ #ifdef NOT_USED @@ -505,117 +504,3 @@ _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, return new_bucket; } - -/* - * _hash_kill_items - set LP_DEAD state for items an indexscan caller has - * told us were killed. - * - * scan->opaque, referenced locally through so, contains information about the - * current page and killed tuples thereon (generally, this should only be - * called if so->numKilled > 0). - * - * The caller does not have a lock on the page and may or may not have the - * page pinned in a buffer. Note that read-lock is sufficient for setting - * LP_DEAD status (which is only a hint). - * - * The caller must have pin on bucket buffer, but may or may not have pin - * on overflow buffer, as indicated by HashScanPosIsPinned(so->currPos). - * - * We match items by heap TID before assuming they are the right ones to - * delete. - * - * There are never any scans active in a bucket at the time VACUUM begins, - * because VACUUM takes a cleanup lock on the primary bucket page and scans - * hold a pin. A scan can begin after VACUUM leaves the primary bucket page - * but before it finishes the entire bucket, but it can never pass VACUUM, - * because VACUUM always locks the next page before releasing the lock on - * the previous one. Therefore, we don't have to worry about accidentally - * killing a TID that has been reused for an unrelated tuple. - */ -void -_hash_kill_items(IndexScanDesc scan) -{ - HashScanOpaque so = (HashScanOpaque) scan->opaque; - Relation rel = scan->indexRelation; - BlockNumber blkno; - Buffer buf; - Page page; - HashPageOpaque opaque; - OffsetNumber offnum, - maxoff; - int numKilled = so->numKilled; - int i; - bool killedsomething = false; - bool havePin = false; - - Assert(so->numKilled > 0); - Assert(so->killedItems != NULL); - Assert(HashScanPosIsValid(so->currPos)); - - /* - * Always reset the scan state, so we don't look for same items on other - * pages. - */ - so->numKilled = 0; - - blkno = so->currPos.currPage; - if (HashScanPosIsPinned(so->currPos)) - { - /* - * We already have pin on this buffer, so, all we need to do is - * acquire lock on it. - */ - havePin = true; - buf = so->currPos.buf; - LockBuffer(buf, BUFFER_LOCK_SHARE); - } - else - buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); - - page = BufferGetPage(buf); - opaque = HashPageGetOpaque(page); - maxoff = PageGetMaxOffsetNumber(page); - - for (i = 0; i < numKilled; i++) - { - int itemIndex = so->killedItems[i]; - HashScanPosItem *currItem = &so->currPos.items[itemIndex]; - - offnum = currItem->indexOffset; - - Assert(itemIndex >= so->currPos.firstItem && - itemIndex <= so->currPos.lastItem); - - while (offnum <= maxoff) - { - ItemId iid = PageGetItemId(page, offnum); - IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); - - if (ItemPointerEquals(&ituple->t_tid, &currItem->heapTid)) - { - /* found the item */ - ItemIdMarkDead(iid); - killedsomething = true; - break; /* out of inner search loop */ - } - offnum = OffsetNumberNext(offnum); - } - } - - /* - * Since this can be redone later if needed, mark as dirty hint. Whenever - * we mark anything LP_DEAD, we also set the page's - * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint. 
- */ - if (killedsomething) - { - opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES; - MarkBufferDirtyHint(buf, true); - } - - if (so->hashso_bucket_buf == so->currPos.buf || - havePin) - LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); - else - _hash_relbuf(rel, buf); -} diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index cb57bf71f..e54b6e2cc 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1205,8 +1205,6 @@ HashPageStat HashPath HashScanOpaque HashScanOpaqueData -HashScanPosData -HashScanPosItem HashSkewBucket HashState HashValueFunc -- 2.53.0