From 06b8550323f7f8048d1a4ae276080f7b33820501 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Tue, 9 Sep 2025 19:50:03 -0400
Subject: [PATCH v4 1/4] Add batching interfaces used by heapam and nbtree.

Add a new amgetbatch index AM interface that allows index access methods
to implement plain/ordered index scans that return index entries in
per-leaf-page batches, rather than one at a time.  This enables a variety
of optimizations on the table AM side, most notably I/O prefetching of
heap tuples during ordered index scans.  It will also enable an
optimization that has heapam avoid repeatedly locking and unlocking the
same heap page's buffer.

Index access methods that support plain index scans must now implement
either the amgetbatch interface OR the amgettuple interface.  The
amgettuple interface will still be used by index AMs that require direct
control over the progress of index scans (e.g., GiST with KNN ordered
scans).

This commit also adds a new table AM interface callback, called by the
core executor through the new table_index_getnext_slot shim function.
This allows the table AM to directly manage the progress of index scans
rather than having individual TIDs passed in by the caller.

The amgetbatch interface is tightly coupled with the new approach to
ordered index scans added to the table AM.  The table AM can apply
knowledge of which TIDs the scan will return in the near future to
optimize and batch table AM block accesses, and to perform I/O
prefetching.  These optimizations are left as work for later commits.

Batches returned from amgetbatch are guaranteed to be associated with an
index page containing at least one matching tuple.  The amgetbatch
interface may hold buffer pins as interlocks against concurrent TID
recycling by VACUUM.  This extends/generalizes the mechanism added to
nbtree by commit 2ed5b87f to all index AMs that add support for the new
amgetbatch interface.
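As a rough illustration of the intended call pattern (this sketch is not
part of the patch itself; the local variables -- heapRel, indexRel,
snapshot, scankeys, nkeys, slot, process_tuple -- are placeholders, and
recheck of lossy quals, error handling, and instrumentation are omitted),
a plain index scan caller now drives the scan through the table AM
instead of fetching TIDs itself:

    IndexScanDesc scan;

    /* plain (not index-only) scan: pass NULL for ios_tableslot */
    scan = index_beginscan(heapRel, indexRel, NULL, snapshot,
                           NULL, nkeys, 0);
    index_rescan(scan, scankeys, nkeys, NULL, 0);

    /* table AM pulls TIDs from the index and returns visible tuples */
    while (table_index_getnext_slot(scan, ForwardScanDirection, slot))
    {
        /* caller must still check scan->xs_recheck before using the tuple */
        process_tuple(slot);    /* placeholder for per-tuple work */
    }

    index_endscan(scan);

With an amgetbatch-capable index AM, index_beginscan initializes the
batch queue and heapam's index_getnext_slot callback consumes it; with an
amgettuple-only AM, the same loop falls back to fetching one TID at a
time.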
Author: Tomas Vondra Author: Peter Geoghegan Reviewed-By: Andres Freund Reviewed-By: Thomas Munro Discussion: https://postgr.es/m/cf85f46f-b02f-05b2-5248-5000b894ebab@enterprisedb.com Discussion: https://postgr.es/m/efac3238-6f34-41ea-a393-26cc0441b506%40vondra.me Discussion: https://postgr.es/m/CAH2-Wzk9%3Dx%3Da2TbcqYcX%2BXXmDHQr5%3D1v9m4Z_v8a-KwF1Zoz0A%40mail.gmail.com --- src/include/access/amapi.h | 22 +- src/include/access/genam.h | 27 +- src/include/access/heapam.h | 5 + src/include/access/nbtree.h | 176 +---- src/include/access/relscan.h | 244 +++++++ src/include/access/tableam.h | 57 +- src/include/nodes/execnodes.h | 2 - src/include/nodes/pathnodes.h | 2 +- src/backend/access/brin/brin.c | 5 +- src/backend/access/gin/ginget.c | 6 +- src/backend/access/gin/ginutil.c | 5 +- src/backend/access/gist/gist.c | 5 +- src/backend/access/hash/hash.c | 5 +- src/backend/access/heap/heapam_handler.c | 512 +++++++++++++- src/backend/access/index/Makefile | 3 +- src/backend/access/index/genam.c | 10 +- src/backend/access/index/indexam.c | 130 +--- src/backend/access/index/indexbatch.c | 640 ++++++++++++++++++ src/backend/access/index/meson.build | 1 + src/backend/access/nbtree/nbtpage.c | 3 + src/backend/access/nbtree/nbtreadpage.c | 196 +++--- src/backend/access/nbtree/nbtree.c | 306 ++------- src/backend/access/nbtree/nbtsearch.c | 511 +++++--------- src/backend/access/nbtree/nbtutils.c | 70 +- src/backend/access/spgist/spgutils.c | 5 +- src/backend/access/table/tableam.c | 2 +- src/backend/commands/constraint.c | 3 +- src/backend/commands/indexcmds.c | 2 +- src/backend/executor/execAmi.c | 2 +- src/backend/executor/execIndexing.c | 6 +- src/backend/executor/execReplication.c | 8 +- src/backend/executor/nodeIndexonlyscan.c | 100 +-- src/backend/executor/nodeIndexscan.c | 12 +- src/backend/optimizer/path/indxpath.c | 2 +- src/backend/optimizer/util/plancat.c | 6 +- src/backend/replication/logical/relation.c | 3 +- src/backend/utils/adt/amutils.c | 4 +- src/backend/utils/adt/selfuncs.c | 57 +- contrib/bloom/blutils.c | 3 +- doc/src/sgml/indexam.sgml | 310 +++++++-- doc/src/sgml/ref/create_table.sgml | 13 +- .../modules/dummy_index_am/dummy_index_am.c | 4 +- src/tools/pgindent/typedefs.list | 4 - 43 files changed, 2295 insertions(+), 1194 deletions(-) create mode 100644 src/backend/access/index/indexbatch.c diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 63dd41c1f..a7eb33ce9 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -198,6 +198,15 @@ typedef void (*amrescan_function) (IndexScanDesc scan, typedef bool (*amgettuple_function) (IndexScanDesc scan, ScanDirection direction); +/* next batch of valid tuples */ +typedef BatchIndexScan (*amgetbatch_function) (IndexScanDesc scan, + BatchIndexScan priorbatch, + ScanDirection direction); + +/* release batch of valid tuples */ +typedef void (*amfreebatch_function) (IndexScanDesc scan, + BatchIndexScan batch); + /* fetch all valid tuples */ typedef int64 (*amgetbitmap_function) (IndexScanDesc scan, TIDBitmap *tbm); @@ -205,11 +214,9 @@ typedef int64 (*amgetbitmap_function) (IndexScanDesc scan, /* end index scan */ typedef void (*amendscan_function) (IndexScanDesc scan); -/* mark current scan position */ -typedef void (*ammarkpos_function) (IndexScanDesc scan); - -/* restore marked scan position */ -typedef void (*amrestrpos_function) (IndexScanDesc scan); +/* invalidate index AM state that independently tracks scan's position */ +typedef void (*amposreset_function) (IndexScanDesc scan, + 
BatchIndexScan batch); /* * Callback function signatures - for parallel index scans. @@ -309,10 +316,11 @@ typedef struct IndexAmRoutine ambeginscan_function ambeginscan; amrescan_function amrescan; amgettuple_function amgettuple; /* can be NULL */ + amgetbatch_function amgetbatch; /* can be NULL */ + amfreebatch_function amfreebatch; /* can be NULL */ amgetbitmap_function amgetbitmap; /* can be NULL */ amendscan_function amendscan; - ammarkpos_function ammarkpos; /* can be NULL */ - amrestrpos_function amrestrpos; /* can be NULL */ + amposreset_function amposreset; /* can be NULL */ /* interface functions to support parallel index scans */ amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 9200a22bd..f1984e700 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -115,6 +115,7 @@ typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state); /* struct definitions appear in relscan.h */ typedef struct IndexScanDescData *IndexScanDesc; +typedef struct BatchIndexScanData *BatchIndexScan; typedef struct SysScanDescData *SysScanDesc; typedef struct ParallelIndexScanDescData *ParallelIndexScanDesc; @@ -175,6 +176,7 @@ extern void index_insert_cleanup(Relation indexRelation, extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, + TupleTableSlot *ios_tableslot, Snapshot snapshot, IndexScanInstrumentation *instrument, int nkeys, int norderbys); @@ -201,14 +203,12 @@ extern void index_parallelscan_initialize(Relation heapRelation, extern void index_parallelrescan(IndexScanDesc scan); extern IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, + TupleTableSlot *ios_tableslot, IndexScanInstrumentation *instrument, int nkeys, int norderbys, ParallelIndexScanDesc pscan); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); -extern bool index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot); -extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction, - TupleTableSlot *slot); extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap); extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info, @@ -272,4 +272,25 @@ extern void systable_inplace_update_begin(Relation relation, extern void systable_inplace_update_finish(void *state, HeapTuple tuple); extern void systable_inplace_update_cancel(void *state); +/* + * amgetbatch utilities called by indexam.c (in indexbatch.c) + */ +struct BatchQueueItemPos; +extern void index_batch_init(IndexScanDesc scan); +extern bool batch_getnext(IndexScanDesc scan, ScanDirection direction); +extern void batch_free(IndexScanDesc scan, BatchIndexScan batch); +extern void index_batch_reset(IndexScanDesc scan, bool complete); +extern void index_batch_mark_pos(IndexScanDesc scan); +extern void index_batch_restore_pos(IndexScanDesc scan); +extern void index_batch_kill_item(IndexScanDesc scan); +extern void index_batch_end(IndexScanDesc scan); + +/* + * amgetbatch utilities called by index AMs (in indexbatch.c) + */ +extern void indexam_util_batch_unlock(IndexScanDesc scan, BatchIndexScan batch); +extern BatchIndexScan indexam_util_batch_alloc(IndexScanDesc scan, + int maxitems, bool want_itup); +extern void indexam_util_batch_release(IndexScanDesc scan, BatchIndexScan batch); + #endif /* GENAM_H */ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 632c4332a..e8d347e47 100644 --- 
a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -118,6 +118,11 @@ typedef struct IndexFetchHeapData Buffer xs_cbuf; /* current heap buffer in scan, if any */ /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ + + Buffer vmbuf; /* visibility map buffer */ + TupleTableSlot *ios_tableslot; /* transient slot for fetching tuples to + * check visibility during index-only + * scans */ } IndexFetchHeapData; /* Result codes for HeapTupleSatisfiesVacuum */ diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 7a3efd209..658c46a1f 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -924,112 +924,6 @@ typedef struct BTVacuumPostingData typedef BTVacuumPostingData *BTVacuumPosting; -/* - * BTScanOpaqueData is the btree-private state needed for an indexscan. - * This consists of preprocessed scan keys (see _bt_preprocess_keys() for - * details of the preprocessing), information about the current location - * of the scan, and information about the marked location, if any. (We use - * BTScanPosData to represent the data needed for each of current and marked - * locations.) In addition we can remember some known-killed index entries - * that must be marked before we can move off the current page. - * - * Index scans work a page at a time: we pin and read-lock the page, identify - * all the matching items on the page and save them in BTScanPosData, then - * release the read-lock while returning the items to the caller for - * processing. This approach minimizes lock/unlock traffic. We must always - * drop the lock to make it okay for caller to process the returned items. - * Whether or not we can also release the pin during this window will vary. - * We drop the pin (when so->dropPin) to avoid blocking progress by VACUUM - * (see nbtree/README section about making concurrent TID recycling safe). - * We'll always release both the lock and the pin on the current page before - * moving on to its sibling page. - * - * If we are doing an index-only scan, we save the entire IndexTuple for each - * matched item, otherwise only its heap TID and offset. The IndexTuples go - * into a separate workspace array; each BTScanPosItem stores its tuple's - * offset within that array. Posting list tuples store a "base" tuple once, - * allowing the same key to be returned for each TID in the posting list - * tuple. - */ - -typedef struct BTScanPosItem /* what we remember about each match */ -{ - ItemPointerData heapTid; /* TID of referenced heap item */ - OffsetNumber indexOffset; /* index item's location within page */ - LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */ -} BTScanPosItem; - -typedef struct BTScanPosData -{ - Buffer buf; /* currPage buf (invalid means unpinned) */ - - /* page details as of the saved position's call to _bt_readpage */ - BlockNumber currPage; /* page referenced by items array */ - BlockNumber prevPage; /* currPage's left link */ - BlockNumber nextPage; /* currPage's right link */ - XLogRecPtr lsn; /* currPage's LSN (when so->dropPin) */ - - /* scan direction for the saved position's call to _bt_readpage */ - ScanDirection dir; - - /* - * If we are doing an index-only scan, nextTupleOffset is the first free - * location in the associated tuple storage workspace. - */ - int nextTupleOffset; - - /* - * moreLeft and moreRight track whether we think there may be matching - * index entries to the left and right of the current page, respectively. 
- */ - bool moreLeft; - bool moreRight; - - /* - * The items array is always ordered in index order (ie, increasing - * indexoffset). When scanning backwards it is convenient to fill the - * array back-to-front, so we start at the last slot and fill downwards. - * Hence we need both a first-valid-entry and a last-valid-entry counter. - * itemIndex is a cursor showing which entry was last returned to caller. - */ - int firstItem; /* first valid index in items[] */ - int lastItem; /* last valid index in items[] */ - int itemIndex; /* current index in items[] */ - - BTScanPosItem items[MaxTIDsPerBTreePage]; /* MUST BE LAST */ -} BTScanPosData; - -typedef BTScanPosData *BTScanPos; - -#define BTScanPosIsPinned(scanpos) \ -( \ - AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ - !BufferIsValid((scanpos).buf)), \ - BufferIsValid((scanpos).buf) \ -) -#define BTScanPosUnpin(scanpos) \ - do { \ - ReleaseBuffer((scanpos).buf); \ - (scanpos).buf = InvalidBuffer; \ - } while (0) -#define BTScanPosUnpinIfPinned(scanpos) \ - do { \ - if (BTScanPosIsPinned(scanpos)) \ - BTScanPosUnpin(scanpos); \ - } while (0) - -#define BTScanPosIsValid(scanpos) \ -( \ - AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ - !BufferIsValid((scanpos).buf)), \ - BlockNumberIsValid((scanpos).currPage) \ -) -#define BTScanPosInvalidate(scanpos) \ - do { \ - (scanpos).buf = InvalidBuffer; \ - (scanpos).currPage = InvalidBlockNumber; \ - } while (0) - /* We need one of these for each equality-type SK_SEARCHARRAY scan key */ typedef struct BTArrayKeyInfo { @@ -1050,6 +944,30 @@ typedef struct BTArrayKeyInfo ScanKey high_compare; /* array's < or <= upper bound */ } BTArrayKeyInfo; +/* + * BTScanOpaqueData is the btree-private state needed for an indexscan. + * This consists of preprocessed scan keys (see _bt_preprocess_keys() for + * details of the preprocessing), and information about the current array + * keys. There are assumptions about how the current array keys track the + * progress of the index scan through the index's key space (see _bt_readpage + * and _bt_advance_array_keys), but we don't actually track anything about the + * current scan position in this opaque struct. That is tracked externally, + * by implementing a queue of "batches", where each batch represents the items + * returned by btgetbatch within a single leaf page. + * + * Index scans work a page at a time, as required by the amgetbatch contract: + * we pin and read-lock the page, identify all the matching items on the page + * and return them in a newly allocated batch. We then release the read-lock + * using amgetbatch utility routines. This approach minimizes lock/unlock + * traffic. _bt_next is passed priorbatch, which contains details of which + * page is next in line to be read (priorbatch is provided as an argument to + * btgetbatch by core code). + * + * If we are doing an index-only scan, we save the entire IndexTuple for each + * matched item, otherwise only its heap TID and offset. This is also per the + * amgetbatch contract. Posting list tuples store a "base" tuple once, + * allowing the same key to be returned for each TID in the posting list. 
+ */ typedef struct BTScanOpaqueData { /* these fields are set by _bt_preprocess_keys(): */ @@ -1066,32 +984,6 @@ typedef struct BTScanOpaqueData BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */ FmgrInfo *orderProcs; /* ORDER procs for required equality keys */ MemoryContext arrayContext; /* scan-lifespan context for array data */ - - /* info about killed items if any (killedItems is NULL if never used) */ - int *killedItems; /* currPos.items indexes of killed items */ - int numKilled; /* number of currently stored items */ - bool dropPin; /* drop leaf pin before btgettuple returns? */ - - /* - * If we are doing an index-only scan, these are the tuple storage - * workspaces for the currPos and markPos respectively. Each is of size - * BLCKSZ, so it can hold as much as a full page's worth of tuples. - */ - char *currTuples; /* tuple storage for currPos */ - char *markTuples; /* tuple storage for markPos */ - - /* - * If the marked position is on the same page as current position, we - * don't use markPos, but just keep the marked itemIndex in markItemIndex - * (all the rest of currPos is valid for the mark position). Hence, to - * determine if there is a mark, first look at markItemIndex, then at - * markPos. - */ - int markItemIndex; /* itemIndex, or -1 if not valid */ - - /* keep these last in struct for efficiency */ - BTScanPosData currPos; /* current position data */ - BTScanPosData markPos; /* marked position, if any */ } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque; @@ -1160,14 +1052,16 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull, extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys); extern Size btestimateparallelscan(Relation rel, int nkeys, int norderbys); extern void btinitparallelscan(void *target); -extern bool btgettuple(IndexScanDesc scan, ScanDirection dir); +extern BatchIndexScan btgetbatch(IndexScanDesc scan, + BatchIndexScan priorbatch, + ScanDirection dir); extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys); +extern void btfreebatch(IndexScanDesc scan, BatchIndexScan batch); extern void btparallelrescan(IndexScanDesc scan); extern void btendscan(IndexScanDesc scan); -extern void btmarkpos(IndexScanDesc scan); -extern void btrestrpos(IndexScanDesc scan); +extern void btposreset(IndexScanDesc scan, BatchIndexScan markbatch); extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, @@ -1271,8 +1165,9 @@ extern void _bt_preprocess_keys(IndexScanDesc scan); /* * prototypes for functions in nbtreadpage.c */ -extern bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, - OffsetNumber offnum, bool firstpage); +extern bool _bt_readpage(IndexScanDesc scan, BatchIndexScan newbatch, + ScanDirection dir, OffsetNumber offnum, + bool firstpage); extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir); extern int _bt_binsrch_array_skey(FmgrInfo *orderproc, bool cur_elem_trig, ScanDirection dir, @@ -1287,8 +1182,9 @@ extern BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access); extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate); extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum); -extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); -extern bool _bt_next(IndexScanDesc scan, ScanDirection 
dir); +extern BatchIndexScan _bt_first(IndexScanDesc scan, ScanDirection dir); +extern BatchIndexScan _bt_next(IndexScanDesc scan, ScanDirection dir, + BatchIndexScan priorbatch); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost); /* @@ -1296,7 +1192,7 @@ extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost); */ extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); extern void _bt_freestack(BTStack stack); -extern void _bt_killitems(IndexScanDesc scan); +extern void _bt_killitems(IndexScanDesc scan, BatchIndexScan batch); extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); extern void _bt_end_vacuum(Relation rel); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 87a8be104..3a416e76b 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -16,6 +16,7 @@ #include "access/htup_details.h" #include "access/itup.h" +#include "access/sdir.h" #include "nodes/tidbitmap.h" #include "port/atomics.h" #include "storage/buf.h" @@ -123,8 +124,192 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker; typedef struct IndexFetchTableData { Relation rel; + + int nheapaccesses; /* number of heap accesses, for + * instrumentation/metrics */ } IndexFetchTableData; +/* + * Queue-wise location of a BatchMatchingItem that appears in a BatchIndexScan + * returned by (and subsequently passed to) an amgetbatch routine + */ +typedef struct BatchQueueItemPos +{ + /* BatchQueue.batches[]-wise index to relevant BatchIndexScan */ + int batch; + + /* BatchIndexScan.items[]-wise index to relevant BatchMatchingItem */ + int item; +} BatchQueueItemPos; + +static inline void +batch_reset_pos(BatchQueueItemPos *pos) +{ + pos->batch = -1; + pos->item = -1; +} + +/* + * Matching item returned by amgetbatch (in returned BatchIndexScan) during an + * index scan. Used by table AM to locate relevant matching table tuple. + */ +typedef struct BatchMatchingItem +{ + ItemPointerData heapTid; /* TID of referenced heap item */ + OffsetNumber indexOffset; /* index item's location within page */ + LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */ +} BatchMatchingItem; + +/* + * Data about one batch of items returned by (and passed to) amgetbatch during + * index scans + */ +typedef struct BatchIndexScanData +{ + /* + * Information output by amgetbatch index AMs upon returning a batch with + * one or more matching items, describing details of the index page where + * matches were located. + * + * Used in the next amgetbatch call to determine which index page to read + * next (or to determine if there's no further matches in current scan + * direction). + */ + BlockNumber currPage; /* Index page with matching items */ + BlockNumber prevPage; /* currPage's left link */ + BlockNumber nextPage; /* currPage's right link */ + + Buffer buf; /* currPage buf (invalid means unpinned) */ + XLogRecPtr lsn; /* currPage's LSN (when dropPin) */ + + /* scan direction when the index page was read */ + ScanDirection dir; + + /* + * moreLeft and moreRight track whether we think there may be matching + * index entries to the left and right of the current page, respectively + */ + bool moreLeft; + bool moreRight; + + /* + * The items array is always ordered in index order (ie, increasing + * indexoffset). When scanning backwards it is convenient to fill the + * array back-to-front, so we start at the last slot and fill downwards. 
+ * Hence we need both a first-valid-entry and a last-valid-entry counter. + */ + int firstItem; /* first valid index in items[] */ + int lastItem; /* last valid index in items[] */ + + /* info about killed items if any (killedItems is NULL if never used) */ + int *killedItems; /* indexes of killed items */ + int numKilled; /* number of currently stored items */ + + /* + * Matching items state for this batch. + * + * If we are doing an index-only scan, these are the tuple storage + * workspaces for the matching tuples (tuples referenced by items[]). Each + * is of size BLCKSZ, so it can hold as much as a full page's worth of + * tuples. + */ + char *currTuples; /* tuple storage for items[] */ + int maxitems; /* allocated size of items[] */ + BatchMatchingItem items[FLEXIBLE_ARRAY_MEMBER]; +} BatchIndexScanData; + +typedef struct BatchIndexScanData *BatchIndexScan; + +/* + * Maximum number of batches (leaf pages) we can keep in memory. We need a + * minimum of two, since we'll only consider releasing one batch when another + * is read. + */ +#define INDEX_SCAN_MAX_BATCHES 2 +#define INDEX_SCAN_CACHE_BATCHES 2 +#define INDEX_SCAN_BATCH_COUNT(scan) \ + ((scan)->batchqueue->nextBatch - (scan)->batchqueue->headBatch) + +/* Did we already load batch with the requested index? */ +#define INDEX_SCAN_BATCH_LOADED(scan, idx) \ + ((idx) < (scan)->batchqueue->nextBatch) + +/* Have we loaded the maximum number of batches? */ +#define INDEX_SCAN_BATCH_FULL(scan) \ + (INDEX_SCAN_BATCH_COUNT(scan) == INDEX_SCAN_MAX_BATCHES) + +/* Return batch for the provided index. */ +#define INDEX_SCAN_BATCH(scan, idx) \ + ((scan)->batchqueue->batches[(idx) % INDEX_SCAN_MAX_BATCHES]) + +/* Is the position invalid/undefined? */ +#define INDEX_SCAN_POS_INVALID(pos) \ + (((pos)->batch == -1) && ((pos)->item == -1)) + +#ifdef INDEXAM_DEBUG +#define DEBUG_LOG(...) elog(AmRegularBackendProcess() ? NOTICE : DEBUG2, __VA_ARGS__) +#else +#define DEBUG_LOG(...) +#endif + +/* + * State used by table AMs to manage an index scan that uses the amgetbatch + * interface. Scans work with a queue of batches returned by amgetbatch. + * + * Batches are kept in the order that they were returned in by amgetbatch, + * since that is the same order that table_index_getnext_slot will return + * matches in. However, table AMs are free to fetch table tuples in whatever + * order is most convenient/efficient -- provided that such reordering cannot + * affect the order that table_index_getnext_slot later returns tuples in. + * + * This data structure also provides table AMs with a way to read ahead of the + * current read position by _multiple_ batches/index pages. The further out + * the table AM reads ahead like this, the further it can see into the future. + * That way the table AM is able to reorder work as aggressively as desired. + * For example, index scans sometimes need to readahead by as many as a few + * dozen amgetbatch batches in order to maintain an optimal I/O prefetch + * distance (distance for reading table blocks/fetching table tuples). + */ +typedef struct BatchQueue +{ + /* amgetbatch can safely drop pins on returned batch's index page? */ + bool dropPin; + + /* + * Did we read the final batch in this scan direction? The batches may be + * loaded from multiple places, and we need to remember when we fail to + * load the next batch in a given scan (which means "no more batches"). + * amgetbatch may restart the scan on the get call, so we need to remember + * it's over. 
+ */ + bool finished; + + /* Current scan direction, for the currently loaded batches */ + ScanDirection direction; + + /* current positions in batches[] for scan */ + BatchQueueItemPos readPos; /* read position */ + BatchQueueItemPos markPos; /* mark/restore position */ + + BatchIndexScan markBatch; + + /* + * Array of batches returned by the AM. The array has a capacity (but can + * be resized if needed). The headBatch is an index of the batch we're + * currently reading from (this needs to be translated by modulo + * INDEX_SCAN_MAX_BATCHES into index in the batches array). + */ + int headBatch; /* head batch slot */ + int nextBatch; /* next empty batch slot */ + + /* Array of pointers to cached recyclable batches */ + BatchIndexScan cache[INDEX_SCAN_CACHE_BATCHES]; + + /* Array of pointers to queued batches */ + BatchIndexScan batches[INDEX_SCAN_MAX_BATCHES]; + +} BatchQueue; + struct IndexScanInstrumentation; /* @@ -140,6 +325,8 @@ typedef struct IndexScanDescData struct SnapshotData *xs_snapshot; /* snapshot to see */ int numberOfKeys; /* number of index qualifier conditions */ int numberOfOrderBys; /* number of ordering operators */ + BatchQueue *batchqueue; /* amgetbatch related state */ + struct ScanKeyData *keyData; /* array of index qualifier descriptors */ struct ScanKeyData *orderByData; /* array of ordering op descriptors */ bool xs_want_itup; /* caller requests index tuples */ @@ -216,4 +403,61 @@ typedef struct SysScanDescData struct TupleTableSlot *slot; } SysScanDescData; +/* + * Check that a position (batch,item) is valid with respect to the batches we + * have currently loaded. + */ +static inline void +batch_assert_pos_valid(IndexScanDescData *scan, BatchQueueItemPos *pos) +{ +#ifdef USE_ASSERT_CHECKING + BatchQueue *batchqueue = scan->batchqueue; + + /* make sure the position is valid for currently loaded batches */ + Assert(pos->batch >= batchqueue->headBatch); + Assert(pos->batch < batchqueue->nextBatch); +#endif +} + +/* + * Check a single batch is valid. + */ +static inline void +batch_assert_batch_valid(IndexScanDescData *scan, BatchIndexScan batch) +{ + /* batch must have one or more matching items returned by index AM */ + Assert(batch->firstItem >= 0 && batch->firstItem <= batch->lastItem); + Assert(batch->items != NULL); + + /* + * The number of killed items must be valid, and there must be an array of + * indexes if there are items. + */ + Assert(batch->numKilled >= 0); + Assert(!(batch->numKilled > 0 && batch->killedItems == NULL)); +} + +static inline void +batch_assert_batches_valid(IndexScanDescData *scan) +{ +#ifdef USE_ASSERT_CHECKING + BatchQueue *batchqueue = scan->batchqueue; + + /* we should have batches initialized */ + Assert(batchqueue != NULL); + + /* The head/next indexes should define a valid range */ + Assert(batchqueue->headBatch >= 0 && + batchqueue->headBatch <= batchqueue->nextBatch); + + /* Check all current batches */ + for (int i = batchqueue->headBatch; i < batchqueue->nextBatch; i++) + { + BatchIndexScan batch = INDEX_SCAN_BATCH(scan, i); + + batch_assert_batch_valid(scan, batch); + } +#endif +} + #endif /* RELSCAN_H */ diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 2fa790b6b..3e98d8537 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -420,7 +420,8 @@ typedef struct TableAmRoutine * * Tuples for an index scan can then be fetched via index_fetch_tuple. 
*/ - struct IndexFetchTableData *(*index_fetch_begin) (Relation rel); + struct IndexFetchTableData *(*index_fetch_begin) (Relation rel, + TupleTableSlot *ios_tableslot); /* * Reset index fetch. Typically this will release cross index fetch @@ -433,11 +434,34 @@ typedef struct TableAmRoutine */ void (*index_fetch_end) (struct IndexFetchTableData *data); + /* + * Fetch the next tuple from an index scan into slot, scanning in the + * specified direction, and return true if a tuple was found, false + * otherwise. + * + * This callback allows the table AM to directly manage the scan process, + * including interfacing with the index AM. The caller simply specifies + * the direction of the scan; the table AM takes care of retrieving TIDs + * from the index, performing visibility checks, and returning tuples in + * the slot. This enables important optimizations (such as table block I/O + * prefetching) that require that the table AM directly manages the + * progress of the index scan. + * + * Table AMs that implement this are expected to use batch_getnext (and + * other batch utility routines) to perform amgetbatch index scans. + */ + bool (*index_getnext_slot) (IndexScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot); + /* * Fetch tuple at `tid` into `slot`, after doing a visibility test * according to `snapshot`. If a tuple was found and passed the visibility * test, return true, false otherwise. * + * This is a lower-level callback that takes a TID from the caller. + * Callers should favor the index_getnext_slot callback whenever possible. + * * Note that AMs that do not necessarily update indexes when indexed * columns do not change, need to return the current/correct version of * the tuple that is visible to the snapshot, even if the tid points to an @@ -459,7 +483,6 @@ typedef struct TableAmRoutine TupleTableSlot *slot, bool *call_again, bool *all_dead); - /* ------------------------------------------------------------------------ * Callbacks for non-modifying operations on individual tuples * ------------------------------------------------------------------------ @@ -1159,14 +1182,15 @@ table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) /* * Prepare to fetch tuples from the relation, as needed when fetching tuples - * for an index scan. + * for an index scan. Index-only scan callers must provide ios_tableslot, + * which is a slot for holding tuples fetched from the table. * * Tuples for an index scan can then be fetched via table_index_fetch_tuple(). */ static inline IndexFetchTableData * -table_index_fetch_begin(Relation rel) +table_index_fetch_begin(Relation rel, TupleTableSlot *ios_tableslot) { - return rel->rd_tableam->index_fetch_begin(rel); + return rel->rd_tableam->index_fetch_begin(rel, ios_tableslot); } /* @@ -1188,6 +1212,26 @@ table_index_fetch_end(struct IndexFetchTableData *scan) scan->rel->rd_tableam->index_fetch_end(scan); } +/* + * Fetch the next tuple from an index scan into `slot`, scanning in the + * specified direction. Returns true if a tuple was found, false otherwise. + * + * The index scan should have been started via table_index_fetch_begin(). + * Callers must check scan->xs_recheck and recheck scan keys if required. + * + * Index-only scan callers must pass an index scan descriptor that was created + * by passing a valid ios_tableslot to index_beginscan. This ios_tableslot + * will be passed down to table_index_fetch_begin by index_beginscan. 
+ */ +static inline bool +table_index_getnext_slot(IndexScanDesc idxscan, ScanDirection direction, + TupleTableSlot *slot) +{ + struct IndexFetchTableData *scan = idxscan->xs_heapfetch; + + return scan->rel->rd_tableam->index_getnext_slot(idxscan, direction, slot); +} + /* * Fetches, as part of an index scan, tuple at `tid` into `slot`, after doing * a visibility test according to `snapshot`. If a tuple was found and passed @@ -1211,6 +1255,9 @@ table_index_fetch_end(struct IndexFetchTableData *scan) * entry (like heap's HOT). Whereas table_tuple_fetch_row_version() only * evaluates the tuple exactly at `tid`. Outside of index entry ->table tuple * lookups, table_tuple_fetch_row_version() is what's usually needed. + * + * This is a lower-level interface that takes a TID from the caller. Callers + * should favor the table_index_getnext_slot interface whenever possible. */ static inline bool table_index_fetch_tuple(struct IndexFetchTableData *scan, diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 64ff69964..b6064bb3d 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1751,7 +1751,6 @@ typedef struct IndexScanState * Instrument local index scan instrumentation * SharedInfo parallel worker instrumentation (no leader entry) * TableSlot slot for holding tuples fetched from the table - * VMBuffer buffer in use for visibility map testing, if any * PscanLen size of parallel index-only scan descriptor * NameCStringAttNums attnums of name typed columns to pad to NAMEDATALEN * NameCStringCount number of elements in the NameCStringAttNums array @@ -1774,7 +1773,6 @@ typedef struct IndexOnlyScanState IndexScanInstrumentation ioss_Instrument; SharedIndexScanInstrumentation *ioss_SharedInfo; TupleTableSlot *ioss_TableSlot; - Buffer ioss_VMBuffer; Size ioss_PscanLen; AttrNumber *ioss_NameCStringAttNums; int ioss_NameCStringCount; diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 46a865562..11805bff9 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1344,7 +1344,7 @@ typedef struct IndexOptInfo /* does AM have amgetbitmap interface? */ bool amhasgetbitmap; bool amcanparallel; - /* does AM have ammarkpos interface? */ + /* is AM prepared for us to restore a mark? */ bool amcanmarkpos; /* AM's cost estimator */ /* Rather than include amapi.h here, we declare amcostestimate like this */ diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 26cb75058..880921961 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -294,10 +294,11 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->ambeginscan = brinbeginscan; amroutine->amrescan = brinrescan; amroutine->amgettuple = NULL; + amroutine->amgetbatch = NULL; + amroutine->amfreebatch = NULL; amroutine->amgetbitmap = bringetbitmap; amroutine->amendscan = brinendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index b3e2e9d5f..b8f831a31 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -1953,9 +1953,9 @@ gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm) * into the main index, and so we might visit it a second time during the * main scan. This is okay because we'll just re-set the same bit in the * bitmap. 
(The possibility of duplicate visits is a major reason why GIN - * can't support the amgettuple API, however.) Note that it would not do - * to scan the main index before the pending list, since concurrent - * cleanup could then make us miss entries entirely. + * can't support either the amgettuple or amgetbatch API.) Note that it + * would not do to scan the main index before the pending list, since + * concurrent cleanup could then make us miss entries entirely. */ scanPendingInsert(scan, tbm, &ntids); diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 605f80aad..1d233087e 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -82,10 +82,11 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->ambeginscan = ginbeginscan; amroutine->amrescan = ginrescan; amroutine->amgettuple = NULL; + amroutine->amgetbatch = NULL; + amroutine->amfreebatch = NULL; amroutine->amgetbitmap = gingetbitmap; amroutine->amendscan = ginendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index c26d8538f..0d282c69a 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -103,10 +103,11 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->ambeginscan = gistbeginscan; amroutine->amrescan = gistrescan; amroutine->amgettuple = gistgettuple; + amroutine->amgetbatch = NULL; + amroutine->amfreebatch = NULL; amroutine->amgetbitmap = gistgetbitmap; amroutine->amendscan = gistendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index e388252af..7289df574 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -102,10 +102,11 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->ambeginscan = hashbeginscan; amroutine->amrescan = hashrescan; amroutine->amgettuple = hashgettuple; + amroutine->amgetbatch = NULL; + amroutine->amfreebatch = NULL; amroutine->amgetbitmap = hashgetbitmap; amroutine->amendscan = hashendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index dd4fe6bf6..c12db48a6 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -72,6 +72,33 @@ heapam_slot_callbacks(Relation relation) return &TTSOpsBufferHeapTuple; } +static void +StoreIndexTuple(TupleTableSlot *slot, + IndexTuple itup, TupleDesc itupdesc) +{ + /* + * Note: we must use the tupdesc supplied by the AM in index_deform_tuple, + * not the slot's tupdesc, in case the latter has different datatypes + * (this happens for btree name_ops in particular). They'd better have + * the same number of columns though, as well as being datatype-compatible + * which is something we can't so easily check. 
+ */ + Assert(slot->tts_tupleDescriptor->natts == itupdesc->natts); + + ExecClearTuple(slot); + index_deform_tuple(itup, itupdesc, slot->tts_values, slot->tts_isnull); + + /* + * Copy all name columns stored as cstrings back into a NAMEDATALEN byte + * sized allocation. We mark this branch as unlikely as generally "name" + * is used only for the system catalogs and this would have to be a user + * query running on those or some other user table with an index on a name + * column. + */ + + ExecStoreVirtualTuple(slot); +} + /* ------------------------------------------------------------------------ * Index Scan Callbacks for heap AM @@ -79,12 +106,17 @@ heapam_slot_callbacks(Relation relation) */ static IndexFetchTableData * -heapam_index_fetch_begin(Relation rel) +heapam_index_fetch_begin(Relation rel, TupleTableSlot *ios_tableslot) { - IndexFetchHeapData *hscan = palloc0_object(IndexFetchHeapData); + IndexFetchHeapData *hscan = palloc_object(IndexFetchHeapData); hscan->xs_base.rel = rel; + hscan->xs_base.nheapaccesses = 0; + + /* heapam specific fields */ hscan->xs_cbuf = InvalidBuffer; + hscan->vmbuf = InvalidBuffer; + hscan->ios_tableslot = ios_tableslot; return &hscan->xs_base; } @@ -94,6 +126,7 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) { IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; + /* deliberately don't drop VM buffer pin here */ if (BufferIsValid(hscan->xs_cbuf)) { ReleaseBuffer(hscan->xs_cbuf); @@ -108,6 +141,12 @@ heapam_index_fetch_end(IndexFetchTableData *scan) heapam_index_fetch_reset(scan); + if (hscan->vmbuf != InvalidBuffer) + { + ReleaseBuffer(hscan->vmbuf); + hscan->vmbuf = InvalidBuffer; + } + pfree(hscan); } @@ -173,6 +212,468 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, return got_heap_tuple; } +/* + * heap_batch_advance_pos + * Advance the position to the next item, depending on scan direction. + * + * Move to the next item within the batch pointed to by caller's pos. This is + * usually readPos. Advances the position to the next item, either in the + * same batch or the following one (if already available). + * + * We can advance only if we already have some batches loaded, and there's + * either enough items in the current batch, or some more items in the + * subsequent batches. + * + * If this is the first advance (right after loading the initial/head batch), + * position is still undefined. Otherwise we expect the position to be valid. + * + * Returns true if the position was advanced, false otherwise. The position + * is guaranteed to be valid only after a successful advance. + */ +static bool +heap_batch_advance_pos(IndexScanDesc scan, struct BatchQueueItemPos *pos, + ScanDirection direction) +{ + BatchIndexScan batch; + + /* make sure we have batching initialized and consistent */ + batch_assert_batches_valid(scan); + + /* should know direction by now */ + Assert(direction == scan->batchqueue->direction); + Assert(direction != NoMovementScanDirection); + + /* We can't advance if there are no batches available. */ + if (INDEX_SCAN_BATCH_COUNT(scan) == 0) + return false; + + /* + * If the position has not been advanced yet, it has to be right after we + * loaded the initial batch (must be the head batch). In that case just + * initialize it to the batch's first item (or its last item, when + * scanning backwards). 
+ */ + if (INDEX_SCAN_POS_INVALID(pos)) + { + /* + * We should have loaded the scan's initial batch, or maybe we have + * changed the direction of the scan after scanning all the way to the + * end (in which case the position is invalid, and we make it look + * like there is just one batch). We should have just one batch, + * though. + */ + Assert(INDEX_SCAN_BATCH_COUNT(scan) == 1); + + /* + * Get the initial batch (which must be the head), and initialize the + * position to the appropriate item for the current scan direction + */ + batch = INDEX_SCAN_BATCH(scan, scan->batchqueue->headBatch); + + pos->batch = scan->batchqueue->headBatch; + + if (ScanDirectionIsForward(direction)) + pos->item = batch->firstItem; + else + pos->item = batch->lastItem; + + batch_assert_pos_valid(scan, pos); + + return true; + } + + /* + * The position is already defined, so we should have some batches loaded + * and the position has to be valid with respect to those. + */ + batch_assert_pos_valid(scan, pos); + + /* + * Advance to the next item in the same batch, if there are more items. If + * we're at the last item, we'll try advancing to the next batch later. + */ + batch = INDEX_SCAN_BATCH(scan, pos->batch); + + if (ScanDirectionIsForward(direction)) + { + if (++pos->item <= batch->lastItem) + { + batch_assert_pos_valid(scan, pos); + + return true; + } + } + else /* ScanDirectionIsBackward */ + { + if (--pos->item >= batch->firstItem) + { + batch_assert_pos_valid(scan, pos); + + return true; + } + } + + /* + * We couldn't advance within the same batch, try advancing to the next + * batch, if it's already loaded. + */ + if (INDEX_SCAN_BATCH_LOADED(scan, pos->batch + 1)) + { + /* advance to the next batch */ + pos->batch++; + + batch = INDEX_SCAN_BATCH(scan, pos->batch); + Assert(batch != NULL); + + if (ScanDirectionIsForward(direction)) + pos->item = batch->firstItem; + else + pos->item = batch->lastItem; + + batch_assert_pos_valid(scan, pos); + + return true; + } + + /* can't advance */ + return false; +} + +/* ---------------- + * heapam_batch_getnext_tid - get next TID from index scan batch queue + * + * This function implements heapam's version of getting the next TID from an + * index scan that uses the amgetbatch interface. It is implemented using + * various indexbatch.c utility routines. + * + * The routines from indexbatch.c are stateless -- they just implement batch + * queue mechanics. heapam_batch_getnext_tid implements the heapam policy; it + * decides when to load/free batches, and controls scan direction changes. + * ---------------- + */ +static ItemPointer +heapam_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction) +{ + BatchQueue *batchqueue = scan->batchqueue; + BatchQueueItemPos *readPos; + + /* shouldn't get here without batching */ + batch_assert_batches_valid(scan); + + /* Initialize direction on first call */ + if (batchqueue->direction == NoMovementScanDirection) + batchqueue->direction = direction; + else if (unlikely(batchqueue->direction != direction)) + { + /* + * Handle a change in the scan's direction. + * + * Release future batches properly, to make it look like the current + * batch is the only one we loaded. 
+		 */
+		while (batchqueue->nextBatch > batchqueue->headBatch + 1)
+		{
+			/* release "later" batches in reverse order */
+			BatchIndexScan fbatch;
+
+			batchqueue->nextBatch--;
+			fbatch = INDEX_SCAN_BATCH(scan, batchqueue->nextBatch);
+			batch_free(scan, fbatch);
+		}
+
+		/*
+		 * Remember the new direction, and make sure the scan is not marked as
+		 * "finished" (we might have already read the last batch, but now we
+		 * need to start over).
+		 */
+		batchqueue->direction = direction;
+		batchqueue->finished = false;
+	}
+
+	/* shortcut for the read position, for convenience */
+	readPos = &batchqueue->readPos;
+
+	/*
+	 * Try advancing the batch position. If that doesn't succeed, it means we
+	 * don't have more items in the current batch, and there's no future batch
+	 * loaded. So try loading another batch, and retry if needed.
+	 */
+	while (true)
+	{
+		/*
+		 * If we manage to advance to the next item, return it and we're
+		 * done. Otherwise try loading another batch.
+		 */
+		if (heap_batch_advance_pos(scan, readPos, direction))
+		{
+			BatchIndexScan readBatch = INDEX_SCAN_BATCH(scan, readPos->batch);
+
+			/* set the TID / itup for the scan */
+			scan->xs_heaptid = readBatch->items[readPos->item].heapTid;
+
+			/* xs_hitup is not supported by amgetbatch scans */
+			Assert(!scan->xs_hitup);
+
+			if (scan->xs_want_itup)
+				scan->xs_itup =
+					(IndexTuple) (readBatch->currTuples +
+								  readBatch->items[readPos->item].tupleOffset);
+
+			/*
+			 * If we advanced to the next batch, release the batch we no
+			 * longer need. The position is the "read" position, and we can
+			 * compare it to headBatch.
+			 */
+			if (unlikely(readPos->batch != batchqueue->headBatch))
+			{
+				BatchIndexScan headBatch = INDEX_SCAN_BATCH(scan,
+															batchqueue->headBatch);
+
+				/* Free the head batch (except when it's markBatch) */
+				batch_free(scan, headBatch);
+
+				/*
+				 * In any case, remove the batch from the regular queue, even
+				 * if we kept it for mark/restore.
+				 */
+				batchqueue->headBatch++;
+
+				/* we can't skip any batches */
+				Assert(batchqueue->headBatch == readPos->batch);
+			}
+
+			pgstat_count_index_tuples(scan->indexRelation, 1);
+			return &scan->xs_heaptid;
+		}
+
+		/*
+		 * Failed to advance the read position. Have indexbatch.c utility
+		 * routine load another batch into our queue (next in this direction).
+		 */
+		if (!batch_getnext(scan, direction))
+		{
+			/* we're done -- there are no more batches in this scan direction */
+			break;
+		}
+	}
+
+	/*
+	 * If we get here, we failed to advance the position and there are no more
+	 * batches to be loaded in the current scan direction. Defensively reset
+	 * the read position.
+	 */
+	batch_reset_pos(readPos);
+	Assert(scan->batchqueue->finished);
+
+	return NULL;
+}
+
+/* ----------------
+ * index_fetch_heap - get the scan's next heap tuple
+ *
+ * The result is a visible heap tuple associated with the index TID most
+ * recently fetched by our caller in scan->xs_heaptid, or NULL if no more
+ * matching tuples exist. (There can be more than one matching tuple because
+ * of HOT chains, although when using an MVCC snapshot it should be impossible
+ * for more than one such tuple to exist.)
+ *
+ * On success, the buffer containing the heap tup is pinned. The pin must be
+ * dropped elsewhere.
+ * ---------------- + */ +static bool +index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot) +{ + bool all_dead = false; + bool found; + + found = heapam_index_fetch_tuple(scan->xs_heapfetch, &scan->xs_heaptid, + scan->xs_snapshot, slot, + &scan->xs_heap_continue, &all_dead); + + if (found) + pgstat_count_heap_fetch(scan->indexRelation); + + /* + * If we scanned a whole HOT chain and found only dead tuples, tell index + * AM to kill its entry for that TID (this will take effect in the next + * amgettuple call, in index_getnext_tid). We do not do this when in + * recovery because it may violate MVCC to do so. See comments in + * RelationGetIndexScan(). + * + * XXX For scans using batching, record the flag in the batch (we will + * pass it to the AM later, when freeing it). Otherwise just pass it to + * the AM using the kill_prior_tuple field. + */ + if (!scan->xactStartedInRecovery) + { + if (scan->batchqueue == NULL) + scan->kill_prior_tuple = all_dead; + else if (all_dead) + index_batch_kill_item(scan); + } + + return found; +} + +/* ---------------- + * heapam_index_getnext_slot - get the next tuple from a scan + * + * The result is true if a tuple satisfying the scan keys and the snapshot was + * found, false otherwise. The tuple is stored in the specified slot. + * + * On success, resources (like buffer pins) are likely to be held, and will be + * dropped by a future call here (or by a later call to index_endscan). + * + * Note: caller must check scan->xs_recheck, and perform rechecking of the + * scan keys if required. We do not do that here because we don't have + * enough information to do it efficiently in the general case. + * ---------------- + */ +static bool +heapam_index_getnext_slot(IndexScanDesc scan, ScanDirection direction, + TupleTableSlot *slot) +{ + IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan->xs_heapfetch; + ItemPointer tid = NULL; + + for (;;) + { + if (!scan->xs_heap_continue) + { + /* + * Scans that use an amgetbatch index AM are managed by heapam's + * index scan manager. This gives heapam the ability to read heap + * tuples in a flexible order that is attuned to both costs and + * benefits on the heapam and table AM side. + * + * Scans that use an amgettuple index AM simply call through to + * index_getnext_tid to get the next TID returned by index AM. The + * progress of the scan will be under the control of index AM (we + * just pass it through a direction to get the next tuple in), so + * we cannot reorder any work. + */ + if (scan->batchqueue != NULL) + tid = heapam_batch_getnext_tid(scan, direction); + else + tid = index_getnext_tid(scan, direction); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + break; + } + + /* + * Fetch the next (or only) visible heap tuple for this index entry. + * If we don't find anything, loop around and grab the next TID from + * the index. + */ + Assert(ItemPointerIsValid(&scan->xs_heaptid)); + if (!scan->xs_want_itup) + { + /* Plain index scan */ + if (index_fetch_heap(scan, slot)) + return true; + } + else + { + /* + * Index-only scan. + * + * We can skip the heap fetch if the TID references a heap page on + * which all tuples are known visible to everybody. In any case, + * we'll use the index tuple not the heap tuple as the data + * source. + * + * Note on Memory Ordering Effects: visibilitymap_get_status does + * not lock the visibility map buffer, and therefore the result we + * read here could be slightly stale. However, it can't be stale + * enough to matter. 
+ * + * We need to detect clearing a VM bit due to an insert right + * away, because the tuple is present in the index page but not + * visible. The reading of the TID by this scan (using a shared + * lock on the index buffer) is serialized with the insert of the + * TID into the index (using an exclusive lock on the index + * buffer). Because the VM bit is cleared before updating the + * index, and locking/unlocking of the index page acts as a full + * memory barrier, we are sure to see the cleared bit if we see a + * recently-inserted TID. + * + * Deletes do not update the index page (only VACUUM will clear + * out the TID), so the clearing of the VM bit by a delete is not + * serialized with this test below, and we may see a value that is + * significantly stale. However, we don't care about the delete + * right away, because the tuple is still visible until the + * deleting transaction commits or the statement ends (if it's our + * transaction). In either case, the lock on the VM buffer will + * have been released (acting as a write barrier) after clearing + * the bit. And for us to have a snapshot that includes the + * deleting transaction (making the tuple invisible), we must have + * acquired ProcArrayLock after that time, acting as a read + * barrier. + * + * It's worth going through this complexity to avoid needing to + * lock the VM buffer, which could cause significant contention. + */ + if (!VM_ALL_VISIBLE(hscan->xs_base.rel, + ItemPointerGetBlockNumber(tid), + &hscan->vmbuf)) + { + /* + * Rats, we have to visit the heap to check visibility. + */ + hscan->xs_base.nheapaccesses++; + if (!index_fetch_heap(scan, hscan->ios_tableslot)) + continue; /* no visible tuple, try next index entry */ + + /* + * selfuncs.c caller uses SnapshotNonVacuumable. Just assume + * that it's good enough that any one tuple from HOT chain is + * visible for such a caller + */ + if (unlikely(!IsMVCCSnapshot(scan->xs_snapshot))) + return true; + + ExecClearTuple(hscan->ios_tableslot); + + /* + * Only MVCC snapshots are supported here, so there should be + * no need to keep following the HOT chain once a visible + * entry has been found. + */ + if (scan->xs_heap_continue) + elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); + + /* + * Note: at this point we are holding a pin on the heap page, + * as recorded in IndexFetchHeapData.xs_cbuf. We could + * release that pin now, but we prefer to hold on to VM pins. + * it's quite possible that the index entry will require a + * visit to the same heap page. It's even more likely that + * the index entry will force us to perform a lookup that uses + * the same already-pinned VM page. + */ + if (scan->xs_itup) + StoreIndexTuple(slot, scan->xs_itup, scan->xs_itupdesc); + } + else + { + /* + * We didn't access the heap, so we'll need to take a + * predicate lock explicitly, as if we had. For now we do + * that at page level. 
+ */ + PredicateLockPage(hscan->xs_base.rel, + ItemPointerGetBlockNumber(tid), + scan->xs_snapshot); + } + return true; + } + } + + return false; +} /* ------------------------------------------------------------------------ * Callbacks for non-modifying operations on individual tuples for heap AM @@ -753,7 +1254,8 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, tableScan = NULL; heapScan = NULL; - indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, NULL, 0, 0); + indexScan = index_beginscan(OldHeap, OldIndex, NULL, SnapshotAny, + NULL, 0, 0); index_rescan(indexScan, NULL, 0, NULL, 0); } else @@ -790,7 +1292,8 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, if (indexScan != NULL) { - if (!index_getnext_slot(indexScan, ForwardScanDirection, slot)) + if (!heapam_index_getnext_slot(indexScan, ForwardScanDirection, + slot)) break; /* Since we used no scan keys, should never need to recheck */ @@ -2633,6 +3136,7 @@ static const TableAmRoutine heapam_methods = { .index_fetch_begin = heapam_index_fetch_begin, .index_fetch_reset = heapam_index_fetch_reset, .index_fetch_end = heapam_index_fetch_end, + .index_getnext_slot = heapam_index_getnext_slot, .index_fetch_tuple = heapam_index_fetch_tuple, .tuple_insert = heapam_tuple_insert, diff --git a/src/backend/access/index/Makefile b/src/backend/access/index/Makefile index 6f2e3061a..e6d681b40 100644 --- a/src/backend/access/index/Makefile +++ b/src/backend/access/index/Makefile @@ -16,6 +16,7 @@ OBJS = \ amapi.o \ amvalidate.o \ genam.o \ - indexam.o + indexam.o \ + indexbatch.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 707c25289..9eadc3afc 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -89,6 +89,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_snapshot = InvalidSnapshot; /* caller must initialize this */ scan->numberOfKeys = nkeys; scan->numberOfOrderBys = norderbys; + scan->batchqueue = NULL; /* used by amgetbatch index AMs */ /* * We allocate key workspace here, but it won't get filled until amrescan. 
@@ -446,7 +447,7 @@ systable_beginscan(Relation heapRelation, elog(ERROR, "column is not in index"); } - sysscan->iscan = index_beginscan(heapRelation, irel, + sysscan->iscan = index_beginscan(heapRelation, irel, NULL, snapshot, NULL, nkeys, 0); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); sysscan->scan = NULL; @@ -517,7 +518,8 @@ systable_getnext(SysScanDesc sysscan) if (sysscan->irel) { - if (index_getnext_slot(sysscan->iscan, ForwardScanDirection, sysscan->slot)) + if (table_index_getnext_slot(sysscan->iscan, ForwardScanDirection, + sysscan->slot)) { bool shouldFree; @@ -707,7 +709,7 @@ systable_beginscan_ordered(Relation heapRelation, elog(ERROR, "column is not in index"); } - sysscan->iscan = index_beginscan(heapRelation, indexRelation, + sysscan->iscan = index_beginscan(heapRelation, indexRelation, NULL, snapshot, NULL, nkeys, 0); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); sysscan->scan = NULL; @@ -734,7 +736,7 @@ systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction) HeapTuple htup = NULL; Assert(sysscan->irel); - if (index_getnext_slot(sysscan->iscan, direction, sysscan->slot)) + if (table_index_getnext_slot(sysscan->iscan, direction, sysscan->slot)) htup = ExecFetchSlotHeapTuple(sysscan->slot, false, NULL); /* See notes in systable_getnext */ diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 0492d92d2..9e4ed5b55 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -24,9 +24,7 @@ * index_parallelscan_initialize - initialize parallel scan * index_parallelrescan - (re)start a parallel scan of an index * index_beginscan_parallel - join parallel index scan - * index_getnext_tid - get the next TID from a scan - * index_fetch_heap - get the scan's next heap tuple - * index_getnext_slot - get the next tuple from a scan + * index_getnext_tid - amgettuple table AM helper routine * index_getbitmap - get all tuples from a scan * index_bulk_delete - bulk deletion of index tuples * index_vacuum_cleanup - post-deletion cleanup of an index @@ -255,6 +253,7 @@ index_insert_cleanup(Relation indexRelation, IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, + TupleTableSlot *ios_tableslot, Snapshot snapshot, IndexScanInstrumentation *instrument, int nkeys, int norderbys) @@ -283,8 +282,11 @@ index_beginscan(Relation heapRelation, scan->xs_snapshot = snapshot; scan->instrument = instrument; + if (indexRelation->rd_indam->amgetbatch != NULL) + index_batch_init(scan); + /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heapRelation); + scan->xs_heapfetch = table_index_fetch_begin(heapRelation, ios_tableslot); return scan; } @@ -380,6 +382,8 @@ index_rescan(IndexScanDesc scan, scan->kill_prior_tuple = false; /* for safety */ scan->xs_heap_continue = false; + index_batch_reset(scan, true); + scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys, orderbys, norderbys); } @@ -394,6 +398,9 @@ index_endscan(IndexScanDesc scan) SCAN_CHECKS; CHECK_SCAN_PROCEDURE(amendscan); + /* Cleanup batching, so that the AM can release pins and so on. 
*/ + index_batch_end(scan); + /* Release resources (like buffer pins) from table accesses */ if (scan->xs_heapfetch) { @@ -422,9 +429,10 @@ void index_markpos(IndexScanDesc scan) { SCAN_CHECKS; - CHECK_SCAN_PROCEDURE(ammarkpos); + CHECK_SCAN_PROCEDURE(amposreset); - scan->indexRelation->rd_indam->ammarkpos(scan); + /* Only amgetbatch index AMs support mark and restore */ + return index_batch_mark_pos(scan); } /* ---------------- @@ -448,7 +456,8 @@ index_restrpos(IndexScanDesc scan) Assert(IsMVCCSnapshot(scan->xs_snapshot)); SCAN_CHECKS; - CHECK_SCAN_PROCEDURE(amrestrpos); + CHECK_SCAN_PROCEDURE(amgetbatch); + CHECK_SCAN_PROCEDURE(amposreset); /* release resources (like buffer pins) from table accesses */ if (scan->xs_heapfetch) @@ -457,7 +466,7 @@ index_restrpos(IndexScanDesc scan) scan->kill_prior_tuple = false; /* for safety */ scan->xs_heap_continue = false; - scan->indexRelation->rd_indam->amrestrpos(scan); + index_batch_restore_pos(scan); } /* @@ -579,6 +588,8 @@ index_parallelrescan(IndexScanDesc scan) if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); + index_batch_reset(scan, true); + /* amparallelrescan is optional; assume no-op if not provided by AM */ if (scan->indexRelation->rd_indam->amparallelrescan != NULL) scan->indexRelation->rd_indam->amparallelrescan(scan); @@ -591,6 +602,7 @@ index_parallelrescan(IndexScanDesc scan) */ IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, + TupleTableSlot *ios_tableslot, IndexScanInstrumentation *instrument, int nkeys, int norderbys, ParallelIndexScanDesc pscan) @@ -614,17 +626,24 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, scan->xs_snapshot = snapshot; scan->instrument = instrument; + if (indexrel->rd_indam->amgetbatch != NULL) + index_batch_init(scan); + /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heaprel); + scan->xs_heapfetch = table_index_fetch_begin(heaprel, ios_tableslot); return scan; } /* ---------------- - * index_getnext_tid - get the next TID from a scan + * index_getnext_tid - amgettuple interface * * The result is the next TID satisfying the scan keys, * or NULL if no more matching tuples exist. + * + * This should only be called by table AM's index_getnext_slot implementation, + * and only given an index AM that supports the single-tuple amgettuple + * interface. * ---------------- */ ItemPointer @@ -667,97 +686,6 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) return &scan->xs_heaptid; } -/* ---------------- - * index_fetch_heap - get the scan's next heap tuple - * - * The result is a visible heap tuple associated with the index TID most - * recently fetched by index_getnext_tid, or NULL if no more matching tuples - * exist. (There can be more than one matching tuple because of HOT chains, - * although when using an MVCC snapshot it should be impossible for more than - * one such tuple to exist.) - * - * On success, the buffer containing the heap tup is pinned (the pin will be - * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan - * call). - * - * Note: caller must check scan->xs_recheck, and perform rechecking of the - * scan keys if required. We do not do that here because we don't have - * enough information to do it efficiently in the general case. 
- * ---------------- - */ -bool -index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot) -{ - bool all_dead = false; - bool found; - - found = table_index_fetch_tuple(scan->xs_heapfetch, &scan->xs_heaptid, - scan->xs_snapshot, slot, - &scan->xs_heap_continue, &all_dead); - - if (found) - pgstat_count_heap_fetch(scan->indexRelation); - - /* - * If we scanned a whole HOT chain and found only dead tuples, tell index - * AM to kill its entry for that TID (this will take effect in the next - * amgettuple call, in index_getnext_tid). We do not do this when in - * recovery because it may violate MVCC to do so. See comments in - * RelationGetIndexScan(). - */ - if (!scan->xactStartedInRecovery) - scan->kill_prior_tuple = all_dead; - - return found; -} - -/* ---------------- - * index_getnext_slot - get the next tuple from a scan - * - * The result is true if a tuple satisfying the scan keys and the snapshot was - * found, false otherwise. The tuple is stored in the specified slot. - * - * On success, resources (like buffer pins) are likely to be held, and will be - * dropped by a future index_getnext_tid, index_fetch_heap or index_endscan - * call). - * - * Note: caller must check scan->xs_recheck, and perform rechecking of the - * scan keys if required. We do not do that here because we don't have - * enough information to do it efficiently in the general case. - * ---------------- - */ -bool -index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *slot) -{ - for (;;) - { - if (!scan->xs_heap_continue) - { - ItemPointer tid; - - /* Time to fetch the next TID from the index */ - tid = index_getnext_tid(scan, direction); - - /* If we're out of index entries, we're done */ - if (tid == NULL) - break; - - Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); - } - - /* - * Fetch the next (or only) visible heap tuple for this index entry. - * If we don't find anything, loop around and grab the next TID from - * the index. 
- */ - Assert(ItemPointerIsValid(&scan->xs_heaptid)); - if (index_fetch_heap(scan, slot)) - return true; - } - - return false; -} - /* ---------------- * index_getbitmap - get all tuples at once from an index scan * diff --git a/src/backend/access/index/indexbatch.c b/src/backend/access/index/indexbatch.c new file mode 100644 index 000000000..e30cc179c --- /dev/null +++ b/src/backend/access/index/indexbatch.c @@ -0,0 +1,640 @@ +/*------------------------------------------------------------------------- + * + * indexbatch.c + * amgetbatch implementation routines + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/index/indexbatch.c + * + * INTERFACE ROUTINES + * index_batch_init - Initialize fields needed by batching + * index_batch_reset - reset a batch + * index_batch_mark_pos - set a mark from current batch position + * index_batch_restore_pos - restore mark to current batch position + * index_batch_kill_item - record dead index tuple + * index_batch_end - end batch + * + * indexam_util_batch_unlock - unlock batch's buffer lock + * indexam_util_batch_alloc - allocate another batch + * indexam_util_batch_release - release allocated batch + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/amapi.h" +#include "access/tableam.h" +#include "optimizer/cost.h" +#include "pgstat.h" +#include "utils/memdebug.h" + +static void batch_debug_print_batches(const char *label, IndexScanDesc scan); + +/* + * index_batch_init + * Initialize various fields and arrays needed by batching. + * + * Sets up the batch queue structure and its initial read position. Also + * determines whether the scan will eagerly drop index page pins. It isn't + * safe to drop index page pins eagerly when doing so risks breaking an + * assumption (about table TID recycling) that amfreebatch routines make when + * setting LP_DEAD bits for known-dead index tuples. Specifically, buffer + * pins on index pages serve as interlocks preventing VACUUM from recycling + * TIDs on those pages, protecting the table AM from confusing a recycled TID + * with the original row it meant to reference. + */ +void +index_batch_init(IndexScanDesc scan) +{ + /* Both amgetbatch and amfreebatch must be present together */ + Assert(scan->indexRelation->rd_indam->amgetbatch != NULL); + Assert(scan->indexRelation->rd_indam->amfreebatch != NULL); + + scan->batchqueue = palloc(sizeof(BatchQueue)); + + /* + * We prefer to eagerly drop leaf page pins before amgetbatch returns. + * This avoids making VACUUM wait to acquire a cleanup lock on the page. + * + * We cannot safely drop leaf page pins during index-only scans due to a + * race condition involving VACUUM setting pages all-visible in the VM. + * It's also unsafe for plain index scans that use a non-MVCC snapshot. + * + * When we drop pins eagerly, the mechanism that marks index tuples as + * LP_DEAD has to deal with concurrent TID recycling races. The scheme + * used to detect unsafe TID recycling won't work when scanning unlogged + * relations (since it involves saving an affected page's LSN). Opt out + * of eager pin dropping during unlogged relation scans for now. 
+ */ + scan->batchqueue->dropPin = + (!scan->xs_want_itup && IsMVCCSnapshot(scan->xs_snapshot) && + RelationNeedsWAL(scan->indexRelation)); + scan->batchqueue->finished = false; + scan->batchqueue->direction = NoMovementScanDirection; + + /* positions in the queue of batches */ + batch_reset_pos(&scan->batchqueue->readPos); + batch_reset_pos(&scan->batchqueue->markPos); + + scan->batchqueue->markBatch = NULL; + scan->batchqueue->headBatch = 0; /* initial head batch */ + scan->batchqueue->nextBatch = 0; /* initial batch starts empty */ + memset(&scan->batchqueue->cache, 0, sizeof(scan->batchqueue->cache)); +} + +/* ---------------- + * batch_getnext - get the next batch of TIDs from a scan + * + * Called by table AM's ordered index scan implementation when it needs to + * load the next batch of index entries to process in the given direction. + * + * The table AM controls the overall progress of the scan, deciding when to + * request new batches. This division of labor gives the table AM the ability + * to reorder fetches of nearby table tuples (from the same batch, or from + * adjacent batches) based on its own considerations. Importantly, table AMs + * are _not_ required to free a batch before loading the next batch during an + * index scan of an index that uses the amgetbatch/amfreebatch interface. + * (This isn't possible with the single-tuple amgettuple interface, which gives + * the index AM direct control over the progress of the index scan. amgettuple + * index scans perform the work that we perform in batch_free as the scan + * progresses, and without notifying the table AM, which makes it impossible + * to safely reorder work in the way that our callers can.) + * + * Returns true if we managed to read a batch of TIDs, or false if there are + * no more batches in the given scan direction. + * ---------------- + */ +bool +batch_getnext(IndexScanDesc scan, ScanDirection direction) +{ + BatchQueue *batchqueue = scan->batchqueue; + BatchIndexScan priorbatch = NULL, + batch = NULL; + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); + + /* Did we already read the last batch for this scan? */ + if (batchqueue->finished) + return false; + + Assert(!INDEX_SCAN_BATCH_FULL(scan)); + + batch_debug_print_batches("batch_getnext / start", scan); + + /* + * Get the previously returned batch to pass to amgetbatch. The index AM + * uses this to determine which index page to read next, typically by + * following page links forward or backward. + */ + if (batchqueue->headBatch < batchqueue->nextBatch) + priorbatch = INDEX_SCAN_BATCH(scan, batchqueue->nextBatch - 1); + + batch = scan->indexRelation->rd_indam->amgetbatch(scan, priorbatch, + direction); + if (batch != NULL) + { + /* We got the batch from the AM -- add it to our queue */ + int batchIndex = batchqueue->nextBatch; + + INDEX_SCAN_BATCH(scan, batchIndex) = batch; + + batchqueue->nextBatch++; + + DEBUG_LOG("batch_getnext headBatch %d nextBatch %d batch %p", + batchqueue->headBatch, batchqueue->nextBatch, batch); + } + else + batchqueue->finished = true; + + batch_assert_batches_valid(scan); + + batch_debug_print_batches("batch_getnext / end", scan); + + return (batch != NULL); +} + +/* ---------------- + * index_batch_reset - reset batch queue and read position + * + * Resets all loaded batches in the queue, and resets the read position to the + * initial state (or just initialize queue state). 
When 'complete' is true,
+ * also frees the scan's marked batch (if any), which is useful when ending an
+ * amgetbatch-based index scan.
+ * ----------------
+ */
+void
+index_batch_reset(IndexScanDesc scan, bool complete)
+{
+	BatchQueue *batchqueue = scan->batchqueue;
+
+	/* bail out if batching not enabled */
+	if (!batchqueue)
+		return;
+
+	batch_assert_batches_valid(scan);
+	batch_debug_print_batches("index_batch_reset", scan);
+	Assert(scan->xs_heapfetch);
+
+	/* reset the positions */
+	batch_reset_pos(&batchqueue->readPos);
+
+	/*
+	 * With "complete" reset, make sure to also free the marked batch, either
+	 * by just forgetting it (if it's still in the queue), or by explicitly
+	 * freeing it.
+	 */
+	if (complete && unlikely(batchqueue->markBatch != NULL))
+	{
+		BatchQueueItemPos *markPos = &batchqueue->markPos;
+		BatchIndexScan markBatch = batchqueue->markBatch;
+
+		/* always reset the position, forget the marked batch */
+		batchqueue->markBatch = NULL;
+
+		/*
+		 * If we've already moved past the marked batch (it's not in the
+		 * current queue), free it explicitly.  Otherwise it'll be freed
+		 * later, by the loop below that releases all currently loaded
+		 * batches.
+		 */
+		if (markPos->batch < batchqueue->headBatch ||
+			markPos->batch >= batchqueue->nextBatch)
+			batch_free(scan, markBatch);
+
+		/* reset position only after the queue range check */
+		batch_reset_pos(&batchqueue->markPos);
+	}
+
+	/* now release all other currently loaded batches */
+	while (batchqueue->headBatch < batchqueue->nextBatch)
+	{
+		BatchIndexScan batch = INDEX_SCAN_BATCH(scan, batchqueue->headBatch);
+
+		DEBUG_LOG("freeing batch %d %p", batchqueue->headBatch, batch);
+
+		batch_free(scan, batch);
+
+		/* update the valid range, so that asserts / debugging works */
+		batchqueue->headBatch++;
+	}
+
+	/* reset relevant batch state fields */
+	batchqueue->headBatch = 0;	/* initial batch */
+	batchqueue->nextBatch = 0;	/* initial batch is empty */
+
+	batchqueue->finished = false;
+
+	batch_assert_batches_valid(scan);
+}
+
+/* ----------------
+ *		index_batch_mark_pos - mark current position in scan for restoration
+ *
+ * Saves the current read position and associated batch so that the scan can
+ * be restored to this point later, via a call to index_batch_restore_pos.
+ * The marked batch is retained and not freed until a new mark is set or the
+ * scan ends (or until the mark is restored).
+ * ----------------
+ */
+void
+index_batch_mark_pos(IndexScanDesc scan)
+{
+	BatchQueue *batchqueue = scan->batchqueue;
+	BatchQueueItemPos *markPos = &batchqueue->markPos;
+	BatchIndexScan markBatch = batchqueue->markBatch;
+
+	/*
+	 * Free the previous mark batch (if any), but only if that batch is no
+	 * longer valid (no longer within the current head/next range).  Note
+	 * that we don't have to do this in the common case where we mark a
+	 * position that comes from our current readBatch.
+	 */
+	if (markBatch != NULL && (markPos->batch < batchqueue->headBatch ||
+							  markPos->batch >= batchqueue->nextBatch))
+	{
+		batchqueue->markBatch = NULL;
+		batch_free(scan, markBatch);
+	}
+
+	/* copy the read position */
+	batchqueue->markPos = batchqueue->readPos;
+	batchqueue->markBatch = INDEX_SCAN_BATCH(scan, batchqueue->markPos.batch);
+
+	/* readPos/markPos must be valid */
+	batch_assert_pos_valid(scan, &batchqueue->markPos);
+}
+
+/* ----------------
+ *		index_batch_restore_pos - restore scan to a previously marked position
+ *
+ * Restores the scan to a position previously saved by index_batch_mark_pos.
+ * The marked batch is restored as the current batch, allowing the scan to + * resume from the marked position. Also notifies the index AM via a call to + * its amposreset routine, which allows it to invalidate any private state + * that independently tracks scan progress (such as array key state) + * + * Function currently just discards most batch queue state. It might make + * sense to teach it to hold on to other nearby batches (still-held batches + * that are likely to be needed once the scan finishes returning matching + * items from the restored batch) as an optimization. Such a scheme would + * have the benefit of avoiding repeat calls to amgetbatch/repeatedly reading + * the same index pages. + * ---------------- + */ +void +index_batch_restore_pos(IndexScanDesc scan) +{ + BatchQueue *batchqueue = scan->batchqueue; + BatchQueueItemPos *markPos = &batchqueue->markPos; + BatchQueueItemPos *readPos = &batchqueue->readPos; + BatchIndexScan markBatch = batchqueue->markBatch; + + if (readPos->batch == markPos->batch && + readPos->batch == batchqueue->headBatch) + { + /* + * We don't have to discard the scan's state after all, since the + * current headBatch is also the batch that we're restoring to + */ + readPos->item = markPos->item; + return; + } + + /* + * Call amposreset to let index AM know to invalidate any private state + * that independently tracks the scan's progress + */ + scan->indexRelation->rd_indam->amposreset(scan, markBatch); + + /* + * Reset the batching state, except for the marked batch, and make it look + * like we have a single batch -- the marked one. + */ + index_batch_reset(scan, false); + + batchqueue->readPos = *markPos; + batchqueue->headBatch = markPos->batch; + batchqueue->nextBatch = markPos->batch + 1; + + INDEX_SCAN_BATCH(scan, batchqueue->markPos.batch) = markBatch; + batchqueue->markBatch = markBatch; +} + +/* + * batch_free + * Release resources associated with a batch returned by the index AM. + * + * Called by table AM's ordered index scan implementation when it is finished + * with a batch and wishes to release its resources. + * + * This calls the index AM's amfreebatch callback to release AM-specific + * resources, and to set LP_DEAD bits on the batch's index page. It isn't + * safe for table AMs to fetch table tuples using TIDs saved from a batch that + * was already freed: 'dropPin' scans need the index AM to retain a pin on the + * TID's index page, as an interlock against concurrent TID recycling. + */ +void +batch_free(IndexScanDesc scan, BatchIndexScan batch) +{ + batch_assert_batch_valid(scan, batch); + + /* don't free the batch that is marked */ + if (batch == scan->batchqueue->markBatch) + return; + + scan->indexRelation->rd_indam->amfreebatch(scan, batch); +} + +/* ---------------- + * index_batch_kill_item - record item for deferred LP_DEAD marking + * + * Records the item index of the currently-read tuple in readBatch's + * killedItems array. The items' index tuples will later be marked LP_DEAD + * when current readBatch is freed by amfreebatch routine (see batch_free). 
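+ *
+ * To illustrate (a rough sketch only; "all_dead" stands for whichever
+ * all-dead indication the table AM's fetch routine produced, and isn't a
+ * field defined by this interface), a table AM's index_getnext_slot
+ * implementation might do something like:
+ *
+ *		if (all_dead && !scan->xactStartedInRecovery)
+ *			index_batch_kill_item(scan);
+ *
+ * much as amgettuple-based scans set scan->kill_prior_tuple in this case.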
+ * ----------------
+ */
+void
+index_batch_kill_item(IndexScanDesc scan)
+{
+	BatchQueueItemPos *readPos = &scan->batchqueue->readPos;
+	BatchIndexScan readBatch = INDEX_SCAN_BATCH(scan, readPos->batch);
+
+	batch_assert_pos_valid(scan, readPos);
+
+	if (readBatch->killedItems == NULL)
+		readBatch->killedItems = (int *)
+			palloc(readBatch->maxitems * sizeof(int));
+	if (readBatch->numKilled < readBatch->maxitems)
+		readBatch->killedItems[readBatch->numKilled++] = readPos->item;
+}
+
+/* ----------------
+ *		index_batch_end - end a batch scan and free all resources
+ *
+ * Called when an index scan is being ended, right before the owning scan
+ * descriptor goes away.  Cleans up all batch-related resources.
+ * ----------------
+ */
+void
+index_batch_end(IndexScanDesc scan)
+{
+	index_batch_reset(scan, true);
+
+	/* bail out if batching not enabled */
+	if (!scan->batchqueue)
+		return;
+
+	for (int i = 0; i < INDEX_SCAN_CACHE_BATCHES; i++)
+	{
+		if (scan->batchqueue->cache[i] == NULL)
+			continue;
+
+		pfree(scan->batchqueue->cache[i]);
+	}
+
+	pfree(scan->batchqueue);
+}
+
+/* ----------------------------------------------------------------
+ *	utility functions called by amgetbatch index AMs
+ *
+ * These functions manage batch allocation, unlock/pin management, and batch
+ * resource recycling.  Index AMs implementing amgetbatch should use these
+ * rather than managing buffers directly.
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * indexam_util_batch_unlock - Drop lock and conditionally drop pin on batch page
+ *
+ * Unlocks caller's batch->buf in preparation for amgetbatch returning items
+ * saved in that batch.  Manages the details of dropping the lock and possibly
+ * the pin for the index AM caller (dropping the pin prevents VACUUM from
+ * blocking on acquiring a cleanup lock, but isn't always safe).
+ *
+ * Only call here when a batch has one or more matching items to return using
+ * amgetbatch (or for amgetbitmap to load into its bitmap of matching TIDs).
+ * When an index page has no matches, it's always safe for index AMs to drop
+ * both the lock and the pin for themselves.
+ *
+ * Note: It is convenient for index AMs that implement both amgetbatch and
+ * amgetbitmap to consistently use the same batch management approach, since
+ * that avoids introducing special cases to lower-level code.  We always drop
+ * both the lock and the pin on the batch's page on behalf of amgetbitmap
+ * callers.  Such amgetbitmap callers must be careful to free all batches with
+ * matching items once they're done saving the matching TIDs (there will never
+ * be any calls to amfreebatch, so amgetbitmap must call
+ * indexam_util_batch_release directly, in lieu of a deferred call to
+ * amfreebatch from core code).
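+ *
+ * The expected call sequence from an index AM's amgetbatch routine is
+ * roughly as follows (a sketch, not a hard requirement of the interface):
+ *
+ *		batch = indexam_util_batch_alloc(scan, maxitems, scan->xs_want_itup);
+ *		... pin and read-lock the next index page ...
+ *		... save matching items into batch->items ...
+ *		indexam_util_batch_unlock(scan, batch);
+ *		return batch;
+ *
+ * Pages without any matches are instead unlocked and unpinned by the index
+ * AM itself, with the unused batch handed back via indexam_util_batch_release.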
+ */ +void +indexam_util_batch_unlock(IndexScanDesc scan, BatchIndexScan batch) +{ + Relation rel = scan->indexRelation; + bool dropPin = !scan->batchqueue || scan->batchqueue->dropPin; + + /* batch must have one or more matching items returned by index AM */ + Assert(batch->firstItem >= 0 && batch->firstItem <= batch->lastItem); + + if (!dropPin) + { + if (!RelationUsesLocalBuffers(rel)) + VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(batch->buf), BLCKSZ); + + /* Just drop the lock (not the pin) */ + LockBuffer(batch->buf, BUFFER_LOCK_UNLOCK); + return; + } + + if (scan->batchqueue) + { + /* amgetbatch (not amgetbitmap) caller */ + Assert(scan->heapRelation != NULL); + + /* + * Have to set batch->lsn so that amfreebatch has a way to detect when + * concurrent heap TID recycling by VACUUM might have taken place. + * It'll only be safe to set any index tuple LP_DEAD bits when the + * page LSN hasn't advanced. + */ + Assert(RelationNeedsWAL(rel)); + batch->lsn = BufferGetLSNAtomic(batch->buf); + } + + /* Drop both the lock and the pin */ + LockBuffer(batch->buf, BUFFER_LOCK_UNLOCK); + if (!RelationUsesLocalBuffers(rel)) + VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(batch->buf), BLCKSZ); + ReleaseBuffer(batch->buf); + batch->buf = InvalidBuffer; /* defensive */ +} + +/* + * indexam_util_batch_alloc + * Allocate a batch that can fit maxitems-many BatchMatchingItems. + * + * Returns a BatchIndexScan sized to caller's required maxitems capacity. + * This will either be a newly allocated batch, or a batch reused from a cache + * of batches already freed by calling indexam_util_batch_release. See + * comments above indexam_util_batch_release. + * + * We assume that all calls here during the same index scan will always use + * the same maxitems and want_itup arguments. Index AMs that use batches + * should call this from either their amgetbatch or amgetbitmap routines. + * They must not call here from other routines (particularly not amfreebatch). + */ +BatchIndexScan +indexam_util_batch_alloc(IndexScanDesc scan, int maxitems, bool want_itup) +{ + BatchIndexScan batch = NULL; + + /* First look for an existing batch from queue's cache of batches */ + if (scan->batchqueue != NULL) + { + for (int i = 0; i < INDEX_SCAN_CACHE_BATCHES; i++) + { + if (scan->batchqueue->cache[i] != NULL) + { + /* Return cached unreferenced batch */ + batch = scan->batchqueue->cache[i]; + scan->batchqueue->cache[i] = NULL; + break; + } + } + } + + if (!batch) + { + batch = palloc(offsetof(BatchIndexScanData, items) + + sizeof(BatchMatchingItem) * maxitems); + + batch->maxitems = maxitems; + + /* + * If we are doing an index-only scan, we need a tuple storage + * workspace. We allocate BLCKSZ for this, which should always give + * the index AM enough space to fit a full page's worth of tuples. + */ + batch->currTuples = NULL; + if (want_itup) + batch->currTuples = palloc(BLCKSZ); + + /* + * Batches allocate killedItems lazily (though note that cached + * batches keep their killedItems allocation when recycled) + */ + batch->killedItems = NULL; + } + + /* want_itup callers must get a currTuples space */ + Assert(batch->maxitems == maxitems); + Assert(!(want_itup && (batch->currTuples == NULL))); + + /* shared initialization */ + batch->buf = InvalidBuffer; + batch->firstItem = -1; + batch->lastItem = -1; + batch->numKilled = 0; + + return batch; +} + +/* + * indexam_util_batch_release + * Either stash the batch in a small cache for reuse, or free it. 
+ * + * This function is called by index AMs to release a batch allocated by + * indexam_util_batch_alloc. Batches are cached here for reuse (when scan + * hasn't already finished) to reduce palloc/pfree overhead. + * + * It's safe to release a batch immediately when it was used to read a page + * that returned no matches to the scan. Batches actually returned by index + * AM's amgetbatch routine (i.e. batches for pages with one or more matches) + * must be released by calling here at the end of their amfreebatch routine. + * Index AMs that uses batches should call here to release a batch from any of + * their amgetbatch, amgetbitmap, and amfreebatch routines. + */ +void +indexam_util_batch_release(IndexScanDesc scan, BatchIndexScan batch) +{ + Assert(batch->buf == InvalidBuffer); + + if (scan->batchqueue) + { + /* amgetbatch scan caller */ + Assert(scan->heapRelation != NULL); + + if (scan->batchqueue->finished) + { + /* Don't bother using cache when scan is ending */ + pfree(batch); + return; + } + + /* + * Use cache. This is generally only beneficial when there are many + * small rescans of an index. + */ + for (int i = 0; i < INDEX_SCAN_CACHE_BATCHES; i++) + { + if (scan->batchqueue->cache[i] == NULL) + { + /* found empty slot, we're done */ + scan->batchqueue->cache[i] = batch; + return; + } + } + + /* + * Failed to find a free slot for this batch. We'll just free it + * ourselves. This isn't really expected; it's just defensive. + */ + if (batch->killedItems) + pfree(batch->killedItems); + if (batch->currTuples) + pfree(batch->currTuples); + } + else + { + /* amgetbitmap scan caller */ + Assert(scan->heapRelation == NULL); + Assert(batch->killedItems == NULL); + Assert(batch->currTuples == NULL); + } + + /* no free slot to save this batch (expected with amgetbitmap callers) */ + pfree(batch); +} + +static void +batch_debug_print_batches(const char *label, IndexScanDesc scan) +{ +#ifdef INDEXAM_DEBUG + BatchQueue *batchqueue = scan->batchqueue; + + if (!scan->batchqueue) + return; + + if (!AmRegularBackendProcess()) + return; + if (IsCatalogRelation(scan->indexRelation)) + return; + + DEBUG_LOG("%s: batches headBatch %d nextBatch %d", + label, + batchqueue->headBatch, batchqueue->nextBatch); + + for (int i = batchqueue->headBatch; i < batchqueue->nextBatch; i++) + { + BatchIndexScan batch = INDEX_SCAN_BATCH(scan, i); + + DEBUG_LOG(" batch %d currPage %u %p firstItem %d lastItem %d killed %d", + i, batch->currPage, batch, batch->firstItem, + batch->lastItem, batch->numKilled); + } +#endif +} diff --git a/src/backend/access/index/meson.build b/src/backend/access/index/meson.build index e29c03089..c01fff708 100644 --- a/src/backend/access/index/meson.build +++ b/src/backend/access/index/meson.build @@ -5,4 +5,5 @@ backend_sources += files( 'amvalidate.c', 'genam.c', 'indexam.c', + 'indexbatch.c', ) diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index cfb07b2bc..15b788e3a 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1033,6 +1033,9 @@ _bt_relbuf(Relation rel, Buffer buf) * Lock is acquired without acquiring another pin. This is like a raw * LockBuffer() call, but performs extra steps needed by Valgrind. * + * Note: indexam_util_batch_unlock has similar Valgrind buffer lock + * instrumentation, which we rely on here. + * * Note: Caller may need to call _bt_checkpage() with buf when pin on buf * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf(). 
*/ diff --git a/src/backend/access/nbtree/nbtreadpage.c b/src/backend/access/nbtree/nbtreadpage.c index b3b8b5534..c0d22ee51 100644 --- a/src/backend/access/nbtree/nbtreadpage.c +++ b/src/backend/access/nbtree/nbtreadpage.c @@ -32,6 +32,7 @@ typedef struct BTReadPageState { /* Input parameters, set by _bt_readpage for _bt_checkkeys */ ScanDirection dir; /* current scan direction */ + BlockNumber currpage; /* current page being read */ OffsetNumber minoff; /* Lowest non-pivot tuple's offset */ OffsetNumber maxoff; /* Highest non-pivot tuple's offset */ IndexTuple finaltup; /* Needed by scans with array keys */ @@ -63,14 +64,13 @@ static bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup); static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup); -static void _bt_saveitem(BTScanOpaque so, int itemIndex, - OffsetNumber offnum, IndexTuple itup); -static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, - OffsetNumber offnum, const ItemPointerData *heapTid, - IndexTuple itup); -static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, - OffsetNumber offnum, - ItemPointer heapTid, int tupleOffset); +static void _bt_saveitem(BatchIndexScan newbatch, int itemIndex, OffsetNumber offnum, + IndexTuple itup, int *tupleOffset); +static int _bt_setuppostingitems(BatchIndexScan newbatch, int itemIndex, + OffsetNumber offnum, const ItemPointerData *heapTid, + IndexTuple itup, int *tupleOffset); +static inline void _bt_savepostingitem(BatchIndexScan newbatch, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, int baseOffset); static bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts); static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, @@ -111,15 +111,15 @@ static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan); /* - * _bt_readpage() -- Load data from current index page into so->currPos + * _bt_readpage() -- Load data from current index page into newbatch. * - * Caller must have pinned and read-locked so->currPos.buf; the buffer's state - * is not changed here. Also, currPos.moreLeft and moreRight must be valid; - * they are updated as appropriate. All other fields of so->currPos are - * initialized from scratch here. + * Caller must have pinned and read-locked newbatch.buf; the buffer's state is + * not changed here. Also, pos.moreLeft and moreRight must be valid; they are + * updated as appropriate. All other fields of newbatch are initialized from + * scratch here. * * We scan the current page starting at offnum and moving in the indicated - * direction. All items matching the scan keys are loaded into currPos.items. + * direction. All items matching the scan keys are saved in newbatch.items. * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports * that there can be no more matching tuples in the current scan direction * (could just be for the current primitive index scan when scan has arrays). @@ -131,8 +131,8 @@ static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan); * Returns true if any matching items found on the page, false if none. 
*/ bool -_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, - bool firstpage) +_bt_readpage(IndexScanDesc scan, BatchIndexScan newbatch, ScanDirection dir, + OffsetNumber offnum, bool firstpage) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; @@ -144,23 +144,20 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool arrayKeys, ignore_killed_tuples = scan->ignore_killed_tuples; int itemIndex, + tupleOffset = 0, indnatts; /* save the page/buffer block number, along with its sibling links */ - page = BufferGetPage(so->currPos.buf); + page = BufferGetPage(newbatch->buf); opaque = BTPageGetOpaque(page); - so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); - so->currPos.prevPage = opaque->btpo_prev; - so->currPos.nextPage = opaque->btpo_next; - /* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */ - pstate.dir = so->currPos.dir = dir; - so->currPos.nextTupleOffset = 0; + pstate.currpage = newbatch->currPage = BufferGetBlockNumber(newbatch->buf); + newbatch->prevPage = opaque->btpo_prev; + newbatch->nextPage = opaque->btpo_next; + pstate.dir = newbatch->dir = dir; /* either moreRight or moreLeft should be set now (may be unset later) */ - Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight : - so->currPos.moreLeft); + Assert(ScanDirectionIsForward(dir) ? newbatch->moreRight : newbatch->moreLeft); Assert(!P_IGNORE(opaque)); - Assert(BTScanPosIsPinned(so->currPos)); Assert(!so->needPrimScan); /* initialize local variables */ @@ -188,14 +185,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, { /* allow next/prev page to be read by other worker without delay */ if (ScanDirectionIsForward(dir)) - _bt_parallel_release(scan, so->currPos.nextPage, - so->currPos.currPage); + _bt_parallel_release(scan, newbatch->nextPage, newbatch->currPage); else - _bt_parallel_release(scan, so->currPos.prevPage, - so->currPos.currPage); + _bt_parallel_release(scan, newbatch->prevPage, newbatch->currPage); } - PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot); + PredicateLockPage(rel, newbatch->currPage, scan->xs_snapshot); if (ScanDirectionIsForward(dir)) { @@ -212,11 +207,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) { /* Schedule another primitive index scan after all */ - so->currPos.moreRight = false; + newbatch->moreRight = false; so->needPrimScan = true; if (scan->parallel_scan) - _bt_parallel_primscan_schedule(scan, - so->currPos.currPage); + _bt_parallel_primscan_schedule(scan, newbatch->currPage); return false; } } @@ -280,26 +274,26 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, if (!BTreeTupleIsPosting(itup)) { /* Remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem(newbatch, itemIndex, offnum, itup, &tupleOffset); itemIndex++; } else { - int tupleOffset; + int baseOffset; /* Set up posting list state (and remember first TID) */ - tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, + baseOffset = + _bt_setuppostingitems(newbatch, itemIndex, offnum, BTreeTupleGetPostingN(itup, 0), - itup); + itup, &tupleOffset); itemIndex++; /* Remember all later TIDs (must be at least one) */ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) { - _bt_savepostingitem(so, itemIndex, offnum, + _bt_savepostingitem(newbatch, itemIndex, offnum, BTreeTupleGetPostingN(itup, i), - tupleOffset); + baseOffset); itemIndex++; } } @@ 
-339,12 +333,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, } if (!pstate.continuescan) - so->currPos.moreRight = false; + newbatch->moreRight = false; Assert(itemIndex <= MaxTIDsPerBTreePage); - so->currPos.firstItem = 0; - so->currPos.lastItem = itemIndex - 1; - so->currPos.itemIndex = 0; + newbatch->firstItem = 0; + newbatch->lastItem = itemIndex - 1; } else { @@ -361,11 +354,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) { /* Schedule another primitive index scan after all */ - so->currPos.moreLeft = false; + newbatch->moreLeft = false; so->needPrimScan = true; if (scan->parallel_scan) - _bt_parallel_primscan_schedule(scan, - so->currPos.currPage); + _bt_parallel_primscan_schedule(scan, newbatch->currPage); return false; } } @@ -466,27 +458,27 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, { /* Remember it */ itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem(newbatch, itemIndex, offnum, itup, &tupleOffset); } else { uint16 nitems = BTreeTupleGetNPosting(itup); - int tupleOffset; + int baseOffset; /* Set up posting list state (and remember last TID) */ itemIndex--; - tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, + baseOffset = + _bt_setuppostingitems(newbatch, itemIndex, offnum, BTreeTupleGetPostingN(itup, nitems - 1), - itup); + itup, &tupleOffset); /* Remember all prior TIDs (must be at least one) */ for (int i = nitems - 2; i >= 0; i--) { itemIndex--; - _bt_savepostingitem(so, itemIndex, offnum, + _bt_savepostingitem(newbatch, itemIndex, offnum, BTreeTupleGetPostingN(itup, i), - tupleOffset); + baseOffset); } } } @@ -502,12 +494,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * be found there */ if (!pstate.continuescan) - so->currPos.moreLeft = false; + newbatch->moreLeft = false; Assert(itemIndex >= 0); - so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxTIDsPerBTreePage - 1; - so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; + newbatch->firstItem = itemIndex; + newbatch->lastItem = MaxTIDsPerBTreePage - 1; } /* @@ -524,7 +515,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, */ Assert(!pstate.forcenonrequired); - return (so->currPos.firstItem <= so->currPos.lastItem); + return (newbatch->firstItem <= newbatch->lastItem); } /* @@ -1027,90 +1018,93 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, return true; } -/* Save an index item into so->currPos.items[itemIndex] */ +/* Save an index item into newbatch.items[itemIndex] */ static void -_bt_saveitem(BTScanOpaque so, int itemIndex, - OffsetNumber offnum, IndexTuple itup) +_bt_saveitem(BatchIndexScan newbatch, int itemIndex, OffsetNumber offnum, + IndexTuple itup, int *tupleOffset) { - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; - Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup)); - currItem->heapTid = itup->t_tid; - currItem->indexOffset = offnum; - if (so->currTuples) + /* copy the populated part of the items array */ + newbatch->items[itemIndex].heapTid = itup->t_tid; + newbatch->items[itemIndex].indexOffset = offnum; + + if (newbatch->currTuples) { Size itupsz = IndexTupleSize(itup); - currItem->tupleOffset = so->currPos.nextTupleOffset; - memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); - so->currPos.nextTupleOffset += MAXALIGN(itupsz); + newbatch->items[itemIndex].tupleOffset = *tupleOffset; + 
memcpy(newbatch->currTuples + *tupleOffset, itup, itupsz); + *tupleOffset += MAXALIGN(itupsz); } } /* * Setup state to save TIDs/items from a single posting list tuple. * - * Saves an index item into so->currPos.items[itemIndex] for TID that is - * returned to scan first. Second or subsequent TIDs for posting list should - * be saved by calling _bt_savepostingitem(). + * Saves an index item into newbatch.items[itemIndex] for TID that is returned + * to scan first. Second or subsequent TIDs for posting list should be saved + * by calling _bt_savepostingitem(). * - * Returns an offset into tuple storage space that main tuple is stored at if - * needed. + * Returns baseOffset, an offset into tuple storage space that main tuple is + * stored at if needed. */ static int -_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, - const ItemPointerData *heapTid, IndexTuple itup) +_bt_setuppostingitems(BatchIndexScan newbatch, int itemIndex, + OffsetNumber offnum, const ItemPointerData *heapTid, + IndexTuple itup, int *tupleOffset) { - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + BatchMatchingItem *item = &newbatch->items[itemIndex]; Assert(BTreeTupleIsPosting(itup)); - currItem->heapTid = *heapTid; - currItem->indexOffset = offnum; - if (so->currTuples) + /* copy the populated part of the items array */ + item->heapTid = *heapTid; + item->indexOffset = offnum; + + if (newbatch->currTuples) { /* Save base IndexTuple (truncate posting list) */ IndexTuple base; Size itupsz = BTreeTupleGetPostingOffset(itup); itupsz = MAXALIGN(itupsz); - currItem->tupleOffset = so->currPos.nextTupleOffset; - base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); + item->tupleOffset = *tupleOffset; + base = (IndexTuple) (newbatch->currTuples + *tupleOffset); memcpy(base, itup, itupsz); /* Defensively reduce work area index tuple header size */ base->t_info &= ~INDEX_SIZE_MASK; base->t_info |= itupsz; - so->currPos.nextTupleOffset += itupsz; + *tupleOffset += itupsz; - return currItem->tupleOffset; + return item->tupleOffset; } return 0; } /* - * Save an index item into so->currPos.items[itemIndex] for current posting + * Save an index item into newbatch.items[itemIndex] for current posting * tuple. * * Assumes that _bt_setuppostingitems() has already been called for current - * posting list tuple. Caller passes its return value as tupleOffset. + * posting list tuple. Caller passes its return value as baseOffset. */ static inline void -_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, - ItemPointer heapTid, int tupleOffset) +_bt_savepostingitem(BatchIndexScan newbatch, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, int baseOffset) { - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + BatchMatchingItem *item = &newbatch->items[itemIndex]; - currItem->heapTid = *heapTid; - currItem->indexOffset = offnum; + item->heapTid = *heapTid; + item->indexOffset = offnum; /* * Have index-only scans return the same base IndexTuple for every TID * that originates from the same posting list */ - if (so->currTuples) - currItem->tupleOffset = tupleOffset; + if (newbatch->currTuples) + item->tupleOffset = baseOffset; } #define LOOK_AHEAD_REQUIRED_RECHECKS 3 @@ -2822,13 +2816,13 @@ new_prim_scan: * Note: We make a soft assumption that the current scan direction will * also be used within _bt_next, when it is asked to step off this page. 
* It is up to _bt_next to cancel this scheduled primitive index scan - * whenever it steps to a page in the direction opposite currPos.dir. + * whenever it steps to a page in the direction opposite pstate->dir. */ pstate->continuescan = false; /* Tell _bt_readpage we're done... */ so->needPrimScan = true; /* ...but call _bt_first again */ if (scan->parallel_scan) - _bt_parallel_primscan_schedule(scan, so->currPos.currPage); + _bt_parallel_primscan_schedule(scan, pstate->currpage); /* Caller's tuple doesn't match the new qual */ return false; @@ -2913,14 +2907,6 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, * Restore the array keys to the state they were in immediately before we * were called. This ensures that the arrays only ever ratchet in the * current scan direction. - * - * Without this, scans could overlook matching tuples when the scan - * direction gets reversed just before btgettuple runs out of items to - * return, but just after _bt_readpage prepares all the items from the - * scan's final page in so->currPos. When we're on the final page it is - * typical for so->currPos to get invalidated once btgettuple finally - * returns false, which'll effectively invalidate the scan's array keys. - * That hasn't happened yet, though -- and in general it may never happen. */ _bt_start_array_keys(scan, -dir); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 6197b725f..1dcebd8ae 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -159,11 +159,12 @@ bthandler(PG_FUNCTION_ARGS) amroutine->amadjustmembers = btadjustmembers; amroutine->ambeginscan = btbeginscan; amroutine->amrescan = btrescan; - amroutine->amgettuple = btgettuple; + amroutine->amgettuple = NULL; + amroutine->amgetbatch = btgetbatch; + amroutine->amfreebatch = btfreebatch; amroutine->amgetbitmap = btgetbitmap; amroutine->amendscan = btendscan; - amroutine->ammarkpos = btmarkpos; - amroutine->amrestrpos = btrestrpos; + amroutine->amposreset = btposreset; amroutine->amestimateparallelscan = btestimateparallelscan; amroutine->aminitparallelscan = btinitparallelscan; amroutine->amparallelrescan = btparallelrescan; @@ -221,13 +222,13 @@ btinsert(Relation rel, Datum *values, bool *isnull, } /* - * btgettuple() -- Get the next tuple in the scan. + * btgetbatch() -- Get the first or next batch of tuples in the scan */ -bool -btgettuple(IndexScanDesc scan, ScanDirection dir) +BatchIndexScan +btgetbatch(IndexScanDesc scan, BatchIndexScan priorbatch, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - bool res; + BatchIndexScan batch = priorbatch; Assert(scan->heapRelation != NULL); @@ -242,43 +243,18 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) * the appropriate direction. If we haven't done so yet, we call * _bt_first() to get the first item in the scan. */ - if (!BTScanPosIsValid(so->currPos)) - res = _bt_first(scan, dir); + if (batch == NULL) + batch = _bt_first(scan, dir); else - { - /* - * Check to see if we should kill the previously-fetched tuple. - */ - if (scan->kill_prior_tuple) - { - /* - * Yes, remember it for later. (We'll deal with all such - * tuples at once right before leaving the index page.) The - * test for numKilled overrun is not just paranoia: if the - * caller reverses direction in the indexscan then the same - * item might get entered multiple times. It's not worth - * trying to optimize that, so we don't detect it, but instead - * just forget any excess entries. 
-			 */
-			if (so->killedItems == NULL)
-				so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
-			if (so->numKilled < MaxTIDsPerBTreePage)
-				so->killedItems[so->numKilled++] = so->currPos.itemIndex;
-		}
+			batch = _bt_next(scan, dir, batch);
 
-		/*
-		 * Now continue the scan.
-		 */
-		res = _bt_next(scan, dir);
-	}
-
-	/* If we have a tuple, return it ... */
-	if (res)
+		/* If we have a batch, return it ... */
+		if (batch)
 			break;
 		/* ... otherwise see if we need another primitive index scan */
 	} while (so->numArrayKeys && _bt_start_prim_scan(scan));
 
-	return res;
+	return batch;
 }
 
 /*
@@ -288,6 +264,7 @@ int64
 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	BatchIndexScan batch;
 	int64		ntids = 0;
 	ItemPointer heapTid;
 
@@ -296,29 +273,29 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	/* Each loop iteration performs another primitive index scan */
 	do
 	{
-		/* Fetch the first page & tuple */
-		if (_bt_first(scan, ForwardScanDirection))
+		/* Fetch the first batch */
+		if ((batch = _bt_first(scan, ForwardScanDirection)))
 		{
-			/* Save tuple ID, and continue scanning */
-			heapTid = &scan->xs_heaptid;
+			int			itemIndex = 0;
+
+			/* Save first tuple's TID */
+			heapTid = &batch->items[itemIndex].heapTid;
 			tbm_add_tuples(tbm, heapTid, 1, false);
 			ntids++;
 
 			for (;;)
 			{
-				/*
-				 * Advance to next tuple within page.  This is the same as the
-				 * easy case in _bt_next().
-				 */
-				if (++so->currPos.itemIndex > so->currPos.lastItem)
+				/* Advance to next TID within page-sized batch */
+				if (++itemIndex > batch->lastItem)
 				{
 					/* let _bt_next do the heavy lifting */
-					if (!_bt_next(scan, ForwardScanDirection))
+					itemIndex = 0;
+					batch = _bt_next(scan, ForwardScanDirection, batch);
+					if (!batch)
 						break;
 				}
 
-				/* Save tuple ID, and continue scanning */
-				heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
+				heapTid = &batch->items[itemIndex].heapTid;
 				tbm_add_tuples(tbm, heapTid, 1, false);
 				ntids++;
 			}
@@ -346,8 +323,6 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 
 	/* allocate private workspace */
 	so = palloc_object(BTScanOpaqueData);
-	BTScanPosInvalidate(so->currPos);
-	BTScanPosInvalidate(so->markPos);
 	if (scan->numberOfKeys > 0)
 		so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
 	else
@@ -361,19 +336,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	so->orderProcs = NULL;
 	so->arrayContext = NULL;
 
-	so->killedItems = NULL;		/* until needed */
-	so->numKilled = 0;
-
-	/*
-	 * We don't know yet whether the scan will be index-only, so we do not
-	 * allocate the tuple workspace arrays until btrescan.  However, we set up
-	 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
-	 */
-	so->currTuples = so->markTuples = NULL;
-
-	scan->xs_itupdesc = RelationGetDescr(rel);
-	scan->opaque = so;
+	scan->xs_itupdesc = RelationGetDescr(rel);
+	scan->opaque = so;
 
 	return scan;
 }
@@ -387,82 +351,37 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
-	/* we aren't holding any read locks, but gotta drop the pins */
-	if (BTScanPosIsValid(so->currPos))
-	{
-		/* Before leaving current page, deal with any killed items */
-		if (so->numKilled > 0)
-			_bt_killitems(scan);
-		BTScanPosUnpinIfPinned(so->currPos);
-		BTScanPosInvalidate(so->currPos);
-	}
-
-	/*
-	 * We prefer to eagerly drop leaf page pins before btgettuple returns.
-	 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
- * - * We cannot safely drop leaf page pins during index-only scans due to a - * race condition involving VACUUM setting pages all-visible in the VM. - * It's also unsafe for plain index scans that use a non-MVCC snapshot. - * - * When we drop pins eagerly, the mechanism that marks so->killedItems[] - * index tuples LP_DEAD has to deal with concurrent TID recycling races. - * The scheme used to detect unsafe TID recycling won't work when scanning - * unlogged relations (since it involves saving an affected page's LSN). - * Opt out of eager pin dropping during unlogged relation scans for now - * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting). - * - * Also opt out of dropping leaf page pins eagerly during bitmap scans. - * Pins cannot be held for more than an instant during bitmap scans either - * way, so we might as well avoid wasting cycles on acquiring page LSNs. - * - * See nbtree/README section on making concurrent TID recycling safe. - * - * Note: so->dropPin should never change across rescans. - */ - so->dropPin = (!scan->xs_want_itup && - IsMVCCSnapshot(scan->xs_snapshot) && - RelationNeedsWAL(scan->indexRelation) && - scan->heapRelation != NULL); - - so->markItemIndex = -1; - so->needPrimScan = false; - so->scanBehind = false; - so->oppositeDirCheck = false; - BTScanPosUnpinIfPinned(so->markPos); - BTScanPosInvalidate(so->markPos); - - /* - * Allocate tuple workspace arrays, if needed for an index-only scan and - * not already done in a previous rescan call. To save on palloc - * overhead, both workspaces are allocated as one palloc block; only this - * function and btendscan know that. - * - * NOTE: this data structure also makes it safe to return data from a - * "name" column, even though btree name_ops uses an underlying storage - * datatype of cstring. The risk there is that "name" is supposed to be - * padded to NAMEDATALEN, but the actual index tuple is probably shorter. - * However, since we only return data out of tuples sitting in the - * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some - * data out of the markTuples array --- running off the end of memory for - * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats - * adding special-case treatment for name_ops elsewhere. 
- */ - if (scan->xs_want_itup && so->currTuples == NULL) - { - so->currTuples = (char *) palloc(BLCKSZ * 2); - so->markTuples = so->currTuples + BLCKSZ; - } - /* * Reset the scan keys */ if (scankey && scan->numberOfKeys > 0) memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); + so->needPrimScan = false; + so->scanBehind = false; + so->oppositeDirCheck = false; so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ so->numArrayKeys = 0; /* ditto */ } +/* + * btfreebatch() -- Free batch resources, including its buffer pin + */ +void +btfreebatch(IndexScanDesc scan, BatchIndexScan batch) +{ + if (batch->numKilled > 0) + _bt_killitems(scan, batch); + + if (!scan->batchqueue->dropPin) + { + /* indexam_util_batch_unlock didn't unpin page earlier, do it now */ + ReleaseBuffer(batch->buf); + batch->buf = InvalidBuffer; + } + + indexam_util_batch_release(scan, batch); +} + /* * btendscan() -- close down a scan */ @@ -471,116 +390,48 @@ btendscan(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* we aren't holding any read locks, but gotta drop the pins */ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - } - - so->markItemIndex = -1; - BTScanPosUnpinIfPinned(so->markPos); - - /* No need to invalidate positions, the RAM is about to be freed. */ - /* Release storage */ if (so->keyData != NULL) pfree(so->keyData); /* so->arrayKeys and so->orderProcs are in arrayContext */ if (so->arrayContext != NULL) MemoryContextDelete(so->arrayContext); - if (so->killedItems != NULL) - pfree(so->killedItems); - if (so->currTuples != NULL) - pfree(so->currTuples); - /* so->markTuples should not be pfree'd, see btrescan */ pfree(so); } /* - * btmarkpos() -- save current scan position + * btposreset() -- invalidate scan's array keys */ void -btmarkpos(IndexScanDesc scan) +btposreset(IndexScanDesc scan, BatchIndexScan markbatch) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* There may be an old mark with a pin (but no lock). */ - BTScanPosUnpinIfPinned(so->markPos); + if (!so->numArrayKeys) + return; /* - * Just record the current itemIndex. If we later step to next page - * before releasing the marked position, _bt_steppage makes a full copy of - * the currPos struct in markPos. If (as often happens) the mark is moved - * before we leave the page, we don't have to do that work. + * Core system is about to restore a mark associated with a previously + * returned batch. Reset the scan's arrays to make all this safe. */ - if (BTScanPosIsValid(so->currPos)) - so->markItemIndex = so->currPos.itemIndex; + _bt_start_array_keys(scan, markbatch->dir); + + /* + * Core system will invalidate all other batches. + * + * Deal with this by unsetting needPrimScan as well as moreRight (or as + * well as moreLeft, when scanning backwards). That way, the next time + * _bt_next is called it will step to the right (or to the left). At that + * point _bt_readpage will restore the scan's arrays to elements that + * correctly track the next page's position in the index's key space. 
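+	 *
+	 * To summarize the overall flow (roughly): index_markpos() has
+	 * index_batch_mark_pos() retain the current batch; a later
+	 * index_restrpos() has index_batch_restore_pos() discard all other
+	 * batches, call btposreset() with the retained batch, and make that
+	 * batch current again, after which the next call to _bt_next() steps
+	 * off it in the scan's current direction.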
+ */ + if (ScanDirectionIsForward(markbatch->dir)) + markbatch->moreRight = true; else - { - BTScanPosInvalidate(so->markPos); - so->markItemIndex = -1; - } -} - -/* - * btrestrpos() -- restore scan to last saved position - */ -void -btrestrpos(IndexScanDesc scan) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - - if (so->markItemIndex >= 0) - { - /* - * The scan has never moved to a new page since the last mark. Just - * restore the itemIndex. - * - * NB: In this case we can't count on anything in so->markPos to be - * accurate. - */ - so->currPos.itemIndex = so->markItemIndex; - } - else - { - /* - * The scan moved to a new page after last mark or restore, and we are - * now restoring to the marked page. We aren't holding any read - * locks, but if we're still holding the pin for the current position, - * we must drop it. - */ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - } - - if (BTScanPosIsValid(so->markPos)) - { - /* bump pin on mark buffer for assignment to current buffer */ - if (BTScanPosIsPinned(so->markPos)) - IncrBufferRefCount(so->markPos.buf); - memcpy(&so->currPos, &so->markPos, - offsetof(BTScanPosData, items[1]) + - so->markPos.lastItem * sizeof(BTScanPosItem)); - if (so->currTuples) - memcpy(so->currTuples, so->markTuples, - so->markPos.nextTupleOffset); - /* Reset the scan's array keys (see _bt_steppage for why) */ - if (so->numArrayKeys) - { - _bt_start_array_keys(scan, so->currPos.dir); - so->needPrimScan = false; - } - } - else - BTScanPosInvalidate(so->currPos); - } + markbatch->moreLeft = true; + so->needPrimScan = false; + so->scanBehind = false; + so->oppositeDirCheck = false; } /* @@ -896,15 +747,6 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, *next_scan_page = InvalidBlockNumber; *last_curr_page = InvalidBlockNumber; - /* - * Reset so->currPos, and initialize moreLeft/moreRight such that the next - * call to _bt_readnextpage treats this backend similarly to a serial - * backend that steps from *last_curr_page to *next_scan_page (unless this - * backend's so->currPos is initialized by _bt_readfirstpage before then). 
- */ - BTScanPosInvalidate(so->currPos); - so->currPos.moreLeft = so->currPos.moreRight = true; - if (first) { /* @@ -1054,8 +896,6 @@ _bt_parallel_done(IndexScanDesc scan) BTParallelScanDesc btscan; bool status_changed = false; - Assert(!BTScanPosIsValid(so->currPos)); - /* Do nothing, for non-parallel scans */ if (parallel_scan == NULL) return; diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index aec710936..4fa593f2c 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -25,52 +25,21 @@ #include "utils/rel.h" -static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so); static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum); -static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so); -static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); -static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, - ScanDirection dir); -static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, - BlockNumber lastcurrblkno, ScanDirection dir, - bool seized); +static BatchIndexScan _bt_readfirstpage(IndexScanDesc scan, BatchIndexScan firstbatch, + OffsetNumber offnum, ScanDirection dir); +static BatchIndexScan _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, + BlockNumber lastcurrblkno, + ScanDirection dir, bool firstpage); static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno); -static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); - - -/* - * _bt_drop_lock_and_maybe_pin() - * - * Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too. - * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock. - */ -static inline void -_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so) -{ - if (!so->dropPin) - { - /* Just drop the lock (not the pin) */ - _bt_unlockbuf(rel, so->currPos.buf); - return; - } - - /* - * Drop both the lock and the pin. - * - * Have to set so->currPos.lsn so that _bt_killitems has a way to detect - * when concurrent heap TID recycling by VACUUM might have taken place. - */ - Assert(RelationNeedsWAL(rel)); - so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); - _bt_relbuf(rel, so->currPos.buf); - so->currPos.buf = InvalidBuffer; -} +static BatchIndexScan _bt_endpoint(IndexScanDesc scan, ScanDirection dir, + BatchIndexScan firstbatch); /* * _bt_search() -- Search the tree for a particular scankey, @@ -860,20 +829,16 @@ _bt_compare(Relation rel, * conditions, and the tree ordering. We find the first item (or, * if backwards scan, the last item) in the tree that satisfies the * qualifications in the scan key. On success exit, data about the - * matching tuple(s) on the page has been loaded into so->currPos. We'll - * drop all locks and hold onto a pin on page's buffer, except during - * so->dropPin scans, when we drop both the lock and the pin. - * _bt_returnitem sets the next item to return to scan on success exit. + * matching tuple(s) on the page has been loaded into the returned batch. * - * If there are no matching items in the index, we return false, with no - * pins or locks held. so->currPos will remain invalid. + * If there are no matching items in the index, we just return NULL. 
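For illustration only (not part of this patch): the helper that replaces the removed _bt_drop_lock_and_maybe_pin(), indexam_util_batch_unlock(), is called from _bt_readfirstpage and _bt_readnextpage further below. Its real implementation is outside this excerpt; judging from the removed nbtree code and from the batch fields later used by _bt_killitems(), it presumably behaves along these lines:

void
indexam_util_batch_unlock(IndexScanDesc scan, BatchIndexScan batch)
{
	if (!scan->batchqueue->dropPin)
	{
		/* Keep the pin as a TID-recycling interlock; just drop the lock */
		LockBuffer(batch->buf, BUFFER_LOCK_UNLOCK);
		return;
	}

	/*
	 * Drop both the lock and the pin.  Save the page LSN first, so that
	 * amfreebatch (via _bt_killitems) can detect concurrent TID recycling
	 * by VACUUM before setting any LP_DEAD bits.
	 */
	batch->lsn = BufferGetLSNAtomic(batch->buf);
	UnlockReleaseBuffer(batch->buf);
	batch->buf = InvalidBuffer;
}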
* * Note that scan->keyData[], and the so->keyData[] scankey built from it, * are both search-type scankeys (see nbtree/README for more about this). * Within this routine, we build a temporary insertion-type scankey to use * in locating the scan start position. */ -bool +BatchIndexScan _bt_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; @@ -887,8 +852,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) StrategyNumber strat_total = InvalidStrategy; BlockNumber blkno = InvalidBlockNumber, lastcurrblkno; - - Assert(!BTScanPosIsValid(so->currPos)); + BatchIndexScan firstbatch; /* * Examine the scan keys and eliminate any redundant keys; also mark the @@ -913,7 +877,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) */ if (scan->parallel_scan != NULL && !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true)) - return false; + return false; /* definitely done (so->needPrimScan is unset) */ /* * Initialize the scan's arrays (if any) for the current scan direction @@ -930,14 +894,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * _bt_readnextpage releases the scan for us (not _bt_readfirstpage). */ Assert(scan->parallel_scan != NULL); - Assert(!so->needPrimScan); - Assert(blkno != P_NONE); - if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true)) - return false; - - _bt_returnitem(scan, so); - return true; + return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, true); } /* @@ -1229,6 +1187,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } } + /* Allocate space for first batch */ + firstbatch = indexam_util_batch_alloc(scan, MaxTIDsPerBTreePage, + scan->xs_want_itup); + /* * If we found no usable boundary keys, we have to start from one end of * the tree. Walk down that edge to the first or last key, and scan from @@ -1237,7 +1199,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * Note: calls _bt_readfirstpage for us, which releases the parallel scan. */ if (keysz == 0) - return _bt_endpoint(scan, dir); + return _bt_endpoint(scan, dir, firstbatch); /* * We want to start the scan somewhere within the index. Set up an @@ -1505,12 +1467,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * position ourselves on the target leaf page. */ Assert(ScanDirectionIsBackward(dir) == inskey.backward); - stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ); + stack = _bt_search(rel, NULL, &inskey, &firstbatch->buf, BT_READ); /* don't need to keep the stack around... */ _bt_freestack(stack); - if (!BufferIsValid(so->currPos.buf)) + if (!BufferIsValid(firstbatch->buf)) { Assert(!so->needPrimScan); @@ -1526,11 +1488,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (IsolationIsSerializable()) { PredicateLockRelation(rel, scan->xs_snapshot); - stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ); + stack = _bt_search(rel, NULL, &inskey, &firstbatch->buf, BT_READ); _bt_freestack(stack); } - if (!BufferIsValid(so->currPos.buf)) + if (!BufferIsValid(firstbatch->buf)) { _bt_parallel_done(scan); return false; @@ -1538,11 +1500,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* position to the precise item on the page */ - offnum = _bt_binsrch(rel, &inskey, so->currPos.buf); + offnum = _bt_binsrch(rel, &inskey, firstbatch->buf); /* * Now load data from the first page of the scan (usually the page - * currently in so->currPos.buf). + * currently in firstbatch.buf). * * If inskey.nextkey = false and inskey.backward = false, offnum is * positioned at the first non-pivot tuple >= inskey.scankeys. 
@@ -1560,168 +1522,69 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * for the page. For example, when inskey is both < the leaf page's high * key and > all of its non-pivot tuples, offnum will be "maxoff + 1". */ - if (!_bt_readfirstpage(scan, offnum, dir)) - return false; - - _bt_returnitem(scan, so); - return true; + return _bt_readfirstpage(scan, firstbatch, offnum, dir); } /* * _bt_next() -- Get the next item in a scan. * - * On entry, so->currPos describes the current page, which may be pinned - * but is not locked, and so->currPos.itemIndex identifies which item was - * previously returned. + * On entry, priorbatch describes the batch that was last returned by + * btgetbatch. We'll use the prior batch's positioning information to + * decide which page to read next. * - * On success exit, so->currPos is updated as needed, and _bt_returnitem - * sets the next item to return to the scan. so->currPos remains valid. + * On success exit, returns the next batch. There must be at least one + * matching tuple on any returned batch (else we'd just return NULL). * - * On failure exit (no more tuples), we invalidate so->currPos. It'll - * still be possible for the scan to return tuples by changing direction, - * though we'll need to call _bt_first anew in that other direction. + * On failure exit (no more tuples), we return NULL. It'll still be + * possible for the scan to return tuples by changing direction, though + * we'll need to call _bt_first anew in that other direction. */ -bool -_bt_next(IndexScanDesc scan, ScanDirection dir) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - - Assert(BTScanPosIsValid(so->currPos)); - - /* - * Advance to next tuple on current page; or if there's no more, try to - * step to the next page with data. - */ - if (ScanDirectionIsForward(dir)) - { - if (++so->currPos.itemIndex > so->currPos.lastItem) - { - if (!_bt_steppage(scan, dir)) - return false; - } - } - else - { - if (--so->currPos.itemIndex < so->currPos.firstItem) - { - if (!_bt_steppage(scan, dir)) - return false; - } - } - - _bt_returnitem(scan, so); - return true; -} - -/* - * Return the index item from so->currPos.items[so->currPos.itemIndex] to the - * index scan by setting the relevant fields in caller's index scan descriptor - */ -static inline void -_bt_returnitem(IndexScanDesc scan, BTScanOpaque so) -{ - BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex]; - - /* Most recent _bt_readpage must have succeeded */ - Assert(BTScanPosIsValid(so->currPos)); - Assert(so->currPos.itemIndex >= so->currPos.firstItem); - Assert(so->currPos.itemIndex <= so->currPos.lastItem); - - /* Return next item, per amgettuple contract */ - scan->xs_heaptid = currItem->heapTid; - if (so->currTuples) - scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); -} - -/* - * _bt_steppage() -- Step to next page containing valid data for scan - * - * Wrapper on _bt_readnextpage that performs final steps for the current page. - * - * On entry, so->currPos must be valid. Its buffer will be pinned, though - * never locked. (Actually, when so->dropPin there won't even be a pin held, - * though so->currPos.currPage must still be set to a valid block number.) 
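As a sketch of how _bt_first and _bt_next are now paired by the AM-level entry point: btgetbatch itself is outside this excerpt, so the structure below is an assumption built only from the two routines' contracts and the so->needPrimScan comments (array-key "primitive scan" restarts go back through _bt_first).

BatchIndexScan
btgetbatch_sketch(IndexScanDesc scan, BatchIndexScan priorbatch,
				  ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BatchIndexScan batch;

	do
	{
		if (priorbatch == NULL)
			batch = _bt_first(scan, dir);	/* fresh descent of the tree */
		else
			batch = _bt_next(scan, dir, priorbatch);	/* step from prior page */

		/* any retry is a new primitive scan, which starts in _bt_first */
		priorbatch = NULL;
	} while (batch == NULL && so->needPrimScan);

	return batch;
}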
- */ -static bool -_bt_steppage(IndexScanDesc scan, ScanDirection dir) +BatchIndexScan +_bt_next(IndexScanDesc scan, ScanDirection dir, BatchIndexScan priorbatch) { BTScanOpaque so = (BTScanOpaque) scan->opaque; BlockNumber blkno, lastcurrblkno; - Assert(BTScanPosIsValid(so->currPos)); - - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - - /* - * Before we modify currPos, make a copy of the page data if there was a - * mark position that needs it. - */ - if (so->markItemIndex >= 0) - { - /* bump pin on current buffer for assignment to mark buffer */ - if (BTScanPosIsPinned(so->currPos)) - IncrBufferRefCount(so->currPos.buf); - memcpy(&so->markPos, &so->currPos, - offsetof(BTScanPosData, items[1]) + - so->currPos.lastItem * sizeof(BTScanPosItem)); - if (so->markTuples) - memcpy(so->markTuples, so->currTuples, - so->currPos.nextTupleOffset); - so->markPos.itemIndex = so->markItemIndex; - so->markItemIndex = -1; - - /* - * If we're just about to start the next primitive index scan - * (possible with a scan that has arrays keys, and needs to skip to - * continue in the current scan direction), moreLeft/moreRight only - * indicate the end of the current primitive index scan. They must - * never be taken to indicate that the top-level index scan has ended - * (that would be wrong). - * - * We could handle this case by treating the current array keys as - * markPos state. But depending on the current array state like this - * would add complexity. Instead, we just unset markPos's copy of - * moreRight or moreLeft (whichever might be affected), while making - * btrestrpos reset the scan's arrays to their initial scan positions. - * In effect, btrestrpos leaves advancing the arrays up to the first - * _bt_readpage call (that takes place after it has restored markPos). - */ - if (so->needPrimScan) - { - if (ScanDirectionIsForward(so->currPos.dir)) - so->markPos.moreRight = true; - else - so->markPos.moreLeft = true; - } - - /* mark/restore not supported by parallel scans */ - Assert(!scan->parallel_scan); - } - - BTScanPosUnpinIfPinned(so->currPos); + Assert(BlockNumberIsValid(priorbatch->currPage)); /* Walk to the next page with data */ if (ScanDirectionIsForward(dir)) - blkno = so->currPos.nextPage; + blkno = priorbatch->nextPage; else - blkno = so->currPos.prevPage; - lastcurrblkno = so->currPos.currPage; + blkno = priorbatch->prevPage; + lastcurrblkno = priorbatch->currPage; /* * Cancel primitive index scans that were scheduled when the call to - * _bt_readpage for currPos happened to use the opposite direction to the - * one that we're stepping in now. (It's okay to leave the scan's array - * keys as-is, since the next _bt_readpage will advance them.) + * _bt_readpage for pos happened to use the opposite direction to the one + * that we're stepping in now. (It's okay to leave the scan's array keys + * as-is, since the next _bt_readpage will advance them.) */ - if (so->currPos.dir != dir) + if (priorbatch->dir != dir) so->needPrimScan = false; + if (blkno == P_NONE || + (ScanDirectionIsForward(dir) ? 
+ !priorbatch->moreRight : !priorbatch->moreLeft)) + { + /* + * priorbatch _bt_readpage call ended scan in this direction (though + * if so->needPrimScan was set the scan will continue in _bt_first) + */ + _bt_parallel_done(scan); + return NULL; + } + + /* parallel scan must seize the scan to get next blkno */ + if (scan->parallel_scan != NULL && + !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) + return NULL; /* done iff so->needPrimScan wasn't set */ + return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false); } + /* * _bt_readfirstpage() -- Read first page containing valid data for _bt_first * @@ -1731,73 +1594,90 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * to stop the scan on this page by calling _bt_checkkeys against the high * key. See _bt_readpage for full details. * - * On entry, so->currPos must be pinned and locked (so offnum stays valid). + * On entry, firstbatch must be pinned and locked (so offnum stays valid). * Parallel scan callers must have seized the scan before calling here. * - * On exit, we'll have updated so->currPos and retained locks and pins + * On exit, we'll have updated firstbatch and retained locks and pins * according to the same rules as those laid out for _bt_readnextpage exit. - * Like _bt_readnextpage, our return value indicates if there are any matching - * records in the given direction. * * We always release the scan for a parallel scan caller, regardless of * success or failure; we'll call _bt_parallel_release as soon as possible. */ -static bool -_bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) +static BatchIndexScan +_bt_readfirstpage(IndexScanDesc scan, BatchIndexScan firstbatch, + OffsetNumber offnum, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + BlockNumber blkno, + lastcurrblkno; - so->numKilled = 0; /* just paranoia */ - so->markItemIndex = -1; /* ditto */ - - /* Initialize so->currPos for the first page (page in so->currPos.buf) */ + /* Initialize firstbatch's position for the first page */ if (so->needPrimScan) { Assert(so->numArrayKeys); - so->currPos.moreLeft = true; - so->currPos.moreRight = true; + firstbatch->moreLeft = true; + firstbatch->moreRight = true; so->needPrimScan = false; } else if (ScanDirectionIsForward(dir)) { - so->currPos.moreLeft = false; - so->currPos.moreRight = true; + firstbatch->moreLeft = false; + firstbatch->moreRight = true; } else { - so->currPos.moreLeft = true; - so->currPos.moreRight = false; + firstbatch->moreLeft = true; + firstbatch->moreRight = false; } /* * Attempt to load matching tuples from the first page. * - * Note that _bt_readpage will finish initializing the so->currPos fields. + * Note that _bt_readpage will finish initializing the firstbatch fields. * _bt_readpage also releases parallel scan (even when it returns false). */ - if (_bt_readpage(scan, dir, offnum, true)) + if (_bt_readpage(scan, firstbatch, dir, offnum, true)) { - Relation rel = scan->indexRelation; - - /* - * _bt_readpage succeeded. Drop the lock (and maybe the pin) on - * so->currPos.buf in preparation for btgettuple returning tuples. 
- */ - Assert(BTScanPosIsPinned(so->currPos)); - _bt_drop_lock_and_maybe_pin(rel, so); - return true; + /* _bt_readpage saved one or more matches in firstbatch.items[] */ + indexam_util_batch_unlock(scan, firstbatch); + return firstbatch; } - /* There's no actually-matching data on the page in so->currPos.buf */ - _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + /* There's no actually-matching data on the page */ + _bt_relbuf(scan->indexRelation, firstbatch->buf); + firstbatch->buf = InvalidBuffer; - /* Call _bt_readnextpage using its _bt_steppage wrapper function */ - if (!_bt_steppage(scan, dir)) - return false; + /* Walk to the next page with data */ + if (ScanDirectionIsForward(dir)) + blkno = firstbatch->nextPage; + else + blkno = firstbatch->prevPage; + lastcurrblkno = firstbatch->currPage; - /* _bt_readpage for a later page (now in so->currPos) succeeded */ - return true; + Assert(firstbatch->dir == dir); + + if (blkno == P_NONE || + (ScanDirectionIsForward(dir) ? + !firstbatch->moreRight : !firstbatch->moreLeft)) + { + /* + * firstbatch _bt_readpage call ended scan in this direction (though + * if so->needPrimScan was set the scan will continue in _bt_first) + */ + indexam_util_batch_release(scan, firstbatch); + _bt_parallel_done(scan); + return NULL; + } + + indexam_util_batch_release(scan, firstbatch); + + /* parallel scan must seize the scan to get next blkno */ + if (scan->parallel_scan != NULL && + !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) + return NULL; /* done iff so->needPrimScan wasn't set */ + + return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false); } /* @@ -1807,102 +1687,69 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) * previously-saved right link or left link. lastcurrblkno is the page that * was current at the point where the blkno link was saved, which we use to * reason about concurrent page splits/page deletions during backwards scans. - * In the common case where seized=false, blkno is either so->currPos.nextPage - * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage. + * blkno is the prior scan position's nextPage or prevPage (depending on scan + * direction), and lastcurrblkno is the prior position's currPage. * - * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must - * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno - * won't need to be read again in almost all cases). Parallel scan callers - * that seized the scan before calling here should pass seized=true; such a - * caller's blkno and lastcurrblkno arguments come from the seized scan. - * seized=false callers just pass us the blkno/lastcurrblkno taken from their - * so->currPos, which (along with so->currPos itself) can be used to end the - * scan. A seized=false caller's blkno can never be assumed to be the page - * that must be read next during a parallel scan, though. We must figure that - * part out for ourselves by seizing the scan (the correct page to read might - * already be beyond the seized=false caller's blkno during a parallel scan, - * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE, - * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset). + * On entry, no page should be locked by caller. * - * On success exit, so->currPos is updated to contain data from the next - * interesting page, and we return true. 
We hold a pin on the buffer on - * success exit (except during so->dropPin index scans, when we drop the pin - * eagerly to avoid blocking VACUUM). + * On success exit, returns scan batch containing data from the next + * interesting page. We hold a pin on the buffer on success exit (except + * during dropPin plain index scans, when we drop the pin eagerly to avoid + * blocking VACUUM). If there are no more matching records in the given + * direction, we just return NULL. * - * If there are no more matching records in the given direction, we invalidate - * so->currPos (while ensuring it retains no locks or pins), and return false. - * - * We always release the scan for a parallel scan caller, regardless of - * success or failure; we'll call _bt_parallel_release as soon as possible. + * Parallel scan callers must seize the scan before calling here. blkno and + * lastcurrblkno should come from the seized scan. We'll release the scan as + * soon as possible. */ -static bool +static BatchIndexScan _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, - BlockNumber lastcurrblkno, ScanDirection dir, bool seized) + BlockNumber lastcurrblkno, ScanDirection dir, bool firstpage) { Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; + BatchIndexScan newbatch; - Assert(so->currPos.currPage == lastcurrblkno || seized); - Assert(!(blkno == P_NONE && seized)); - Assert(!BTScanPosIsPinned(so->currPos)); + /* Allocate space for next batch */ + newbatch = indexam_util_batch_alloc(scan, MaxTIDsPerBTreePage, + scan->xs_want_itup); /* - * Remember that the scan already read lastcurrblkno, a page to the left - * of blkno (or remember reading a page to the right, for backwards scans) + * pos is the first valid page to the right (or to the left) of + * lastcurrblkno. Also provisionally assume that there'll be another page + * we'll need to the right (or to the left) ahead of _bt_readpage call. */ - if (ScanDirectionIsForward(dir)) - so->currPos.moreLeft = true; - else - so->currPos.moreRight = true; + newbatch->moreLeft = true; + newbatch->moreRight = true; for (;;) { Page page; BTPageOpaque opaque; - if (blkno == P_NONE || - (ScanDirectionIsForward(dir) ? 
- !so->currPos.moreRight : !so->currPos.moreLeft)) - { - /* most recent _bt_readpage call (for lastcurrblkno) ended scan */ - Assert(so->currPos.currPage == lastcurrblkno && !seized); - BTScanPosInvalidate(so->currPos); - _bt_parallel_done(scan); /* iff !so->needPrimScan */ - return false; - } - - Assert(!so->needPrimScan); - - /* parallel scan must never actually visit so->currPos blkno */ - if (!seized && scan->parallel_scan != NULL && - !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) - { - /* whole scan is now done (or another primitive scan required) */ - BTScanPosInvalidate(so->currPos); - return false; - } + Assert(!((BTScanOpaque) scan->opaque)->needPrimScan); + Assert(blkno != P_NONE && lastcurrblkno != P_NONE); if (ScanDirectionIsForward(dir)) { /* read blkno, but check for interrupts first */ CHECK_FOR_INTERRUPTS(); - so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); + newbatch->buf = _bt_getbuf(rel, blkno, BT_READ); } else { /* read blkno, avoiding race (also checks for interrupts) */ - so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno, - lastcurrblkno); - if (so->currPos.buf == InvalidBuffer) + newbatch->buf = _bt_lock_and_validate_left(rel, &blkno, + lastcurrblkno); + if (newbatch->buf == InvalidBuffer) { /* must have been a concurrent deletion of leftmost page */ - BTScanPosInvalidate(so->currPos); _bt_parallel_done(scan); - return false; + indexam_util_batch_release(scan, newbatch); + return NULL; } } - page = BufferGetPage(so->currPos.buf); + page = BufferGetPage(newbatch->buf); opaque = BTPageGetOpaque(page); lastcurrblkno = blkno; if (likely(!P_IGNORE(opaque))) @@ -1910,17 +1757,17 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, /* see if there are any matches on this page */ if (ScanDirectionIsForward(dir)) { - /* note that this will clear moreRight if we can stop */ - if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized)) + if (_bt_readpage(scan, newbatch, dir, + P_FIRSTDATAKEY(opaque), firstpage)) break; - blkno = so->currPos.nextPage; + blkno = newbatch->nextPage; } else { - /* note that this will clear moreLeft if we can stop */ - if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized)) + if (_bt_readpage(scan, newbatch, dir, + PageGetMaxOffsetNumber(page), firstpage)) break; - blkno = so->currPos.prevPage; + blkno = newbatch->prevPage; } } else @@ -1935,19 +1782,39 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, } /* no matching tuples on this page */ - _bt_relbuf(rel, so->currPos.buf); - seized = false; /* released by _bt_readpage (or by us) */ + _bt_relbuf(rel, newbatch->buf); + newbatch->buf = InvalidBuffer; + + /* Continue the scan in this direction? */ + if (blkno == P_NONE || + (ScanDirectionIsForward(dir) ? + !newbatch->moreRight : !newbatch->moreLeft)) + { + /* + * blkno _bt_readpage call ended scan in this direction (though if + * so->needPrimScan was set the scan will continue in _bt_first) + */ + _bt_parallel_done(scan); + indexam_util_batch_release(scan, newbatch); + return NULL; + } + + /* parallel scan must seize the scan to get next blkno */ + if (scan->parallel_scan != NULL && + !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) + { + indexam_util_batch_release(scan, newbatch); + return NULL; /* done iff so->needPrimScan wasn't set */ + } + + firstpage = false; /* next page cannot be first */ } - /* - * _bt_readpage succeeded. Drop the lock (and maybe the pin) on - * so->currPos.buf in preparation for btgettuple returning tuples. 
- */ - Assert(so->currPos.currPage == blkno); - Assert(BTScanPosIsPinned(so->currPos)); - _bt_drop_lock_and_maybe_pin(rel, so); + /* _bt_readpage saved one or more matches in newbatch.items[] */ + Assert(newbatch->currPage == blkno); + indexam_util_batch_unlock(scan, newbatch); - return true; + return newbatch; } /* @@ -2170,25 +2037,23 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) * Parallel scan callers must have seized the scan before calling here. * Exit conditions are the same as for _bt_first(). */ -static bool -_bt_endpoint(IndexScanDesc scan, ScanDirection dir) +static BatchIndexScan +_bt_endpoint(IndexScanDesc scan, ScanDirection dir, BatchIndexScan firstbatch) { Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; Page page; BTPageOpaque opaque; OffsetNumber start; - Assert(!BTScanPosIsValid(so->currPos)); - Assert(!so->needPrimScan); + Assert(!((BTScanOpaque) scan->opaque)->needPrimScan); /* * Scan down to the leftmost or rightmost leaf page. This is a simplified * version of _bt_search(). */ - so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir)); + firstbatch->buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir)); - if (!BufferIsValid(so->currPos.buf)) + if (!BufferIsValid(firstbatch->buf)) { /* * Empty index. Lock the whole relation, as nothing finer to lock @@ -2199,7 +2064,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) return false; } - page = BufferGetPage(so->currPos.buf); + page = BufferGetPage(firstbatch->buf); opaque = BTPageGetOpaque(page); Assert(P_ISLEAF(opaque)); @@ -2225,9 +2090,5 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) /* * Now load data from the first page of the scan. */ - if (!_bt_readfirstpage(scan, start, dir)) - return false; - - _bt_returnitem(scan, so); - return true; + return _bt_readfirstpage(scan, firstbatch, start, dir); } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index a451d48e1..6ec563c95 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -176,88 +176,84 @@ _bt_compare_int(const void *va, const void *vb) * _bt_killitems - set LP_DEAD state for items an indexscan caller has * told us were killed * - * scan->opaque, referenced locally through so, contains information about the - * current page and killed tuples thereon (generally, this should only be - * called if so->numKilled > 0). + * The batch parameter contains information about the current page and killed + * tuples thereon (this should only be called if batch->numKilled > 0). * - * Caller should not have a lock on the so->currPos page, but must hold a - * buffer pin when !so->dropPin. When we return, it still won't be locked. - * It'll continue to hold whatever pins were held before calling here. + * Caller should not have a lock on the batch position's page, but must hold a + * buffer pin when !dropPin. When we return, it still won't be locked. It'll + * continue to hold whatever pins were held before calling here. * * We match items by heap TID before assuming they are the right ones to set * LP_DEAD. If the scan is one that holds a buffer pin on the target page * continuously from initially reading the items until applying this function - * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the + * (if it is a !dropPin scan), VACUUM cannot have deleted any items on the * page, so the page's TIDs can't have been recycled by now. 
There's no risk * that we'll confuse a new index tuple that happens to use a recycled TID * with a now-removed tuple with the same TID (that used to be on this same * page). We can't rely on that during scans that drop buffer pins eagerly - * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on + * (i.e. dropPin scans), though, so we must condition setting LP_DEAD bits on * the page LSN having not changed since back when _bt_readpage saw the page. * We totally give up on setting LP_DEAD bits when the page LSN changed. * - * We give up much less often during !so->dropPin scans, but it still happens. + * We tend to give up less often during !dropPin scans, but it still happens. * We cope with cases where items have moved right due to insertions. If an * item has moved off the current page due to a split, we'll fail to find it * and just give up on it. */ void -_bt_killitems(IndexScanDesc scan) +_bt_killitems(IndexScanDesc scan, BatchIndexScan batch) { Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; Page page; BTPageOpaque opaque; OffsetNumber minoff; OffsetNumber maxoff; - int numKilled = so->numKilled; + int numKilled = batch->numKilled; bool killedsomething = false; Buffer buf; Assert(numKilled > 0); - Assert(BTScanPosIsValid(so->currPos)); + Assert(BlockNumberIsValid(batch->currPage)); Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */ - /* Always invalidate so->killedItems[] before leaving so->currPos */ - so->numKilled = 0; + /* Always invalidate batch->killedItems[] before freeing batch */ + batch->numKilled = 0; /* - * We need to iterate through so->killedItems[] in leaf page order; the + * We need to iterate through batch.killedItems[] in leaf page order; the * loop below expects this (when marking posting list tuples, at least). - * so->killedItems[] is now in whatever order the scan returned items in. + * killedItems[] is now in whatever order the scan returned items in. * Scrollable cursor scans might have even saved the same item/TID twice. * - * Sort and unique-ify so->killedItems[] to deal with all this. + * Sort and unique-ify batch.killedItems[] to deal with all this. */ if (numKilled > 1) { - qsort(so->killedItems, numKilled, sizeof(int), _bt_compare_int); - numKilled = qunique(so->killedItems, numKilled, sizeof(int), + qsort(batch->killedItems, numKilled, sizeof(int), _bt_compare_int); + numKilled = qunique(batch->killedItems, numKilled, sizeof(int), _bt_compare_int); } - if (!so->dropPin) + if (!scan->batchqueue->dropPin) { /* * We have held the pin on this page since we read the index tuples, * so all we need to do is lock it. The pin will have prevented * concurrent VACUUMs from recycling any of the TIDs on the page. 
*/ - Assert(BTScanPosIsPinned(so->currPos)); - buf = so->currPos.buf; + buf = batch->buf; _bt_lockbuf(rel, buf, BT_READ); } else { XLogRecPtr latestlsn; - Assert(!BTScanPosIsPinned(so->currPos)); Assert(RelationNeedsWAL(rel)); - buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); + buf = _bt_getbuf(rel, batch->currPage, BT_READ); latestlsn = BufferGetLSNAtomic(buf); - Assert(so->currPos.lsn <= latestlsn); - if (so->currPos.lsn != latestlsn) + Assert(batch->lsn <= latestlsn); + if (batch->lsn != latestlsn) { /* Modified, give up on hinting */ _bt_relbuf(rel, buf); @@ -272,17 +268,16 @@ _bt_killitems(IndexScanDesc scan) minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); - /* Iterate through so->killedItems[] in leaf page order */ + /* Iterate through batch->killedItems[] in leaf page order */ for (int i = 0; i < numKilled; i++) { - int itemIndex = so->killedItems[i]; - BTScanPosItem *kitem = &so->currPos.items[itemIndex]; + int itemIndex = batch->killedItems[i]; + BatchMatchingItem *kitem = &batch->items[itemIndex]; OffsetNumber offnum = kitem->indexOffset; - Assert(itemIndex >= so->currPos.firstItem && - itemIndex <= so->currPos.lastItem); + Assert(itemIndex >= batch->firstItem && itemIndex <= batch->lastItem); Assert(i == 0 || - offnum >= so->currPos.items[so->killedItems[i - 1]].indexOffset); + offnum >= batch->items[batch->killedItems[i - 1]].indexOffset); if (offnum < minoff) continue; /* pure paranoia */ @@ -300,7 +295,7 @@ _bt_killitems(IndexScanDesc scan) /* * Note that the page may have been modified in almost any way - * since we first read it (in the !so->dropPin case), so it's + * since we first read it (in the !dropPin case), so it's * possible that this posting list tuple wasn't a posting list * tuple when we first encountered its heap TIDs. */ @@ -316,7 +311,8 @@ _bt_killitems(IndexScanDesc scan) * though only in the common case where the page can't * have been concurrently modified */ - Assert(kitem->indexOffset == offnum || !so->dropPin); + Assert(kitem->indexOffset == offnum || + !scan->batchqueue->dropPin); /* * Read-ahead to later kitems here. @@ -333,7 +329,7 @@ _bt_killitems(IndexScanDesc scan) * correctly -- posting tuple still gets killed). 
*/ if (pi < numKilled) - kitem = &so->currPos.items[so->killedItems[pi++]]; + kitem = &batch->items[batch->killedItems[pi++]]; } /* @@ -383,7 +379,7 @@ _bt_killitems(IndexScanDesc scan) MarkBufferDirtyHint(buf, true); } - if (!so->dropPin) + if (!scan->batchqueue->dropPin) _bt_unlockbuf(rel, buf); else _bt_relbuf(rel, buf); diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index a60ec85e8..b0a6e0974 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -88,10 +88,11 @@ spghandler(PG_FUNCTION_ARGS) amroutine->ambeginscan = spgbeginscan; amroutine->amrescan = spgrescan; amroutine->amgettuple = spggettuple; + amroutine->amgetbatch = NULL; + amroutine->amfreebatch = NULL; amroutine->amgetbitmap = spggetbitmap; amroutine->amendscan = spgendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 1e099febd..a7341bcaf 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -248,7 +248,7 @@ table_index_fetch_tuple_check(Relation rel, bool found; slot = table_slot_create(rel, NULL); - scan = table_index_fetch_begin(rel); + scan = table_index_fetch_begin(rel, NULL); found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, all_dead); table_index_fetch_end(scan); diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 3497a8221..8a5d79a27 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -106,7 +106,8 @@ unique_key_recheck(PG_FUNCTION_ARGS) */ tmptid = checktid; { - IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation); + IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation, + NULL); bool call_again = false; if (!table_index_fetch_tuple(scan, &tmptid, SnapshotSelf, slot, diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index d9cccb6ac..3ac91f12e 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -880,7 +880,7 @@ DefineIndex(Oid tableId, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("access method \"%s\" does not support multicolumn indexes", accessMethodName))); - if (exclusion && amRoutine->amgettuple == NULL) + if (exclusion && amRoutine->amgettuple == NULL && amRoutine->amgetbatch == NULL) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("access method \"%s\" does not support exclusion constraints", diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 1d0e8ad57..ac337d900 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -428,7 +428,7 @@ ExecSupportsMarkRestore(Path *pathnode) case T_IndexOnlyScan: /* - * Not all index types support mark/restore. 
+ * Not all index types support restoring a mark */ return castNode(IndexPath, pathnode)->indexinfo->amcanmarkpos; diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 0b3a31f17..891a5b2c9 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -816,10 +816,12 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index, retry: conflict = false; found_self = false; - index_scan = index_beginscan(heap, index, &DirtySnapshot, NULL, indnkeyatts, 0); + index_scan = index_beginscan(heap, index, NULL, &DirtySnapshot, NULL, + indnkeyatts, 0); index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0); - while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot)) + while (table_index_getnext_slot(index_scan, ForwardScanDirection, + existing_slot)) { TransactionId xwait; XLTW_Oper reason_wait; diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 860f79f9c..9236da9b2 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -204,7 +204,7 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot); /* Start an index scan. */ - scan = index_beginscan(rel, idxrel, &snap, NULL, skey_attoff, 0); + scan = index_beginscan(rel, idxrel, NULL, &snap, NULL, skey_attoff, 0); retry: found = false; @@ -212,7 +212,7 @@ retry: index_rescan(scan, skey, skey_attoff, NULL, 0); /* Try to find the tuple */ - while (index_getnext_slot(scan, ForwardScanDirection, outslot)) + while (table_index_getnext_slot(scan, ForwardScanDirection, outslot)) { /* * Avoid expensive equality check if the index is primary key or @@ -665,12 +665,12 @@ RelationFindDeletedTupleInfoByIndex(Relation rel, Oid idxoid, * not yet committed or those just committed prior to the scan are * excluded in update_most_recent_deletion_info(). 
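The executor-side callers above now loop over table_index_getnext_slot() rather than index_getnext_slot(), handing control of the index scan to the table AM. A minimal sketch of that shim follows; the TableAmRoutine member name used here is a placeholder, since the actual declaration (in tableam.h) is outside this excerpt.

static inline bool
table_index_getnext_slot(IndexScanDesc scan, ScanDirection direction,
						 TupleTableSlot *slot)
{
	/*
	 * Let the table AM drive the index scan: it consumes index batches,
	 * may batch or prefetch its own block accesses, and returns visible
	 * rows one slot at a time.  "index_getnext_slot" is an assumed name.
	 */
	return scan->heapRelation->rd_tableam->index_getnext_slot(scan, direction,
															  slot);
}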
*/ - scan = index_beginscan(rel, idxrel, SnapshotAny, NULL, skey_attoff, 0); + scan = index_beginscan(rel, idxrel, NULL, SnapshotAny, NULL, skey_attoff, 0); index_rescan(scan, skey, skey_attoff, NULL, 0); /* Try to find the tuple */ - while (index_getnext_slot(scan, ForwardScanDirection, scanslot)) + while (table_index_getnext_slot(scan, ForwardScanDirection, scanslot)) { /* * Avoid expensive equality check if the index is primary key or diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 6bea42f12..96778abe3 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -34,14 +34,12 @@ #include "access/relscan.h" #include "access/tableam.h" #include "access/tupdesc.h" -#include "access/visibilitymap.h" #include "catalog/pg_type.h" #include "executor/executor.h" #include "executor/nodeIndexonlyscan.h" #include "executor/nodeIndexscan.h" #include "miscadmin.h" #include "storage/bufmgr.h" -#include "storage/predicate.h" #include "utils/builtins.h" #include "utils/rel.h" @@ -65,7 +63,6 @@ IndexOnlyNext(IndexOnlyScanState *node) ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; - ItemPointer tid; /* * extract necessary information from index scan node @@ -91,6 +88,7 @@ IndexOnlyNext(IndexOnlyScanState *node) */ scandesc = index_beginscan(node->ss.ss_currentRelation, node->ioss_RelationDesc, + node->ioss_TableSlot, estate->es_snapshot, &node->ioss_Instrument, node->ioss_NumScanKeys, @@ -101,7 +99,6 @@ IndexOnlyNext(IndexOnlyScanState *node) /* Set it up for index-only scan */ node->ioss_ScanDesc->xs_want_itup = true; - node->ioss_VMBuffer = InvalidBuffer; /* * If no run-time keys to calculate or they are ready, go ahead and @@ -118,77 +115,12 @@ IndexOnlyNext(IndexOnlyScanState *node) /* * OK, now that we have what we need, fetch the next tuple. */ - while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + while (table_index_getnext_slot(scandesc, direction, slot)) { - bool tuple_from_heap = false; - CHECK_FOR_INTERRUPTS(); - /* - * We can skip the heap fetch if the TID references a heap page on - * which all tuples are known visible to everybody. In any case, - * we'll use the index tuple not the heap tuple as the data source. - * - * Note on Memory Ordering Effects: visibilitymap_get_status does not - * lock the visibility map buffer, and therefore the result we read - * here could be slightly stale. However, it can't be stale enough to - * matter. - * - * We need to detect clearing a VM bit due to an insert right away, - * because the tuple is present in the index page but not visible. The - * reading of the TID by this scan (using a shared lock on the index - * buffer) is serialized with the insert of the TID into the index - * (using an exclusive lock on the index buffer). Because the VM bit - * is cleared before updating the index, and locking/unlocking of the - * index page acts as a full memory barrier, we are sure to see the - * cleared bit if we see a recently-inserted TID. - * - * Deletes do not update the index page (only VACUUM will clear out - * the TID), so the clearing of the VM bit by a delete is not - * serialized with this test below, and we may see a value that is - * significantly stale. However, we don't care about the delete right - * away, because the tuple is still visible until the deleting - * transaction commits or the statement ends (if it's our - * transaction). 
In either case, the lock on the VM buffer will have - * been released (acting as a write barrier) after clearing the bit. - * And for us to have a snapshot that includes the deleting - * transaction (making the tuple invisible), we must have acquired - * ProcArrayLock after that time, acting as a read barrier. - * - * It's worth going through this complexity to avoid needing to lock - * the VM buffer, which could cause significant contention. - */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, - ItemPointerGetBlockNumber(tid), - &node->ioss_VMBuffer)) - { - /* - * Rats, we have to visit the heap to check visibility. - */ - InstrCountTuples2(node, 1); - if (!index_fetch_heap(scandesc, node->ioss_TableSlot)) - continue; /* no visible tuple, try next index entry */ - - ExecClearTuple(node->ioss_TableSlot); - - /* - * Only MVCC snapshots are supported here, so there should be no - * need to keep following the HOT chain once a visible entry has - * been found. If we did want to allow that, we'd need to keep - * more state to remember not to call index_getnext_tid next time. - */ - if (scandesc->xs_heap_continue) - elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); - - /* - * Note: at this point we are holding a pin on the heap page, as - * recorded in scandesc->xs_cbuf. We could release that pin now, - * but it's not clear whether it's a win to do so. The next index - * entry might require a visit to the same heap page. - */ - - tuple_from_heap = true; - } + InstrCountTuples2(node, scandesc->xs_heapfetch->nheapaccesses); + scandesc->xs_heapfetch->nheapaccesses = 0; /* * Fill the scan tuple slot with data from the index. This might be @@ -238,19 +170,13 @@ IndexOnlyNext(IndexOnlyScanState *node) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("lossy distance functions are not supported in index-only scans"))); - - /* - * If we didn't access the heap, then we'll need to take a predicate - * lock explicitly, as if we had. For now we do that at page level. - */ - if (!tuple_from_heap) - PredicateLockPage(scandesc->heapRelation, - ItemPointerGetBlockNumber(tid), - estate->es_snapshot); - return slot; } + /* XXX This is ugly, but not clear how to do better */ + InstrCountTuples2(node, scandesc->xs_heapfetch->nheapaccesses); + scandesc->xs_heapfetch->nheapaccesses = 0; + /* * if we get here it means the index scan failed so we are at the end of * the scan.. @@ -407,13 +333,6 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node) indexRelationDesc = node->ioss_RelationDesc; indexScanDesc = node->ioss_ScanDesc; - /* Release VM buffer pin, if any. 
*/ - if (node->ioss_VMBuffer != InvalidBuffer) - { - ReleaseBuffer(node->ioss_VMBuffer); - node->ioss_VMBuffer = InvalidBuffer; - } - /* * When ending a parallel worker, copy the statistics gathered by the * worker back into shared memory so that it can be picked up by the main @@ -785,12 +704,12 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, node->ioss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->ioss_RelationDesc, + node->ioss_TableSlot, &node->ioss_Instrument, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, piscan); node->ioss_ScanDesc->xs_want_itup = true; - node->ioss_VMBuffer = InvalidBuffer; /* * If no run-time keys to calculate or they are ready, go ahead and pass @@ -851,6 +770,7 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, node->ioss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->ioss_RelationDesc, + node->ioss_TableSlot, &node->ioss_Instrument, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 72b135e5d..fdf8bcbc2 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -107,7 +107,7 @@ IndexNext(IndexScanState *node) * serially executing an index scan that was planned to be parallel. */ scandesc = index_beginscan(node->ss.ss_currentRelation, - node->iss_RelationDesc, + node->iss_RelationDesc, NULL, estate->es_snapshot, &node->iss_Instrument, node->iss_NumScanKeys, @@ -128,7 +128,7 @@ IndexNext(IndexScanState *node) /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while (table_index_getnext_slot(scandesc, direction, slot)) { CHECK_FOR_INTERRUPTS(); @@ -203,7 +203,7 @@ IndexNextWithReorder(IndexScanState *node) * serially executing an index scan that was planned to be parallel. */ scandesc = index_beginscan(node->ss.ss_currentRelation, - node->iss_RelationDesc, + node->iss_RelationDesc, NULL, estate->es_snapshot, &node->iss_Instrument, node->iss_NumScanKeys, @@ -260,7 +260,7 @@ IndexNextWithReorder(IndexScanState *node) * Fetch next tuple from the index. */ next_indextuple: - if (!index_getnext_slot(scandesc, ForwardScanDirection, slot)) + if (!table_index_getnext_slot(scandesc, ForwardScanDirection, slot)) { /* * No more tuples from the index. 
But we still need to drain any @@ -1719,7 +1719,7 @@ ExecIndexScanInitializeDSM(IndexScanState *node, node->iss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, - node->iss_RelationDesc, + node->iss_RelationDesc, NULL, &node->iss_Instrument, node->iss_NumScanKeys, node->iss_NumOrderByKeys, @@ -1783,7 +1783,7 @@ ExecIndexScanInitializeWorker(IndexScanState *node, node->iss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, - node->iss_RelationDesc, + node->iss_RelationDesc, NULL, &node->iss_Instrument, node->iss_NumScanKeys, node->iss_NumOrderByKeys, diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 5d4f81ee7..83681a6f3 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -45,7 +45,7 @@ /* Whether we are looking for plain indexscan, bitmap scan, or either */ typedef enum { - ST_INDEXSCAN, /* must support amgettuple */ + ST_INDEXSCAN, /* must support amgettuple or amgetbatch */ ST_BITMAPSCAN, /* must support amgetbitmap */ ST_ANYSCAN, /* either is okay */ } ScanTypeControl; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index e553afb7f..ac89c5ee3 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -313,11 +313,11 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->amsearcharray = amroutine->amsearcharray; info->amsearchnulls = amroutine->amsearchnulls; info->amcanparallel = amroutine->amcanparallel; - info->amhasgettuple = (amroutine->amgettuple != NULL); + info->amhasgettuple = (amroutine->amgettuple != NULL || + amroutine->amgetbatch != NULL); info->amhasgetbitmap = amroutine->amgetbitmap != NULL && relation->rd_tableam->scan_bitmap_next_tuple != NULL; - info->amcanmarkpos = (amroutine->ammarkpos != NULL && - amroutine->amrestrpos != NULL); + info->amcanmarkpos = amroutine->amposreset != NULL; info->amcostestimate = amroutine->amcostestimate; Assert(info->amcostestimate != NULL); diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c index 2c8485b88..4a74b69df 100644 --- a/src/backend/replication/logical/relation.c +++ b/src/backend/replication/logical/relation.c @@ -889,7 +889,8 @@ IsIndexUsableForReplicaIdentityFull(Relation idxrel, AttrMap *attrmap) * The given index access method must implement "amgettuple", which will * be used later to fetch the tuples. See RelationFindReplTupleByIndex(). */ - if (GetIndexAmRoutineByAmId(idxrel->rd_rel->relam, false)->amgettuple == NULL) + if (GetIndexAmRoutineByAmId(idxrel->rd_rel->relam, false)->amgettuple == NULL && + GetIndexAmRoutineByAmId(idxrel->rd_rel->relam, false)->amgetbatch == NULL) return false; return true; diff --git a/src/backend/utils/adt/amutils.c b/src/backend/utils/adt/amutils.c index 0af26d6ac..1ebe0a76a 100644 --- a/src/backend/utils/adt/amutils.c +++ b/src/backend/utils/adt/amutils.c @@ -363,7 +363,7 @@ indexam_property(FunctionCallInfo fcinfo, PG_RETURN_BOOL(routine->amclusterable); case AMPROP_INDEX_SCAN: - PG_RETURN_BOOL(routine->amgettuple ? true : false); + PG_RETURN_BOOL(routine->amgettuple || routine->amgetbatch ? true : false); case AMPROP_BITMAP_SCAN: PG_RETURN_BOOL(routine->amgetbitmap ? true : false); @@ -392,7 +392,7 @@ indexam_property(FunctionCallInfo fcinfo, PG_RETURN_BOOL(routine->amcanmulticol); case AMPROP_CAN_EXCLUDE: - PG_RETURN_BOOL(routine->amgettuple ? 
true : false); + PG_RETURN_BOOL(routine->amgettuple || routine->amgetbatch ? true : false); case AMPROP_CAN_INCLUDE: PG_RETURN_BOOL(routine->amcaninclude); diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index c760b19db..7bba223c0 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -102,7 +102,6 @@ #include "access/gin.h" #include "access/table.h" #include "access/tableam.h" -#include "access/visibilitymap.h" #include "catalog/pg_collation.h" #include "catalog/pg_operator.h" #include "catalog/pg_statistic.h" @@ -7043,10 +7042,6 @@ get_actual_variable_endpoint(Relation heapRel, bool have_data = false; SnapshotData SnapshotNonVacuumable; IndexScanDesc index_scan; - Buffer vmbuffer = InvalidBuffer; - BlockNumber last_heap_block = InvalidBlockNumber; - int n_visited_heap_pages = 0; - ItemPointer tid; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; MemoryContext oldcontext; @@ -7098,7 +7093,7 @@ get_actual_variable_endpoint(Relation heapRel, InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(heapRel)); - index_scan = index_beginscan(heapRel, indexRel, + index_scan = index_beginscan(heapRel, indexRel, tableslot, &SnapshotNonVacuumable, NULL, 1, 0); /* Set it up for index-only scan */ @@ -7106,48 +7101,20 @@ get_actual_variable_endpoint(Relation heapRel, index_rescan(index_scan, scankeys, 1, NULL, 0); /* Fetch first/next tuple in specified direction */ - while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL) + while (table_index_getnext_slot(index_scan, indexscandir, tableslot)) { - BlockNumber block = ItemPointerGetBlockNumber(tid); + /* We don't actually need the heap tuple for anything */ + ExecClearTuple(tableslot); - if (!VM_ALL_VISIBLE(heapRel, - block, - &vmbuffer)) - { - /* Rats, we have to visit the heap to check visibility */ - if (!index_fetch_heap(index_scan, tableslot)) - { - /* - * No visible tuple for this index entry, so we need to - * advance to the next entry. Before doing so, count heap - * page fetches and give up if we've done too many. - * - * We don't charge a page fetch if this is the same heap page - * as the previous tuple. This is on the conservative side, - * since other recently-accessed pages are probably still in - * buffers too; but it's good enough for this heuristic. - */ + /* + * No visible tuple for this index entry, so we need to advance to the + * next entry. Before doing so, count heap page fetches and give up + * if we've done too many. + */ #define VISITED_PAGES_LIMIT 100 - if (block != last_heap_block) - { - last_heap_block = block; - n_visited_heap_pages++; - if (n_visited_heap_pages > VISITED_PAGES_LIMIT) - break; - } - - continue; /* no visible tuple, try next index entry */ - } - - /* We don't actually need the heap tuple for anything */ - ExecClearTuple(tableslot); - - /* - * We don't care whether there's more than one visible tuple in - * the HOT chain; if any are visible, that's good enough. 
- */ - } + if (index_scan->xs_heapfetch->nheapaccesses > VISITED_PAGES_LIMIT) + break; /* * We expect that the index will return data in IndexTuple not @@ -7180,8 +7147,6 @@ get_actual_variable_endpoint(Relation heapRel, break; } - if (vmbuffer != InvalidBuffer) - ReleaseBuffer(vmbuffer); index_endscan(index_scan); return have_data; diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index 7a468b4a1..0c3ba684a 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -148,8 +148,7 @@ blhandler(PG_FUNCTION_ARGS) amroutine->amgettuple = NULL; amroutine->amgetbitmap = blgetbitmap; amroutine->amendscan = blendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 63d7e376f..cdc9c65a6 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -161,10 +161,11 @@ typedef struct IndexAmRoutine ambeginscan_function ambeginscan; amrescan_function amrescan; amgettuple_function amgettuple; /* can be NULL */ + amgetbatch_function amgetbatch; /* can be NULL */ + amfreebatch_function amfreebatch; /* can be NULL */ amgetbitmap_function amgetbitmap; /* can be NULL */ amendscan_function amendscan; - ammarkpos_function ammarkpos; /* can be NULL */ - amrestrpos_function amrestrpos; /* can be NULL */ + amposreset_function amposreset; /* can be NULL */ /* interface functions to support parallel index scans */ amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ @@ -743,6 +744,137 @@ amgettuple (IndexScanDesc scan, amgettuple field in its IndexAmRoutine struct must be set to NULL. + + + As of PostgreSQL version 19, position marking + and restoration of scans is no longer supported for the + amgettuple interface; only the + amgetbatch interface supports this feature through + the amposreset callback. + + + + + +BatchIndexScan +amgetbatch (IndexScanDesc scan, + BatchIndexScan priorbatch, + ScanDirection direction); + + Return the next batch of index tuples in the given scan, moving in the + given direction (forward or backward in the index). Returns an instance of + BatchIndexScan with index tuples loaded, or + NULL if there are no more index tuples. + + + + The priorbatch parameter passes the batch previously + returned by an earlier amgetbatch call (or + NULL on the first call). The index AM uses + priorbatch to determine which index page to read next, + typically by following page links found in priorbatch. + The returned batch contains matching items immediately adjacent to those + from priorbatch in the common case where + priorbatch is the batch that was returned by the most + recent call to amgetbatch call (though not when the + most recent call used the opposite scan direction to this call, and not + when a mark has been restored). + + + + A batch returned by amgetbatch is guaranteed to be + associated with an index page containing at least one matching tuple. + The index page associated with the batch may be retained in a buffer with + its pin held as an interlock against concurrent TID recycling by + VACUUM. See for details + on buffer pin management during plain index scans. + + + + The amgetbatch interface does not support index-only + scans that return data via the xs_hitup mechanism. + Index-only scans are supported through the xs_itup + mechanism only. 
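To make the calling convention concrete, here is a hypothetical consumer loop that follows the contract documented above. It is a sketch only: a real table AM would normally keep several batches alive at once to drive prefetching, rather than freeing each batch as soon as the next one has been obtained.

static void
consume_index_batches(IndexScanDesc scan, ScanDirection dir)
{
	IndexAmRoutine *indam = scan->indexRelation->rd_indam;
	BatchIndexScan batch,
				prior = NULL;

	for (;;)
	{
		/* Ask the index AM for the next leaf page's worth of matches */
		batch = indam->amgetbatch(scan, prior, dir);

		/* The prior batch is no longer needed to position the scan */
		if (prior != NULL)
			indam->amfreebatch(scan, prior);

		if (batch == NULL)
			break;				/* no more index pages with matching tuples */

		/*
		 * Process batch->items[batch->firstItem .. batch->lastItem] here,
		 * e.g. by fetching (or prefetching) the referenced table rows.
		 */

		prior = batch;
	}
}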
+ + + + The amgetbatch function need only be provided if the + access method supports plain index scans. If it doesn't, + the amgetbatch field in its + IndexAmRoutine struct must be set to NULL. + + + + A BatchIndexScan that is returned by + amgetbatch is no longer managed by the access method. + It is up to the table AM caller to decide when it should be freed by + passing it to amfreebatch. Note also that + amgetbatch functions must never modify the + priorbatch parameter. + + + + The access method may provide only one of amgettuple + and amgetbatch callbacks, not both (XXX uncertain). + When the access method provides amgetbatch, it must + also provide amfreebatch. + + + + The same caveats described for amgettuple apply here + too: an entry in the returned batch means only that the index contains + an entry that matches the scan keys, not that the tuple necessarily still + exists in the heap or will pass the caller's snapshot test. + + + + +void +amfreebatch (IndexScanDesc scan, + BatchIndexScan batch); + + Releases a batch returned by the amgetbatch callback. + This function is called exclusively by table access methods to indicate + that processing of the batch is complete; it should never be called within + the index access method itself. + + + + amfreebatch frees buffer pins held on the batch's + associated index page and releases related memory and resources. These + buffer pins serve as an interlock against concurrent TID recycling by + VACUUM, protecting the table access method from confusion + about which TID corresponds to which logical row. See for detailed discussion of buffer pin management. + + + + The index AM may choose to retain its own buffer pins across multiple + amfreebatch calls when this serves an internal purpose + (for example, maintaining a descent stack of pinned index pages for reuse + across amgetbatch calls). However, any scheme that + retains buffer pins must keep the number of retained pins fixed and small, + to avoid exhausting the backend's buffer pin limit. + + + + The index AM has the option of setting LP_DEAD bits in + the index page to mark dead tuples before releasing the buffer pin. When + BatchQueue.dropPin is true and the buffer pin is being + dropped eagerly, the index AM must check BatchIndexScan.lsn + to verify that the page LSN has not advanced since the batch was originally + read before setting LP_DEAD bits, to avoid concurrent + TID recycling hazards. When scan->batchqueue->dropPin + is false (requiring that a buffer pin be held throughout first reading the + index leaf page and calling amfreebatch), + LP_DEAD bits can always be set safely without an LSN check. + + + + The amfreebatch function need only be provided if the + access method provides amgetbatch. Otherwise it has to + remain set to NULL. + @@ -762,8 +894,8 @@ amgetbitmap (IndexScanDesc scan, itself, and therefore callers recheck both the scan conditions and the partial index predicate (if any) for recheckable tuples. That might not always be true, however. - amgetbitmap and - amgettuple cannot be used in the same index scan; there + Only one of amgetbitmap, amgettuple, + or amgetbatch can be used in any given index scan; there are other restrictions too when using amgetbitmap, as explained in . @@ -789,32 +921,25 @@ amendscan (IndexScanDesc scan); void -ammarkpos (IndexScanDesc scan); +amposreset (IndexScanDesc scan); - Mark current scan position. The access method need only support one - remembered scan position per scan. 
+ Notify index AM that core code will change the scan's position to an item + returned as part of an earlier batch. The index AM must therefore + invalidate any state that independently tracks the scan's progress + (e.g., array keys used with a ScalarArrayOpExpr qual). Called by the core + system when it is about to restore a mark. - The ammarkpos function need only be provided if the access - method supports ordered scans. If it doesn't, - the ammarkpos field in its IndexAmRoutine - struct may be set to NULL. - - - - -void -amrestrpos (IndexScanDesc scan); - - Restore the scan to the most recently marked position. - - - - The amrestrpos function need only be provided if the access - method supports ordered scans. If it doesn't, - the amrestrpos field in its IndexAmRoutine - struct may be set to NULL. + The amposreset function can only be provided if the + access method supports ordered scans through the amgetbatch + interface. If it doesn't, the amposreset field + in its IndexAmRoutine struct should be set to + NULL. Index AMs that don't have any private state that might need to be + invalidated might still find it useful to provide an empty + amposreset function; if amposreset + is set to NULL, the core system will assume that it is unsafe to restore a + marked position. @@ -988,30 +1113,47 @@ amtranslatecmptype (CompareType cmptype, Oid opfamily, Oid opcintype); - The amgettuple function has a direction argument, + The amgettuple and amgetbatch + functions have a direction argument, which can be either ForwardScanDirection (the normal case) or BackwardScanDirection. If the first call after amrescan specifies BackwardScanDirection, then the set of matching index entries is to be scanned back-to-front rather than in - the normal front-to-back direction, so amgettuple must return - the last matching tuple in the index, rather than the first one as it - normally would. (This will only occur for access - methods that set amcanorder to true.) After the - first call, amgettuple must be prepared to advance the scan in + the normal front-to-back direction. In this case, + amgettuple must return the last matching tuple in the + index, rather than the first one as it normally would. Similarly, + amgetbatch must return the last matching batch of items + when either the first call after amrescan specifies + BackwardScanDirection, or a subsequent call has + NULL as its priorbatch argument + (indicating a backward scan restart). (This backward-scan behavior will + only occur for access methods that set amcanorder + to true.) After the first call, both amgettuple and + amgetbatch must be prepared to advance the scan in either direction from the most recently returned entry. (But if amcanbackward is false, all subsequent calls will have the same direction as the first one.) - Access methods that support ordered scans must support marking a - position in a scan and later returning to the marked position. The same - position might be restored multiple times. However, only one position need - be remembered per scan; a new ammarkpos call overrides the - previously marked position. An access method that does not support ordered - scans need not provide ammarkpos and amrestrpos - functions in IndexAmRoutine; set those pointers to NULL - instead. + Access methods using the amgetbatch interface may + support marking a position in a scan and later returning to + the marked position, though this is optional. 
The same marked position + might be restored multiple times. The core system manages marking and + restoration through the index_batch_mark_pos and + index_batch_restore_pos internal functions. When a + marked position is restored, the index AM is notified via the + amposreset callback so it can invalidate any private + state that independently tracks the scan's progress (such as array key + state). + + + + The amposreset function in IndexAmRoutine + should be set to NULL for access methods that do not support mark/restore. + For access methods that do support this feature, amposreset + must be provided (though it can be a no-op function if the AM has no private + state to invalidate). @@ -1180,6 +1322,94 @@ amtranslatecmptype (CompareType cmptype, Oid opfamily, Oid opcintype); reduce the frequency of such transaction cancellations. + + Batch Scanning and Buffer Pin Management + + + Index access methods that implement the amgetbatch + interface must cooperate with the core system to manage buffer pins in a + way that prevents concurrent VACUUM from creating + TID recycling hazards. Unlike amgettuple scans, + which keep the index access method in control of scan progression, + amgetbatch scans give control to the table access + method, which may fetch table tuples in a different order than the index + entries were returned. This creates the need for explicit buffer pin + management to ensure the table access method does not confuse a recycled + TID with the original row it meant to reference. + + + + When amgetbatch returns a batch, the batch's + associated index page may be retained in a buffer with a pin held on it. + This pin serves as an interlock: VACUUM cannot recycle + TIDs on a pinned page. The buffer pin protects only the table access + method's ability to map TIDs to rows correctly; it does not protect the + index structure itself. Index access methods may use pins for other + purposes (for example, maintaining a descent stack of pinned pages), but + those uses are internal to the access method and independent of the + table-AM synchronization described here. + + + + Whether a pin should be held when returning a batch is controlled by the + dropPin flag in the BatchQueue + structure. When dropPin is true, the index access method + drops the pin before returning the batch, which avoids blocking + VACUUM. When dropPin is false, the + index access method must hold the pin until the batch is freed via + amfreebatch. The core system sets the + dropPin flag based on scan type: it is true for + MVCC-compliant snapshots on logged relations (unless index-only scans are + in use), and false otherwise. + + + + When dropPin is true and the index access method is + eager about dropping pins, it must save the page's LSN in the batch before + returning. Later, when amfreebatch is called and the + access method wishes to set LP_DEAD bits to mark dead + tuples, it must verify that the page's LSN has not changed since the batch + was read. If the LSN has changed, the page may have been modified by + concurrent activity and it is unsafe to set LP_DEAD bits. + This LSN-based validation scheme protects against TID recycling races when + pins have been dropped. When dropPin is false, the pin + prevents unsafe concurrent removal of table TID references by + VACUUM, so no LSN check is necessary.
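
As a worked example of the protocol just described, an amfreebatch implementation might set LP_DEAD bits along the following lines. This is a sketch under stated assumptions, not code from the patch: scan->batchqueue->dropPin and the batch's saved lsn are the fields named above, while batch->buf, batch->currPage, and the actual item-marking step stand in for AM-specific details.

#include "access/relscan.h"
#include "storage/bufmgr.h"

static void
example_mark_dead_items(IndexScanDesc scan, BatchIndexScan batch)
{
	Relation	rel = scan->indexRelation;
	Buffer		buf;

	if (scan->batchqueue->dropPin)
	{
		/* pin was dropped when the batch was returned; read the page again */
		buf = ReadBuffer(rel, batch->currPage);		/* hypothetical field */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		/*
		 * Setting LP_DEAD bits is only safe if nothing modified the page
		 * (and so possibly recycled its TIDs) since the batch was read.
		 */
		if (BufferGetLSNAtomic(buf) != batch->lsn)
		{
			UnlockReleaseBuffer(buf);
			return;
		}
	}
	else
	{
		/* pin has been held since the leaf page was first read */
		buf = batch->buf;							/* hypothetical field */
		LockBuffer(buf, BUFFER_LOCK_SHARE);
	}

	/* ... set LP_DEAD bits for the batch's known-dead items here ... */

	if (scan->batchqueue->dropPin)
		UnlockReleaseBuffer(buf);
	else
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}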
+ + + + The core system provides three utility functions for managing batch + resources: + indexam_util_batch_alloc allocates a new batch or + reuses a cached one, + indexam_util_batch_unlock drops the lock and + conditionally drops the pin on a batch's index page (based on the + dropPin setting), and + indexam_util_batch_release frees or caches a batch. + Index access methods should use these utilities rather than managing + buffers directly. The src/backend/access/nbtree/ + implementation provides a reference example of correct usage. + + + + Note that amfreebatch is called only by the core code + and table access method, never by the index access method itself. The + index AM must not assume that a call to amfreebatch + will take place before another call to amgetbatch + (for the same index scan) takes place. + + + + The index AM must also avoid relying on the core code calling + amfreebatch with batches that are in any particular + order. For example, it is not okay for an index AM to assume that calls + to amfreebatch will take place in the same order as + the amgetbatch calls that initially + allocated/populated/returned each batch. + + + + diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index 77c5a763d..55b7222e9 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -1152,12 +1152,13 @@ WITH ( MODULUS numeric_literal, REM - The access method must support amgettuple (see ); at present this means GIN - cannot be used. Although it's allowed, there is little point in using - B-tree or hash indexes with an exclusion constraint, because this - does nothing that an ordinary unique constraint doesn't do better. - So in practice the access method will always be GiST or + The access method must support either amgettuple + or amgetbatch (see ); at + present this means GIN cannot be used. Although + it's allowed, there is little point in using B-tree or hash indexes + with an exclusion constraint, because this does nothing that an + ordinary unique constraint doesn't do better. So in practice the + access method will always be GiST or SP-GiST. diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c index a34382a5f..5526771b5 100644 --- a/src/test/modules/dummy_index_am/dummy_index_am.c +++ b/src/test/modules/dummy_index_am/dummy_index_am.c @@ -317,10 +317,10 @@ dihandler(PG_FUNCTION_ARGS) amroutine->ambeginscan = dibeginscan; amroutine->amrescan = direscan; amroutine->amgettuple = NULL; + amroutine->amgetbatch = NULL; amroutine->amgetbitmap = NULL; amroutine->amendscan = diendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 9dd65b102..6812d2fd7 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -220,8 +220,6 @@ BTScanInsertData BTScanKeyPreproc BTScanOpaque BTScanOpaqueData -BTScanPosData -BTScanPosItem BTShared BTSortArrayContext BTSpool @@ -3450,12 +3448,10 @@ amgettuple_function aminitparallelscan_function aminsert_function aminsertcleanup_function -ammarkpos_function amoptions_function amparallelrescan_function amproperty_function amrescan_function -amrestrpos_function amtranslate_cmptype_function amtranslate_strategy_function amvacuumcleanup_function -- 2.51.0
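
For contrast with the blhandler() and dihandler() hunks above, which leave the new callbacks unset, a handler for a hypothetical access method that does implement batching would fill them in along these lines. Sketch only: the exam_* callbacks are placeholders assumed to be declared earlier in the module, and unrelated IndexAmRoutine fields are omitted.

#include "postgres.h"
#include "access/amapi.h"
#include "fmgr.h"

PG_FUNCTION_INFO_V1(examhandler);

Datum
examhandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	/* ... am property fields and other support functions omitted ... */

	amroutine->ambeginscan = exam_beginscan;
	amroutine->amrescan = exam_rescan;
	amroutine->amgettuple = NULL;		/* not needed when amgetbatch is provided */
	amroutine->amgetbatch = exam_getbatch;
	amroutine->amfreebatch = exam_freebatch;	/* required with amgetbatch */
	amroutine->amgetbitmap = exam_getbitmap;
	amroutine->amendscan = exam_endscan;
	amroutine->amposreset = exam_posreset;	/* NULL if mark/restore is unsupported */
	amroutine->amestimateparallelscan = NULL;
	amroutine->aminitparallelscan = NULL;
	amroutine->amparallelrescan = NULL;

	PG_RETURN_POINTER(amroutine);
}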