From 93d7675cb5f56629b948e3e186de7015d67c7070 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 1 May 2025 20:23:18 +0200 Subject: [PATCH v20250709 2/6] prefetch for btree indexes Implements the bt_stream_read_next() callback, returning blocks from the current BTScanOpaque. --- src/backend/access/nbtree/nbtree.c | 162 ++++++++++++++++++++++++++ src/backend/access/nbtree/nbtsearch.c | 46 ++++++++ src/include/access/nbtree.h | 9 ++ 3 files changed, 217 insertions(+) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 619b356e848..0fa4af79dac 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -21,9 +21,11 @@ #include "access/nbtree.h" #include "access/relscan.h" #include "access/stratnum.h" +#include "access/visibilitymap.h" #include "commands/progress.h" #include "commands/vacuum.h" #include "nodes/execnodes.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "storage/bulk_write.h" #include "storage/condition_variable.h" @@ -329,6 +331,107 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) return ntids; } +/* + * bt_stream_read_next + * Return the next block to read from the read stream. + * + * Returns the next block from the current leaf page. The first block is + * when accessing the first tuple, after already receiving the TID from the + * index (for the item itemIndex points at). + * + * With index-only scans this skips all-visible pages. The visibility info + * is stored, so that we can later pass it to the scan (we must not access + * the VM again, the bit might have changes, and the read stream would get + * out of sync (we'd get different blocks than we expect expect). + * + * Returns the block number to get from the read stream. InvalidBlockNumber + * means we've ran out of item on the current leaf page - the stream will + * end, and we'll need to reset it after reading the next page (or after + * changing the scan direction). + * + * XXX Should skip duplicate blocks (for correlated indexes). But that's + * not implemented yet. + */ +static BlockNumber +bt_stream_read_next(ReadStream *stream, + void *callback_private_data, + void *per_buffer_data) +{ + IndexScanDesc scan = (IndexScanDesc) callback_private_data; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + ScanDirection dir = so->currPos.dir; + BlockNumber block = InvalidBlockNumber; + + /* + * Is this the first request for the read stream (possibly after a reset)? + * If yes, initialize the stream to the current item (itemIndex). + */ + if (so->currPos.streamIndex == -1) + so->currPos.streamIndex = so->currPos.itemIndex; + + /* + * Find the next block to read. For plain index scans we will return the + * very next item, but with index-only scans we skip TIDs from all-visible + * pages (because we won't read those). + */ + while ((so->currPos.streamIndex >= so->currPos.firstItem) && + (so->currPos.streamIndex <= so->currPos.lastItem)) + { + ItemPointer tid; + BTScanPosItem *item; + + item = &so->currPos.items[so->currPos.streamIndex]; + + tid = &item->heapTid; + block = ItemPointerGetBlockNumber(tid); + + /* + * For index-only scans, check the VM and remember the result. If the page + * is all-visible, don't return the block number, try reading the next one. + * + * XXX Maybe this could use the same logic to check for duplicate blocks, + * and reuse the VM result if possible. + */ + if (scan->xs_want_itup) + { + if (!item->allVisibleSet) + { + item->allVisibleSet = true; + item->allVisible = VM_ALL_VISIBLE(scan->heapRelation, + ItemPointerGetBlockNumber(tid), + &so->vmBuffer); + } + + /* don't prefetch this all-visible block, try the next one */ + if (item->allVisible) + block = InvalidBlockNumber; + } + + /* advance to the next item, assuming the current scan direction */ + if (ScanDirectionIsForward(dir)) + { + so->currPos.streamIndex++; + } + else + { + so->currPos.streamIndex--; + } + + /* don't return the same block twice (and remember this one) */ + if (so->lastBlock == block) + block = InvalidBlockNumber; + + /* Did we find a valid block? If yes, we're done. */ + if (block != InvalidBlockNumber) + break; + } + + /* remember the block we're returning */ + so->lastBlock = block; + + return block; +} + /* * btbeginscan() -- start a scan on a btree index */ @@ -364,6 +467,12 @@ btbeginscan(Relation heap, Relation index, int nkeys, int norderbys) so->killedItems = NULL; /* until needed */ so->numKilled = 0; + /* buffer for accessing the VM in read_next callback */ + so->vmBuffer = InvalidBuffer; + + /* nothing returned */ + so->lastBlock = InvalidBlockNumber; + /* * We don't know yet whether the scan will be index-only, so we do not * allocate the tuple workspace arrays until btrescan. However, we set up @@ -375,6 +484,27 @@ btbeginscan(Relation heap, Relation index, int nkeys, int norderbys) scan->opaque = so; + /* + * Initialize the read stream too, to opt in into prefetching. + * + * XXX We create a stream depending on the GUC, and only if the heap rel + * is provided. This means we don't initialize the stream even for bitmap + * scans, which don't use it. + * + * XXX The table has to be already locked by the query, so NoLock. Too + * bad the heapRelation does not get passed here. + */ + if (enable_indexscan_prefetch && heap) + { + scan->xs_rs = read_stream_begin_relation(READ_STREAM_DEFAULT, + NULL, + heap, + MAIN_FORKNUM, + bt_stream_read_next, + scan, + 0); + } + return scan; } @@ -461,6 +591,14 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ so->numArrayKeys = 0; /* ditto */ + + /* reset the read stream, to start over */ + if (scan->xs_rs) + { + so->currPos.streamIndex = -1; + so->lastBlock = InvalidBlockNumber; + read_stream_reset(scan->xs_rs); + } } /* @@ -483,6 +621,22 @@ btendscan(IndexScanDesc scan) so->markItemIndex = -1; BTScanPosUnpinIfPinned(so->markPos); + /* release the VM buffer */ + if (so->vmBuffer != InvalidBuffer) + { + ReleaseBuffer(so->vmBuffer); + so->vmBuffer = InvalidBuffer; + } + + /* + * XXX I wonder if maybe the stream should be managed by the indexam.c + * layer, not by each index AM. + */ + + /* terminate the read stream (and close the heap) */ + if (scan->xs_rs) + read_stream_end(scan->xs_rs); + /* No need to invalidate positions, the RAM is about to be freed. */ /* Release storage */ @@ -581,6 +735,14 @@ btrestrpos(IndexScanDesc scan) else BTScanPosInvalidate(so->currPos); } + + /* we're restored the scan position, reset the read stream */ + if (scan->xs_rs) + { + so->currPos.streamIndex = -1; + so->lastBlock = InvalidBlockNumber; + read_stream_reset(scan->xs_rs); + } } /* diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 4af1ff1e9e5..91e4fe5ec95 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -17,6 +17,7 @@ #include "access/nbtree.h" #include "access/relscan.h" +#include "access/visibilitymap.h" #include "access/xact.h" #include "miscadmin.h" #include "pgstat.h" @@ -2045,6 +2046,21 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, */ Assert(!pstate.forcenonrequired); + /* + * Reset the read stream, to restart it for the new page. + * + * XXX Maybe we should not reset prefetch distance to 0, but start from + * a somewhat higher value. We're merely continuing the same scan as + * before ... maybe reduce it a bit, to not harm LIMIT queries, but not + * reset it all the way to 0. + */ + if (scan->xs_rs) + { + so->currPos.streamIndex = -1; + so->lastBlock = InvalidBlockNumber; + read_stream_reset(scan->xs_rs); + } + return (so->currPos.firstItem <= so->currPos.lastItem); } @@ -2059,6 +2075,11 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; + + /* initialize visibility flags */ + currItem->allVisibleSet = false; + currItem->allVisible = false; + if (so->currTuples) { Size itupsz = IndexTupleSize(itup); @@ -2089,6 +2110,11 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, currItem->heapTid = *heapTid; currItem->indexOffset = offnum; + + /* initialize visibility flags */ + currItem->allVisibleSet = false; + currItem->allVisible = false; + if (so->currTuples) { /* Save base IndexTuple (truncate posting list) */ @@ -2126,6 +2152,10 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, currItem->heapTid = *heapTid; currItem->indexOffset = offnum; + /* initialize visibility flags */ + currItem->allVisibleSet = false; + currItem->allVisible = false; + /* * Have index-only scans return the same base IndexTuple for every TID * that originates from the same posting list @@ -2150,6 +2180,22 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so) /* Return next item, per amgettuple contract */ scan->xs_heaptid = currItem->heapTid; + + /* + * XXX If this is index-only scan and we haven't checked the VM yet, do + * that now. We need to make sure scan->xs_visible is set correctly even + * if the scan is not using a read stream. + */ + if (scan->xs_want_itup && !currItem->allVisibleSet) + { + currItem->allVisibleSet = true; + currItem->allVisible + = VM_ALL_VISIBLE(scan->heapRelation, + ItemPointerGetBlockNumber(&currItem->heapTid), + &so->vmBuffer); + } + + scan->xs_visible = currItem->allVisible; if (so->currTuples) scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); } diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index e6e52210b15..e307fd9bf7f 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -957,6 +957,8 @@ typedef struct BTScanPosItem /* what we remember about each match */ ItemPointerData heapTid; /* TID of referenced heap item */ OffsetNumber indexOffset; /* index item's location within page */ LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */ + bool allVisibleSet; /* did we set the VM flag already? */ + bool allVisible; /* VM info (for IOS) */ } BTScanPosItem; typedef struct BTScanPosData @@ -995,6 +997,7 @@ typedef struct BTScanPosData int firstItem; /* first valid index in items[] */ int lastItem; /* last valid index in items[] */ int itemIndex; /* current index in items[] */ + int streamIndex; /* item returned to the read stream */ BTScanPosItem items[MaxTIDsPerBTreePage]; /* MUST BE LAST */ } BTScanPosData; @@ -1067,6 +1070,9 @@ typedef struct BTScanOpaqueData FmgrInfo *orderProcs; /* ORDER procs for required equality keys */ MemoryContext arrayContext; /* scan-lifespan context for array data */ + /* buffer for accessing VM in index-only scans */ + Buffer vmBuffer; + /* info about killed items if any (killedItems is NULL if never used) */ int *killedItems; /* currPos.items indexes of killed items */ int numKilled; /* number of currently stored items */ @@ -1089,6 +1095,9 @@ typedef struct BTScanOpaqueData */ int markItemIndex; /* itemIndex, or -1 if not valid */ + /* last block returned by the read_next stream callback */ + BlockNumber lastBlock; + /* keep these last in struct for efficiency */ BTScanPosData currPos; /* current position data */ BTScanPosData markPos; /* marked position, if any */ -- 2.50.0