From 8e918cc08e7eae2d5c92f6937004e63d70131deb Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 15 Nov 2025 14:03:58 -0500 Subject: [PATCH v5 2/4] Add prefetching to index scans using batch interfaces. This commit implements I/O prefetching for index scans, made possible by the recent addition of batching interfaces to both the table AM and index AM APIs. The amgetbatch index AM interface provides batches of TIDs (rather than one at a time) from a single index leaf page, and allows multiple batches to be held in memory/pinned simultaneously. This gives the table AM the freedom to read ahead within an index scan, which is crucial for I/O prefetching with certain workloads (workloads that would otherwise be unable to keep a sufficiently high prefetch distance for heap block I/O). Prefetching is implemented using a read stream under the control of the table AM. XXX When the batch queue reaches capacity, the stream pauses until the scan catches up and frees some batches. We need a more principled approach here. Essentially, we need infrastructure that allows a read stream callback to tell the read stream to "back off" without it fully ending/resetting the read stream. Note: For now prefetching is temporarily disabled during index-only scans, pending the reintroduction of visibility map caching in batches. Previous versions of the patch series had that, but it was removed when we moved over to the new table AM interface. 
Author: Tomas Vondra Author: Peter Geoghegan Reviewed-By: Andres Freund Reviewed-By: Thomas Munro Discussion: https://postgr.es/m/cf85f46f-b02f-05b2-5248-5000b894ebab@enterprisedb.com --- src/include/access/relscan.h | 33 +- src/include/access/tableam.h | 15 + src/include/optimizer/cost.h | 1 + src/backend/access/heap/heapam_handler.c | 294 +++++++++++++++++- src/backend/access/index/indexam.c | 10 +- src/backend/access/index/indexbatch.c | 46 ++- src/backend/optimizer/path/costsize.c | 1 + src/backend/storage/aio/read_stream.c | 14 +- src/backend/utils/misc/guc_parameters.dat | 7 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/test/regress/expected/sysviews.out | 3 +- 11 files changed, 413 insertions(+), 12 deletions(-) diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 1c9faaad5..1157ba9ba 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -20,6 +20,7 @@ #include "nodes/tidbitmap.h" #include "port/atomics.h" #include "storage/buf.h" +#include "storage/read_stream.h" #include "storage/relfilelocator.h" #include "storage/spin.h" #include "utils/relcache.h" @@ -124,6 +125,7 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker; typedef struct IndexFetchTableData { Relation rel; + ReadStream *rs; } IndexFetchTableData; /* @@ -220,8 +222,14 @@ typedef struct BatchIndexScanData *BatchIndexScan; * Maximum number of batches (leaf pages) we can keep in memory. We need a * minimum of two, since we'll only consider releasing one batch when another * is read. + * + * The choice of 64 batches is arbitrary. It's about 1MB of data with 8KB + * pages (512kB for pages, and then a bit of overhead). We should not really + * need this many batches in most cases, though. The read stream looks ahead + * just enough to queue enough IOs, adjusting the distance (TIDs, but + * ultimately the number of future batches) to meet that. 
*/ -#define INDEX_SCAN_MAX_BATCHES 2 +#define INDEX_SCAN_MAX_BATCHES 64 #define INDEX_SCAN_CACHE_BATCHES 2 #define INDEX_SCAN_BATCH_COUNT(scan) \ ((scan)->batchqueue->nextBatch - (scan)->batchqueue->headBatch) @@ -268,12 +276,35 @@ typedef struct BatchIndexScanData *BatchIndexScan; */ typedef struct BatchQueue { + bool reset; + + /* + * Did we disable prefetching/use of a read stream because it didn't pay + * for itself? + */ + bool prefetchingLockedIn; + bool disabled; + + /* + * During prefetching, currentPrefetchBlock is the table AM block number + * that was returned by our read stream callback most recently. Used to + * suppress duplicate successive read stream block requests. + * + * Prefetching can still perform non-successive requests for the same + * block number (in general we're prefetching in exactly the same order + * that the scan will return table AM TIDs in). We need to avoid + * duplicate successive requests because table AMs expect to be able to + * hang on to buffer pins across table_index_fetch_tuple calls. + */ + BlockNumber currentPrefetchBlock; + /* Current scan direction, for the currently loaded batches */ ScanDirection direction; /* current positions in batches[] for scan */ BatchQueueItemPos readPos; /* read position */ BatchQueueItemPos markPos; /* mark/restore position */ + BatchQueueItemPos streamPos; /* stream position (for prefetching) */ BatchIndexScan markBatch; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index a543682bb..75c6b5276 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -448,6 +448,21 @@ typedef struct TableAmRoutine ScanDirection direction, TupleTableSlot *slot); + /* + * Read stream callback, used to perform I/O prefetching of table AM pages + * during `index_getnext_slot` index scans. + * + * This callback is directly passed to read_stream_begin_relation, from + * batch_getnext routine. 
It will only be used during scans whose index + * AM uses the amgetbatch interface. (Scans with amgettuple-based index + * AMs cannot reasonably be used for I/O prefetching, since its opaque + * tuple-at-a-time interface makes it impossible to schedule index scan + * work sensibly.) + */ + BlockNumber (*index_getnext_stream) (ReadStream *stream, + void *callback_private_data, + void *per_buffer_data); + /* * Fetch tuple at `tid` into `slot`, after doing a visibility test * according to `snapshot`. If a tuple was found and passed the visibility diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index b523bcda8..00f4c3d00 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -51,6 +51,7 @@ extern PGDLLIMPORT Cost disable_cost; extern PGDLLIMPORT int max_parallel_workers_per_gather; extern PGDLLIMPORT bool enable_seqscan; extern PGDLLIMPORT bool enable_indexscan; +extern PGDLLIMPORT bool enable_indexscan_prefetch; extern PGDLLIMPORT bool enable_indexonlyscan; extern PGDLLIMPORT bool enable_bitmapscan; extern PGDLLIMPORT bool enable_tidscan; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 503ff095e..b9d42b15a 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -59,6 +59,9 @@ static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan); static bool BitmapHeapScanNextBlock(TableScanDesc scan, bool *recheck, uint64 *lossy_pages, uint64 *exact_pages); +static BlockNumber heapam_getnext_stream(ReadStream *stream, + void *callback_private_data, + void *per_buffer_data); /* ------------------------------------------------------------------------ @@ -84,6 +87,7 @@ heapam_index_fetch_begin(Relation rel) IndexFetchHeapData *hscan = palloc_object(IndexFetchHeapData); hscan->xs_base.rel = rel; + hscan->xs_base.rs = NULL; hscan->xs_cbuf = InvalidBuffer; hscan->xs_blk = InvalidBlockNumber; hscan->vmbuf = InvalidBuffer; @@ 
-96,6 +100,9 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) { IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; + if (scan->rs) + read_stream_reset(scan->rs); + /* deliberately don't drop VM buffer pin here */ if (BufferIsValid(hscan->xs_cbuf)) { @@ -112,6 +119,9 @@ heapam_index_fetch_end(IndexFetchTableData *scan) heapam_index_fetch_reset(scan); + if (scan->rs) + read_stream_end(scan->rs); + if (hscan->vmbuf != InvalidBuffer) { ReleaseBuffer(hscan->vmbuf); @@ -149,7 +159,10 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, * When using a read stream, the stream will already know which block * number comes next (though an assertion will verify a match below) */ - hscan->xs_cbuf = ReadBuffer(hscan->xs_base.rel, hscan->xs_blk); + if (scan->rs) + hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL); + else + hscan->xs_cbuf = ReadBuffer(hscan->xs_base.rel, hscan->xs_blk); /* * Prune page when it is pinned for the first time @@ -343,13 +356,24 @@ heapam_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* Initialize direction on first call */ if (batchqueue->direction == NoMovementScanDirection) batchqueue->direction = direction; + else if (unlikely(batchqueue->disabled && scan->xs_heapfetch->rs)) + { + /* + * Handle cancelling the use of the read stream for prefetching + */ + batch_reset_pos(&batchqueue->streamPos); + + read_stream_reset(scan->xs_heapfetch->rs); + scan->xs_heapfetch->rs = NULL; + } else if (unlikely(batchqueue->direction != direction)) { /* * Handle a change in the scan's direction. * * Release future batches properly, to make it look like the current - * batch is the only one we loaded. + * batch is the only one we loaded. Also reset the stream position, as + * if we are just starting the scan. 
*/ while (batchqueue->nextBatch > batchqueue->headBatch + 1) { @@ -364,10 +388,16 @@ heapam_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* * Remember the new direction, and make sure the scan is not marked as * "finished" (we might have already read the last batch, but now we - * need to start over). + * need to start over). Do this before resetting the stream - it + * should not invoke the callback until the first read, but it may + * seem a bit confusing otherwise. */ batchqueue->direction = direction; scan->finished = false; + batch_reset_pos(&batchqueue->streamPos); + + if (scan->xs_heapfetch->rs) + read_stream_reset(scan->xs_heapfetch->rs); } /* shortcut for the read position, for convenience */ @@ -409,6 +439,37 @@ heapam_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction) BatchIndexScan headBatch = INDEX_SCAN_BATCH(scan, batchqueue->headBatch); + /* + * XXX When advancing readPos, the streamPos may get behind as + * we're only advancing it when actually requesting heap + * blocks. But we may not do that often enough - e.g. IOS may + * not need to access all-visible heap blocks, so the + * read_next callback does not get invoked for a long time. + * It's possible the stream gets so far behind the position + * that it becomes invalid, as we already removed the batch. + * But that means we don't need any heap blocks until the + * current read position -- if we did, we would not be in this + * situation (or it's a sign of a bug, as those two places are + * expected to be in sync). So if the streamPos still points + * at the batch we're about to free, reset the position -- + * we'll set it to readPos in the read_next callback later on. + * + * XXX This can happen after the queue gets full, we "pause" + * the stream, and then reset it to continue. 
But I think that + just increases the probability of hitting the issue, it's + just more chance to not advance the streamPos, which + depends on when we try to fetch the first heap block after + calling read_stream_reset(). + * + * FIXME Simplify/clarify/shorten this comment. Can it + actually happen, if we never pull from the stream in IOS? + We probably don't look ahead for the first call. + */ + if (unlikely(batchqueue->streamPos.batch == batchqueue->headBatch)) + { + batch_reset_pos(&batchqueue->streamPos); + } + /* Free the head batch (except when it's markBatch) */ batch_free(scan, headBatch); @@ -427,8 +488,38 @@ heapam_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction) } /* - * Failed to advance the read position. Have indexbatch.c utility - * routine load another batch into our queue (next in this direction). + * We failed to advance, i.e. we ran out of currently loaded batches. + * So if we filled the queue, this is a good time to reset the stream + * (before we try loading the next batch). + */ + if (unlikely(batchqueue->reset)) + { + batchqueue->reset = false; + batchqueue->currentPrefetchBlock = InvalidBlockNumber; + + /* + * Need to reset the stream position, it might be too far behind. + * Ultimately we want to set it to readPos, but we can't do that + * yet - readPos still points at the old batch, so just reset it + * and we'll init it to readPos later in the callback. + */ + batch_reset_pos(&batchqueue->streamPos); + + if (scan->xs_heapfetch->rs) + read_stream_reset(scan->xs_heapfetch->rs); + } + + /* + * Failed to advance the read position, so try reading the next batch. + * If this fails, we're done - there's nothing more to load. + * + * Most of the batches should be loaded from read_stream_next_buffer, + * but we need to call batch_getnext here too, for two reasons. First, + * the read_stream only gets working after we try fetching the first + * heap tuple, so we need to load the initial batch (the head). 
+ Second, while most batches will be preloaded by the stream thanks + to prefetching, it's possible to set effective_io_concurrency=0, + and in that case all the batches get loaded from here. */ if (!batch_getnext(scan, direction)) { @@ -448,6 +539,198 @@ heapam_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction) return NULL; } +/* + * Controls when we cancel use of a read stream to do prefetching + */ +#define INDEX_SCAN_MIN_DISTANCE_NBATCHES 20 +#define INDEX_SCAN_MIN_TUPLE_DISTANCE 7 + +/* + * heapam_getnext_stream + * return the next block to pass to the read stream + * + * This assumes the "current" scan direction, requested by the caller. + * + * If the direction changes before consuming all blocks, we'll reset the stream + * and start from scratch. The scan direction change is handled elsewhere. + * Here we rely on having the correct value in batchqueue->direction. + * + * The position of the read_stream is stored in streamPos, which may be ahead of + * the current readPos (which is what got consumed by the scan). + * + * The streamPos can however also get behind readPos too, when some blocks are + * skipped and not returned to the read_stream. An example is an index scan on + * a correlated index, where many duplicate blocks are skipped, or an IOS where + * all-visible blocks are skipped. + * + * The initial batch is always loaded from batch_getnext_tid(). We don't + * get here until the first read_stream_next_buffer() call, when pulling the + * first heap tuple from the stream. After that, most batches should be loaded + * by this callback, driven by the read_stream look-ahead distance. However, + * with disabled prefetching (that is, with effective_io_concurrency=0), all + * batches will be loaded in batch_getnext_tid. + * + * It's possible we got here only fairly late in the scan, e.g. if many tuples + * got skipped in the index-only scan, etc. In this case just use the read + * position as a streamPos starting point. 
+ */ +static BlockNumber +heapam_getnext_stream(ReadStream *stream, void *callback_private_data, + void *per_buffer_data) +{ + IndexScanDesc scan = (IndexScanDesc) callback_private_data; + BatchQueue *batchqueue = scan->batchqueue; + BatchQueueItemPos *streamPos = &batchqueue->streamPos; + ScanDirection direction = batchqueue->direction; + + /* By now we should know the direction of the scan. */ + Assert(direction != NoMovementScanDirection); + + /* + * The read position (readPos) has to be valid. + * + * We initialize/advance it before even attempting to read the heap tuple, + * and it gets invalidated when we reach the end of the scan (but then we + * don't invoke the callback again). + * + * XXX This applies to the readPos. We'll use streamPos to determine which + * blocks to pass to the stream, and readPos may be used to initialize it. + */ + batch_assert_pos_valid(scan, &batchqueue->readPos); + + /* + * Try to advance the streamPos to the next item, and if that doesn't + * succeed (if there are no more items in loaded batches), try loading the + * next one. + * + * FIXME Unlike batch_getnext_tid, this can loop more than twice. If many + * blocks get skipped due to currentPrefetchBlock or all-visibility (per + * the "prefetch" callback), we get to load additional batches. In the + * worst case we hit the INDEX_SCAN_MAX_BATCHES limit and have to "pause" + * the stream. + */ + while (true) + { + bool advanced = false; + + /* + * If the stream position has not been initialized yet, set it to the + * current read position. This is the item the caller is trying to + * read, so it's what we should return to the stream. + */ + if (INDEX_SCAN_POS_INVALID(streamPos)) + { + *streamPos = batchqueue->readPos; + advanced = true; + } + else if (heap_batch_advance_pos(scan, streamPos, direction)) + { + advanced = true; + } + + /* + * FIXME Maybe check the streamPos is not behind readPos? 
+ * + * FIXME Actually, could streamPos get stale/lagging behind readPos, + * and if yes how much. Could it get so far behind to not be valid, + * pointing at a freed batch? In that case we can't even advance it, + * and we should just initialize it to readPos. We might do that + * anyway, I guess, just to save on "pointless" advances (it must + * agree with readPos, we can't allow "retroactively" changing the + * block sequence). + */ + + /* + * If we advanced the position, either return the block for the TID, + * or skip it (and then try advancing again). + * + * The block may be "skipped" for two reasons. First, the caller may + * define a "prefetch" callback that tells us to skip items (IOS does + * this to skip all-visible pages). Second, currentPrefetchBlock is + * used to skip duplicate block numbers (a sequence of TIDS for the + * same block). + */ + if (advanced) + { + BatchIndexScan streamBatch = INDEX_SCAN_BATCH(scan, streamPos->batch); + ItemPointer tid = &streamBatch->items[streamPos->item].heapTid; + + DEBUG_LOG("heapam_getnext_stream: item %d, TID (%u,%u)", + streamPos->item, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + + /* same block as before, don't need to read it */ + if (batchqueue->currentPrefetchBlock == ItemPointerGetBlockNumber(tid)) + { + DEBUG_LOG("heapam_getnext_stream: skip block (currentPrefetchBlock)"); + continue; + } + + batchqueue->currentPrefetchBlock = ItemPointerGetBlockNumber(tid); + + return batchqueue->currentPrefetchBlock; + } + + /* + * Couldn't advance the position, no more items in the loaded batches. + * Try loading the next batch - if that succeeds, try advancing again + * (this time the advance should work, but we may skip all the items). + * + * If we fail to load the next batch, we're done. + */ + if (!batch_getnext(scan, direction)) + break; + + /* + * Consider disabling prefetching when we can't keep a sufficiently + * large "index tuple distance" between readPos and streamPos. 
+ * + * Only consider doing this when we're not on the scan's initial + * batch, when readPos and streamPos share the same batch. + */ + if (!scan->finished && !batchqueue->prefetchingLockedIn) + { + int itemdiff; + + if (streamPos->batch <= INDEX_SCAN_MIN_DISTANCE_NBATCHES) + { + /* Too early to check if prefetching should be disabled */ + } + else if (batchqueue->readPos.batch == streamPos->batch) + { + BatchQueueItemPos *readPos = &batchqueue->readPos; + + if (ScanDirectionIsForward(direction)) + itemdiff = streamPos->item - readPos->item; + else + { + BatchIndexScan readBatch = + INDEX_SCAN_BATCH(scan, readPos->batch); + + itemdiff = (readPos->item - readBatch->firstItem) - + (streamPos->item - readBatch->firstItem); + } + + if (itemdiff < INDEX_SCAN_MIN_TUPLE_DISTANCE) + { + batchqueue->disabled = true; + return InvalidBlockNumber; + } + else + { + batchqueue->prefetchingLockedIn = true; + } + } + else + batchqueue->prefetchingLockedIn = true; + } + } + + /* no more items in this scan */ + return InvalidBlockNumber; +} + /* ---------------- * index_fetch_heap - get the scan's next heap tuple * @@ -3116,6 +3399,7 @@ static const TableAmRoutine heapam_methods = { .index_fetch_reset = heapam_index_fetch_reset, .index_fetch_end = heapam_index_fetch_end, .index_getnext_slot = heapam_index_getnext_slot, + .index_getnext_stream = heapam_getnext_stream, .index_fetch_tuple = heapam_index_fetch_tuple, .tuple_insert = heapam_tuple_insert, diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 39363eff3..0d2cfa605 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -467,7 +467,15 @@ index_restrpos(IndexScanDesc scan) CHECK_SCAN_PROCEDURE(amgetbatch); CHECK_SCAN_PROCEDURE(amposreset); - /* release resources (like buffer pins) from table accesses */ + /* + * release resources (like buffer pins) from table accesses + * + * XXX: Currently, the distance is always remembered across any + * 
read_stream_reset calls (to work around the scan->batchqueue->reset + behavior of resetting the stream to deal with running out of batches). + We probably _should_ be forgetting the distance when we reset the + stream here (through our table_index_fetch_reset call), though. + */ if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); diff --git a/src/backend/access/index/indexbatch.c b/src/backend/access/index/indexbatch.c index b6f24b379..29207276d 100644 --- a/src/backend/access/index/indexbatch.c +++ b/src/backend/access/index/indexbatch.c @@ -75,11 +75,16 @@ index_batch_init(IndexScanDesc scan) (!scan->xs_want_itup && IsMVCCSnapshot(scan->xs_snapshot) && RelationNeedsWAL(scan->indexRelation)); scan->finished = false; + scan->batchqueue->reset = false; + scan->batchqueue->prefetchingLockedIn = false; + scan->batchqueue->disabled = false; + scan->batchqueue->currentPrefetchBlock = InvalidBlockNumber; scan->batchqueue->direction = NoMovementScanDirection; /* positions in the queue of batches */ batch_reset_pos(&scan->batchqueue->readPos); batch_reset_pos(&scan->batchqueue->markPos); + batch_reset_pos(&scan->batchqueue->streamPos); scan->batchqueue->markBatch = NULL; scan->batchqueue->headBatch = 0; /* initial head batch */ @@ -123,7 +128,23 @@ batch_getnext(IndexScanDesc scan, ScanDirection direction) if (scan->finished) return false; - Assert(!INDEX_SCAN_BATCH_FULL(scan)); + /* + * If we already used the maximum number of batch slots available, it's + * pointless to try loading another one. This can happen for various + * reasons, e.g. for index-only scans on all-visible table, or skipping + * duplicate blocks on perfectly correlated indexes, etc. + * + * We could enlarge the array to allow more batches, but that's futile, we + * can always construct a case using more memory. Not only would it risk + * OOM, it'd also be inefficient because this happens early in the scan + * (so it'd interfere with LIMIT queries). 
+ */ + if (INDEX_SCAN_BATCH_FULL(scan)) + { + DEBUG_LOG("batch_getnext: ran out of space for batches"); + scan->batchqueue->reset = true; + return false; + } batch_debug_print_batches("batch_getnext / start", scan); @@ -148,6 +169,17 @@ batch_getnext(IndexScanDesc scan, ScanDirection direction) DEBUG_LOG("batch_getnext headBatch %d nextBatch %d batch %p", batchqueue->headBatch, batchqueue->nextBatch, batch); + + /* Delay initializing stream until reading from scan's second batch */ + if (priorbatch && !scan->xs_heapfetch->rs && !batchqueue->disabled && + !scan->xs_want_itup && /* XXX prefetching disabled for IoS, for + * now */ + enable_indexscan_prefetch) + scan->xs_heapfetch->rs = + read_stream_begin_relation(READ_STREAM_DEFAULT, NULL, + scan->heapRelation, MAIN_FORKNUM, + scan->heapRelation->rd_tableam->index_getnext_stream, + scan, 0); } else scan->finished = true; @@ -180,9 +212,12 @@ index_batch_reset(IndexScanDesc scan, bool complete) batch_assert_batches_valid(scan); batch_debug_print_batches("index_batch_reset", scan); Assert(scan->xs_heapfetch); + if (scan->xs_heapfetch->rs) + read_stream_reset(scan->xs_heapfetch->rs); /* reset the positions */ batch_reset_pos(&batchqueue->readPos); + batch_reset_pos(&batchqueue->streamPos); /* * With "complete" reset, make sure to also free the marked batch, either @@ -228,6 +263,8 @@ index_batch_reset(IndexScanDesc scan, bool complete) batchqueue->nextBatch = 0; /* initial batch is empty */ scan->finished = false; + batchqueue->reset = false; + batchqueue->currentPrefetchBlock = InvalidBlockNumber; batch_assert_batches_valid(scan); } @@ -291,9 +328,13 @@ index_batch_restore_pos(IndexScanDesc scan) { BatchQueue *batchqueue = scan->batchqueue; BatchQueueItemPos *markPos = &batchqueue->markPos; - BatchQueueItemPos *readPos = &batchqueue->readPos; BatchIndexScan markBatch = batchqueue->markBatch; + /* + * XXX Disable this optimization when I/O prefetching is in use, at least + * until the possible interactions with 
streamPos are fully understood. + */ +#if 0 if (readPos->batch == markPos->batch && readPos->batch == batchqueue->headBatch) { @@ -304,6 +345,7 @@ index_batch_restore_pos(IndexScanDesc scan) readPos->item = markPos->item; return; } +#endif /* * Call amposreset to let index AM know to invalidate any private state diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index a39cc793b..37a0e6a3f 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -144,6 +144,7 @@ int max_parallel_workers_per_gather = 2; bool enable_seqscan = true; bool enable_indexscan = true; +bool enable_indexscan_prefetch = true; bool enable_indexonlyscan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index f1b88d058..2a06279f5 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -99,6 +99,7 @@ struct ReadStream int16 forwarded_buffers; int16 pinned_buffers; int16 distance; + int16 distance_old; int16 initialized_buffers; int read_buffers_flags; bool sync_mode; /* using io_method=sync */ @@ -464,6 +465,7 @@ read_stream_look_ahead(ReadStream *stream) if (blocknum == InvalidBlockNumber) { /* End of stream. */ + stream->distance_old = stream->distance; stream->distance = 0; break; } @@ -862,6 +864,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) else { /* No more blocks, end of stream. */ + stream->distance_old = stream->distance; stream->distance = 0; stream->oldest_buffer_index = stream->next_buffer_index; stream->pinned_buffers = 0; @@ -1046,6 +1049,9 @@ read_stream_reset(ReadStream *stream) int16 index; Buffer buffer; + /* remember the old distance (if we reset before end of the stream) */ + stream->distance_old = Max(stream->distance, stream->distance_old); + /* Stop looking ahead. 
*/ stream->distance = 0; @@ -1078,8 +1084,12 @@ read_stream_reset(ReadStream *stream) Assert(stream->pinned_buffers == 0); Assert(stream->ios_in_progress == 0); - /* Start off assuming data is cached. */ - stream->distance = 1; + /* + * Restore the old distance, if we have one. Otherwise start assuming data + * is cached. + */ + stream->distance = Max(1, stream->distance_old); + stream->distance_old = 0; } /* diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 3b9d83490..3f264f1ce 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -882,6 +882,13 @@ boot_val => 'true', }, +{ name => 'enable_indexscan_prefetch', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables prefetching for index scans and index-only-scans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_indexscan_prefetch', + boot_val => 'true', +}, + { name => 'enable_material', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', short_desc => 'Enables the planner\'s use of materialization.', flags => 'GUC_EXPLAIN', diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index dc9e2255f..da50ae15f 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -412,6 +412,7 @@ #enable_incremental_sort = on #enable_indexscan = on #enable_indexonlyscan = on +#enable_indexscan_prefetch = on #enable_material = on #enable_memoize = on #enable_mergejoin = on diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 0411db832..a2a8c3afa 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -159,6 +159,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_incremental_sort | on enable_indexonlyscan | on enable_indexscan | on + 
enable_indexscan_prefetch | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -173,7 +174,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(25 rows) +(26 rows) -- There are always wait event descriptions for various types. InjectionPoint -- may be present or absent, depending on history since last postmaster start. -- 2.51.0