From eb12862a298faa878d9404c252c6675b2590bbbc Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 15 Nov 2025 14:03:58 -0500 Subject: [PATCH v10 05/11] Add prefetching to index scans using batch interfaces. This commit implements I/O prefetching for index scans, made possible by the recent addition of batching interfaces to both the table AM and index AM APIs. The amgetbatch index AM interface provides batches of matching TIDs (rather than one tuple at a time), each of which must be taken from index tuples that appear together on a single index page. This allows multiple batches to be held open simultaneously. Giving the table AM an explicit understanding of index AM concepts/index page boundaries allows it to consider all of the relevant costs and benefits. Prefetching is implemented using a prefetching position under the control of the table AM and core code. This is closely related to the scan position added by commit FIXME, which introduced the amgetbatch interface. A read stream callback advances the read stream as needed to provide sufficiently many heap block numbers to maintain the read stream's target prefetch distance. Testing has shown that index prefetching can make index scans much faster. Large range scans that return many tuples can be as much as 35x faster. An important goal of the amgetbatch design is to enable the table AM's read stream callback to advance its prefetch position using TIDs that appear on a leaf page that's ahead of the current scan position's leaf page. This is crucial with scans of indexes where each leaf page happens to have relatively few distinct heap blocks among its matching TIDs (as well as with scans with leaf pages that have relatively few total matching items). Index scans can have as many as 64 open batches, which testing has shown to be about the maximum number that can ever be useful. Batches are maintained in scan order using a simple ring buffer data structure. In rare cases where the scan exceeds this quasi-arbitrary limit of 64, the read stream is temporarily paused. Prefetching (via the read stream) is resumed only after the scan position advances beyond its current open batch and then frees it by calling amfreebatch and removing it from the scan's batch ring buffer. Testing has shown that it isn't very common for scans to hold open more than about 10 batches to get the desired I/O prefetch distance. 
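In outline, the mechanism looks like this (a simplified sketch only: the real callback is heapam_getnext_stream in heapam_handler.c, the sketch_* helpers are hypothetical stand-ins for the patch's position-advancement machinery, and yielding, pausing, direction changes, and index-only scan handling are all elided):

#include "postgres.h"

#include "access/relscan.h"
#include "storage/read_stream.h"

/*
 * Sketch of a batch-aware read stream callback: walk a private prefetch
 * position through the scan's ring buffer of batches, returning the heap
 * block number behind each TID that will need a table fetch.
 */
static BlockNumber
sketch_getnext_stream(ReadStream *stream, void *callback_private_data,
					  void *per_buffer_data)
{
	IndexScanDesc scan = (IndexScanDesc) callback_private_data;
	BatchRingBuffer *ringbuf = &scan->batchringbuf;
	BatchRingItemPos *prefetchPos = &ringbuf->prefetchPos;

	/* (re)start from the scan position if prefetching hasn't begun yet */
	if (!prefetchPos->valid)
		*prefetchPos = ringbuf->scanPos;

	/* walk forward through loaded batches, loading more as needed */
	while (sketch_advance_pos(scan, prefetchPos))	/* hypothetical helper */
	{
		BlockNumber block = sketch_pos_heap_block(scan, prefetchPos);

		/* never hand the stream the same block twice in succession */
		if (block != sketch_last_returned_block(scan))
			return block;
	}

	/* no more batches in this scan direction */
	return InvalidBlockNumber;
}

The stream itself is created with read_stream_begin_relation(READ_STREAM_DEFAULT, NULL, heapRelation, MAIN_FORKNUM, callback, scan, 0), exactly as in the patch, and consumed via read_stream_next_buffer from heapam_index_fetch_tuple.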
Author: Tomas Vondra Author: Peter Geoghegan Reviewed-By: Andres Freund Reviewed-By: Thomas Munro Discussion: https://postgr.es/m/cf85f46f-b02f-05b2-5248-5000b894ebab@enterprisedb.com --- src/include/access/heapam.h | 15 + src/include/access/relscan.h | 50 +- src/include/optimizer/cost.h | 1 + src/include/storage/read_stream.h | 1 + src/backend/access/heap/heapam_handler.c | 444 +++++++++++++++++- src/backend/access/index/indexam.c | 4 +- src/backend/access/index/indexbatch.c | 55 ++- src/backend/access/nbtree/README | 2 +- src/backend/optimizer/path/costsize.c | 1 + src/backend/storage/aio/read_stream.c | 12 + src/backend/utils/misc/guc_parameters.dat | 7 + src/backend/utils/misc/postgresql.conf.sample | 1 + doc/src/sgml/config.sgml | 21 + doc/src/sgml/indexam.sgml | 386 +++++++++++++-- doc/src/sgml/ref/create_table.sgml | 13 +- doc/src/sgml/tableam.sgml | 8 + src/test/regress/expected/sysviews.out | 3 +- 17 files changed, 959 insertions(+), 65 deletions(-) diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 344471f9c..fb8abf85c 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -123,6 +123,21 @@ typedef struct IndexFetchHeapData Buffer vmbuf; /* visibility map buffer */ int xs_vm_items; /* items to resolve during visibility checks */ + /* For batch index scans that use read stream for prefetching */ + ReadStream *xs_read_stream; + + /* + * The read stream is allocated at the beginning of the scan and reset on + * rescan or when the scan direction changes. The scan direction is saved + * each time a new tuple is requested. If the scan direction changes from + * one tuple to the next, the read stream releases all previously pinned + * buffers and resets the prefetch block. + */ + ScanDirection xs_read_stream_dir; /* index scan direction */ + BlockNumber xs_prefetch_block; /* last block returned to xs_read_stream */ + bool xs_yield_check; /* checked if prefetching should yield? */ + bool xs_paused; /* paused until next batch is read? */ + /* NB: if xs_cbuf or vmbuf are not InvalidBuffer, we hold a pin */ } IndexFetchHeapData; diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 0083de735..27d64a363 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -233,8 +233,14 @@ typedef struct IndexScanBatchData *IndexScanBatch; * Maximum number of batches (leaf pages) we can keep in memory. We need a * minimum of two, since we'll only consider releasing one batch when another * is read. + * + * The choice of 64 batches is arbitrary. It's about 1MB of data with 8KB + * pages (512kB for pages, and then a bit of overhead). We should not really + * need this many batches in most cases, though. The read stream looks ahead + * just enough to queue enough IOs, adjusting the distance (TIDs, but + * ultimately the number of future batches) to meet that. */ -#define INDEX_SCAN_MAX_BATCHES 2 +#define INDEX_SCAN_MAX_BATCHES 64 #define INDEX_SCAN_CACHE_BATCHES 2 /* @@ -246,12 +252,21 @@ typedef struct IndexScanBatchData *IndexScanBatch; * matches in. However, table AMs are free to fetch table tuples in whatever * order is most convenient/efficient -- provided that such reordering cannot * affect the order that table_index_getnext_slot later returns tuples in. + * + * This data structure also provides table AMs with a way to read ahead of the + * current read position by _multiple_ batches/index pages. The further out + * the table AM reads ahead like this, the further it can see into the future. 
+ * That way the table AM is able to reorder work as aggressively as desired. + * For example, index scans sometimes need to read ahead by as many as a few + * dozen amgetbatch batches in order to maintain an optimal I/O prefetch + * distance (distance for reading table blocks/fetching table tuples). */ typedef struct BatchRingBuffer { /* current positions in batches[] for scan */ BatchRingItemPos scanPos; /* scan's read position */ BatchRingItemPos markPos; /* mark/restore position */ + BatchRingItemPos prefetchPos; /* prefetching position */ IndexScanBatch markBatch; @@ -431,6 +446,39 @@ index_scan_batch_append(IndexScanDescData *scan, IndexScanBatch batch) ringbuf->nextBatch++; } +/* + * Compare two batch ring positions in the given scan direction. + * + * Returns negative if pos1 is behind pos2, 0 if equal, positive if pos1 is + * ahead of pos2. + */ +static inline int +index_scan_pos_cmp(BatchRingItemPos *pos1, BatchRingItemPos *pos2, + ScanDirection direction) +{ + int8 batchdiff = (int8) (pos1->batch - pos2->batch); + + if (batchdiff != 0) + return batchdiff; + + /* Same batch, compare items */ + if (ScanDirectionIsForward(direction)) + return pos1->item - pos2->item; + else + return pos2->item - pos1->item; +} + +/* + * Return the signed distance in batches between two positions. + * + * Positive means pos1 is ahead of pos2 by that many batches. + */ +static inline int8 +index_scan_pos_batch_distance(BatchRingItemPos *pos1, BatchRingItemPos *pos2) +{ + return (int8) (pos1->batch - pos2->batch); +} + /* * Advance position to its next item in the batch. * diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index f2fd5d315..419300a6b 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -52,6 +52,7 @@ extern PGDLLIMPORT int max_parallel_workers_per_gather; extern PGDLLIMPORT bool enable_seqscan; extern PGDLLIMPORT bool enable_indexscan; extern PGDLLIMPORT bool enable_indexonlyscan; +extern PGDLLIMPORT bool enable_indexscan_prefetch; extern PGDLLIMPORT bool enable_bitmapscan; extern PGDLLIMPORT bool enable_tidscan; extern PGDLLIMPORT bool enable_sort; diff --git a/src/include/storage/read_stream.h b/src/include/storage/read_stream.h index e3b6bb2f3..9c85170ab 100644 --- a/src/include/storage/read_stream.h +++ b/src/include/storage/read_stream.h @@ -102,6 +102,7 @@ extern ReadStream *read_stream_begin_smgr_relation(int flags, extern BlockNumber read_stream_pause(ReadStream *stream); extern void read_stream_resume(ReadStream *stream); extern BlockNumber read_stream_yield(ReadStream *stream); +extern bool read_stream_uses_fast_path(ReadStream *stream); extern void read_stream_reset(ReadStream *stream); extern void read_stream_end(ReadStream *stream); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 350d827ea..8ce749873 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -37,7 +37,9 @@ #include "commands/progress.h" #include "executor/executor.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" +#include "storage/aio.h" #include "storage/bufmgr.h" #include "storage/bufpage.h" #include "storage/lmgr.h" @@ -60,6 +62,9 @@ static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan); static bool BitmapHeapScanNextBlock(TableScanDesc scan, bool *recheck, uint64 *lossy_pages, uint64 *exact_pages); +static BlockNumber heapam_getnext_stream(ReadStream *stream, + void *callback_private_data, + void *per_buffer_data); /*
------------------------------------------------------------------------ @@ -111,6 +116,17 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) * so that rescans don't do an excessive number of VM lookups. */ hscan->xs_vm_items = 1; + + /* Reset read stream state */ + hscan->xs_read_stream_dir = NoMovementScanDirection; /* read_stream_reset + * needs this */ + hscan->xs_prefetch_block = InvalidBlockNumber; + hscan->xs_yield_check = false; + hscan->xs_paused = false; + + /* Reset read stream itself */ + if (hscan->xs_read_stream) + read_stream_reset(hscan->xs_read_stream); } static void @@ -120,6 +136,9 @@ heapam_index_fetch_end(IndexFetchTableData *scan) heapam_index_fetch_reset(scan); + if (hscan->xs_read_stream) + read_stream_end(hscan->xs_read_stream); + if (hscan->vmbuf != InvalidBuffer) ReleaseBuffer(hscan->vmbuf); @@ -150,7 +169,14 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, if (BufferIsValid(hscan->xs_cbuf)) ReleaseBuffer(hscan->xs_cbuf); - hscan->xs_cbuf = ReadBuffer(hscan->xs_base.rel, hscan->xs_blk); + /* + * When using a read stream, the stream will already know which block + * number comes next (though an assertion will verify a match below) + */ + if (hscan->xs_read_stream) + hscan->xs_cbuf = read_stream_next_buffer(hscan->xs_read_stream, NULL); + else + hscan->xs_cbuf = ReadBuffer(hscan->xs_base.rel, hscan->xs_blk); /* * Prune page when it is pinned for the first time @@ -208,6 +234,42 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, * (important for inner index scans of anti-joins and semi-joins), and the * need to not hold onto index leaf pages for too long. * + * Dropping leaf page pins early + * ----------------------------- + * + * In no event will the scan be allowed to hold onto more than one batch's + * leaf page pin at a time. The primary reason for this restriction is to + * avoid unintended interactions with the read stream, which has its own + * strategy for keeping the number of pins held by the backend under control. + * + * Once we've resolved visibility for all items in a batch, we can safely drop + * its leaf page pin. This is safe with respect to concurrent VACUUM because + * index vacuuming will block on acquiring a conflicting cleanup lock on the + * batch's index page due to our holding a pin on that same page. Copying the + * relevant visibility map data into our local cache suffices to prevent unsafe + * concurrent TID recycling: if any of these TIDs point to dead heap tuples, + * VACUUM cannot possibly return from ambulkdelete and mark the pointed-to + * heap pages as all-visible. VACUUM _can_ do so once we release the batch's + * pin, but that's okay; we'll be working off of cached visibility info that + * indicates that the dead TIDs are NOT all-visible. + * + * Note: We cannot drop the pin early when the scan uses a non-MVCC snapshot; + * we must delay it until all heap fetches for the loaded batch have taken + * place. This is why we don't support prefetching during such scans. See + * doc/src/sgml/indexam.sgml. + * + * Read stream agreement + * --------------------- + * + * heapam_batch_getnext_tid must reliably agree with heapam_getnext_stream + * about which heap blocks/TIDs will require a heap fetch (and which TIDs + * won't due to pointing to an all-visible heap page). Otherwise we risk + * allowing the read stream to return unexpected heap buffers/pages. + * + * Caching visibility information up front avoids that problem. 
If a VM bit + is concurrently set (or unset), it can't matter, since everybody will work + off of this immutable local cache. + * Note on Memory Ordering Effects * ------------------------------- * @@ -343,11 +405,13 @@ static IndexScanBatch heapam_batch_getnext(IndexScanDesc scan, ScanDirection direction, IndexScanBatch priorBatch, BatchRingItemPos *pos) { + IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan->xs_heapfetch; IndexScanBatch batch = NULL; BatchRingBuffer *batchringbuf PG_USED_FOR_ASSERTS_ONLY = &scan->batchringbuf; /* XXX: we should assert that a snapshot is pushed or registered */ Assert(TransactionIdIsValid(RecentXmin)); + Assert(direction == hscan->xs_read_stream_dir); if (!priorBatch) { @@ -372,6 +436,12 @@ heapam_batch_getnext(IndexScanDesc scan, ScanDirection direction, { /* * Next batch already loaded for us. + * + * This usually happens when the heapam_batch_getnext_tid caller finds + * that heapam_getnext_stream already loaded the next required batch. + * But there are also corner cases where it works the other way around + * (cases where heapam_getnext_stream's prefetchPos is slightly behind + * the scan's scanPos, and catches up here). */ batch = index_scan_batch(scan, pos->batch + 1); @@ -392,6 +462,7 @@ heapam_batch_getnext(IndexScanDesc scan, ScanDirection direction, * buffer (batches must stay in scan order). If it isn't then we should * have already returned some existing loaded batch earlier. */ + Assert(!hscan->xs_paused); Assert(!index_scan_batch_full(scan)); Assert(!priorBatch || (index_scan_batch_count(scan) > 0 && priorBatch->dir == direction && @@ -428,6 +499,41 @@ heapam_batch_getnext(IndexScanDesc scan, ScanDirection direction, ReleaseBuffer(batch->buf); batch->buf = InvalidBuffer; } + + /* + * Reset xs_yield_check, to allow heapam_getnext_stream to consider if + * we should yield on our newly acquired batch + */ + hscan->xs_yield_check = false; + + /* + * Delay initializing stream until reading from scan's second batch. + * This heuristic avoids wasting cycles on starting a read stream for + * very selective index scans. We can likely improve upon this, but + * it works well enough for now. + * + * Also avoid prefetching during scans where we're unable to drop each + * batch's buffer pin right away (non-MVCC snapshot scans). We are + * not prepared to sensibly limit the total number of buffer pins held + * (read stream handles all pin resource management for us, and knows + * nothing about pins held on index pages/within batches). + * + * Also delay creating a read stream during index-only scans that + * haven't done any heap fetches yet. We don't want to waste any + * cycles on allocating a read stream until we have a demonstrated + * need to perform heap fetches. + */ + if (!hscan->xs_read_stream && priorBatch && scan->MVCCScan && + hscan->xs_blk != InvalidBlockNumber && /* for index-only scans */ + io_method != IOMETHOD_SYNC && enable_indexscan_prefetch) + { + Assert(!batchringbuf->prefetchPos.valid); + + hscan->xs_read_stream = + read_stream_begin_relation(READ_STREAM_DEFAULT, NULL, + scan->heapRelation, MAIN_FORKNUM, + heapam_getnext_stream, scan, 0); + } } else { @@ -452,6 +558,43 @@ heapam_batch_getnext(IndexScanDesc scan, ScanDirection direction, return batch; } +/* + * Handle a change in index scan direction (at the tuple granularity). + * + * Resets the read stream, since we can't rely on scanPos continuing to agree + * with the blocks that the read stream already consumed using prefetchPos.
+ */ +static pg_noinline void +heapam_dirchange_readstream_reset(IndexFetchHeapData *hscan, + BatchRingBuffer *batchringbuf, + ScanDirection direction) +{ + /* Reset read stream state */ + batchringbuf->prefetchPos.valid = false; + hscan->xs_yield_check = false; + hscan->xs_paused = false; + hscan->xs_read_stream_dir = NoMovementScanDirection; /* see note below */ + + /* Reset read stream itself */ + if (hscan->xs_read_stream) + read_stream_reset(hscan->xs_read_stream); + + /* + * Finally, remember new scan direction. + * + * Note: we needed to set xs_read_stream_dir to NoMovementScanDirection + * momentarily to avoid spuriously prefetching more blocks from within the + * read stream callback. Once we return, the read stream can be used to + * fetch blocks in the opposite scan direction. + * + * Note: iff the scan _continues_ in this new direction, and actually + * steps off scanBatch to an earlier index page, heapam_batch_getnext will + * deal with it. But that might never happen; the scan might yet change + * direction again (or just end before returning more items). + */ + hscan->xs_read_stream_dir = direction; +} + /* ---------------- * heapam_batch_getnext_tid - get next TID from batch ring buffer * @@ -468,9 +611,16 @@ heapam_batch_getnext_tid(IndexScanDesc scan, IndexFetchHeapData *hscan, IndexScanBatch scanBatch = NULL; /* scan should only be paused when there's no free batch slots */ + Assert(!hscan->xs_paused || index_scan_batch_full(scan)); Assert(!scanPos->valid || batchringbuf->headBatch == scanPos->batch); Assert(scanPos->valid || index_scan_batch_count(scan) == 0); + /* Handle resetting the read stream when scan direction changes */ + if (hscan->xs_read_stream_dir == NoMovementScanDirection) + hscan->xs_read_stream_dir = direction; /* first call */ + else if (unlikely(hscan->xs_read_stream_dir != direction)) + heapam_dirchange_readstream_reset(hscan, batchringbuf, direction); + /* * Check if there's an existing loaded scanBatch for us to return the next * matching item's TID/index tuple from @@ -479,7 +629,7 @@ heapam_batch_getnext_tid(IndexScanDesc scan, IndexFetchHeapData *hscan, { /* * scanPos is valid, so scanBatch must already be loaded in batch ring - * buffer. We rely on that here. + * buffer. We rely on that here (can't do this with prefetchBatch). */ Assert(batchringbuf->headBatch == scanPos->batch); @@ -522,21 +672,309 @@ heapam_batch_getnext_tid(IndexScanDesc scan, IndexFetchHeapData *hscan, { IndexScanBatch headBatch = index_scan_batch(scan, batchringbuf->headBatch); + BatchRingItemPos *prefetchPos = &batchringbuf->prefetchPos; /* Also free obsolescent head batch (unless it is scan's markBatch) */ tableam_util_free_batch(scan, headBatch); + /* + * If we're about to release the batch that prefetchPos currently + * points to, just invalidate prefetchPos. We'll reinitialize it + * using scanPos if and when heapam_getnext_stream is next called. (We + * must avoid confusing a prefetchPos->batch that's actually before + * headBatch with one that's after nextBatch due to uint8 overflow; + * simplest way is to invalidate prefetchPos like this.) + */ + if (prefetchPos->valid && + prefetchPos->batch == batchringbuf->headBatch) + prefetchPos->valid = false; + /* Remove the batch from the ring buffer */ batchringbuf->headBatch++; + + if (hscan->xs_paused) + { + /* + * The scan's read stream was paused by heapam_getnext_stream due + * to exhausting all available free batch slots. We just freed up + * one such slot now, though. 
Resume the read stream to re-enable + * prefetching. + */ + Assert(!index_scan_batch_full(scan)); + read_stream_resume(hscan->xs_read_stream); + hscan->xs_paused = false; + } } /* In practice scanBatch will always be the ring buffer's headBatch */ Assert(batchringbuf->headBatch == scanPos->batch); - Assert(!hscan->xs_yielded && !hscan->xs_paused); + Assert(!hscan->xs_paused); return heapam_batch_return_tid(scan, scanBatch, scanPos); } +/* + * heapam_getnext_stream + * return the next block to pass to the read stream + * + * The initial batch is always loaded by heapam_batch_getnext_tid. We don't + * get called until the first read_stream_next_buffer() call, when a heap + * block is requested from the scan's stream for the first time. + * + * The read stream's position is stored in prefetchPos. After the first time + * we get called, it is typical for prefetchPos to consistently stay ahead of + * scanPos, the position used to track the next TID to be returned to the + * scan by heapam_batch_getnext_tid. However, that isn't a precondition. + * There is a strict postcondition, though: when we return we'll always leave + * scanPos <= prefetchPos (except in cases where we return InvalidBlockNumber). + */ +static BlockNumber +heapam_getnext_stream(ReadStream *stream, void *callback_private_data, + void *per_buffer_data) +{ + IndexScanDesc scan = (IndexScanDesc) callback_private_data; + IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan->xs_heapfetch; + BatchRingBuffer *batchringbuf = &scan->batchringbuf; + BatchRingItemPos *scanPos = &batchringbuf->scanPos; + BatchRingItemPos *prefetchPos = &batchringbuf->prefetchPos; + ScanDirection xs_read_stream_dir = hscan->xs_read_stream_dir; + IndexScanBatch prefetchBatch; + bool fromScanPos = false; + + Assert(!hscan->xs_paused); + + if (xs_read_stream_dir == NoMovementScanDirection) + { + /* + * Called from read_stream_reset or read_stream_end. Don't return + * additional heap blocks (at least until the scan direction is set to + * ForwardScanDirection/BackwardScanDirection once again). + * + * XXX We only need this handling to work around a bug in the + * read_stream_pause mechanism. + */ + return InvalidBlockNumber; + } + + /* + * scanPos must always be valid when prefetching takes place. There has + * to be at least one batch, loaded as our scanBatch. + */ + Assert(index_scan_batch_count(scan) > 0); + Assert(scanPos->valid); + + /* + * prefetchPos might not yet be valid. It might have also fallen behind + * scanPos. Deal with both. + * + * If prefetchPos has not been initialized yet, that typically indicates + * that this is the first call here for the entire scan. We initialize + * prefetchPos using the current scanPos, since the current scanBatch + * item's TID should have its block number returned by the read stream + * first. It's likely that prefetchPos will get ahead of scanPos before + * long, but that hasn't happened yet. + * + * It's also possible for prefetchPos to "fall behind" scanPos, at least + * in a trivial sense: if many adjacent items are returned that contain + * TIDs that point to the same heap block, scanPos can actually overtake + * prefetchPos (prefetchPos can't advance until the scan actually calls + * read_stream_next_buffer). Reinitializing from scanPos is enough to + * ensure that prefetchPos still fetches the next heap block that scanPos + * will require (prefetchPos can never fall behind "by more than one group + * of items that all point to the same heap block", so this is safe).
+ * + * Note: when heapam_batch_getnext_tid frees a batch that prefetchPos + * points to, it'll invalidate prefetchPos for us. This removes any + * danger of prefetchPos.batch falling so far behind scanPos.batch that it + * wraps around (and appears to be ahead of scanPos instead of behind it). + */ + if (!prefetchPos->valid || + index_scan_pos_cmp(prefetchPos, scanPos, xs_read_stream_dir) < 0) + { + hscan->xs_prefetch_block = InvalidBlockNumber; + *prefetchPos = *scanPos; + fromScanPos = true; + } + + /* + * Consider if we need to yield, if we haven't done so already for this + * prefetchBatch. When we yield, the scan will return at least one more + * scanPos item, often several more (particularly during scans of indexes + * with high physical/logical correlation, where batches contain groups of + * adjacent items whose TIDs all point to the same heap page). + * + * Prefetching can continue once the scan requests the buffer for the next + * enqueued heap block by calling read_stream_next_buffer. We weigh the + * need to keep the scan responsive (to avoid senselessly doing a large + * amount of work when we should just return a scanPos tuple immediately) + * against the need for the read stream to maintain its prefetch distance + * (if we pause too much it'll hurt the stream's ability to maintain a + * sufficient prefetch distance when I/O bound). + * + * Keeping the scan responsive is important during index-only scans that + * require only a few heap fetches. It also matters when allowing the + * scan to return just a few more items has the potential to allow the + * scan to end entirely (e.g., with an ORDER BY ... LIMIT N, or within a + * scan that feeds into a merge join). + */ + else if (!hscan->xs_yield_check) + { + int8 batchdistance = index_scan_pos_batch_distance(prefetchPos, + scanPos); + + /* + * We haven't yielded for this prefetchBatch yet. Should we? + * + * During I/O bound scans it's typical for the distance between + * scanPos and prefetchPos to grow rapidly at first, and then + * stabilize at some fixed level for the remainder of the scan. + */ + hscan->xs_yield_check = true; + if (batchdistance == 0) + { + /* + * Never actually yield when prefetchBatch is also scanBatch. + * Assume that the scan is either already as responsive as can + * reasonably be expected, or that it's still too early to tell. + */ + } + else if (batchdistance <= 2) + { + /* + * prefetchBatch is ahead of scanBatch, but only by one or two + * batches. If the read stream is currently using its fast path, + * yield. This assumes that the risk of calling amgetbatch too + * eagerly now outweighs the risk that yielding will noticeably + * impede the read stream's ability to get to an adequate prefetch + * distance. + */ + if (read_stream_uses_fast_path(stream)) + return read_stream_yield(stream); + + /* + * Don't yield yet -- at least not until batchdistance can grow to + * three or more batches. There's no point in our yielding if + * heapam_index_fetch_tuple will have to wait for the next block. + */ + } + else + { + /* + * prefetchBatch is ahead of scanBatch by three or more batches, + * so yield unconditionally. (The risk that yielding will impede + * the read stream's ability to maintain an adequate prefetch + * distance is relatively small.)
+ */ + return read_stream_yield(stream); + } + } + + prefetchBatch = index_scan_batch(scan, prefetchPos->batch); + for (;;) + { + BatchMatchingItem *item; + BlockNumber prefetch_block; + + if (fromScanPos) + { + /* + * Don't increment item when prefetchPos was just initialized + * using scanPos. We'll return the scanPos item's heap block + * directly on the first call here. In other words, we'll return + * the heap block for the TID passed to heapam_index_fetch_tuple + * at the point where it called read_stream_next_buffer for the + * first time during the scan. + */ + fromScanPos = false; + } + else if (!index_scan_pos_advance(xs_read_stream_dir, + prefetchBatch, prefetchPos)) + { + /* + * Ran out of items from prefetchBatch. Try to advance to the + * scan's next batch. + */ + if (unlikely(index_scan_batch_full(scan))) + { + /* + * Can't advance prefetchBatch because all available + * batchringbuf batch slots are currently in use. + * + * Deal with this by momentarily pausing the read stream. + * heapam_batch_getnext_tid will resume the read stream later, + * though only after scanPos has consumed all remaining items + * from scanBatch (at which point scanBatch will be freed, + * making its slot available for reuse by a later batch). + * + * In practice we hardly ever need to do this (it happens far + * less often than yielding). It would be possible to avoid + * the need to pause the read stream by dynamically allocating + * slots, but that would add complexity for no real benefit. + */ + hscan->xs_paused = true; + hscan->xs_yield_check = true; /* skip check on next call + * here */ + return read_stream_pause(stream); + } + + prefetchBatch = heapam_batch_getnext(scan, xs_read_stream_dir, + prefetchBatch, prefetchPos); + if (!prefetchBatch) + { + /* + * Failed to load next batch, so all the batches that the scan + * will ever require (barring a change in scan direction) are + * now loaded + */ + return InvalidBlockNumber; + } + + /* Position prefetchPos to the start of new prefetchBatch */ + index_scan_pos_nextbatch(xs_read_stream_dir, + prefetchBatch, prefetchPos); + } + + /* + * prefetchPos now points to the next item whose TID's heap block + * number might need to be prefetched + */ + Assert(index_scan_batch(scan, prefetchPos->batch) == prefetchBatch); + + /* scanPos is always <= prefetchPos when we return */ + Assert(index_scan_pos_cmp(scanPos, prefetchPos, xs_read_stream_dir) <= 0); + Assert(prefetchPos->item >= prefetchBatch->firstItem && + prefetchPos->item <= prefetchBatch->lastItem); + + item = &prefetchBatch->items[prefetchPos->item]; + prefetch_block = ItemPointerGetBlockNumber(&item->heapTid); + + if (scan->xs_want_itup) + { + /* make sure we have visibility info for the item */ + heapam_batch_resolve_visibility(scan, prefetchBatch, prefetchPos); + + /* item is known to be all-visible; prefetching isn't required */ + if (item->allVisible) + continue; + } + + if (prefetch_block == hscan->xs_prefetch_block) + { + /* + * prefetch_block matches the last prefetchPos item's TID's heap + * block number; we must not return the same prefetch_block twice + * in succession + */ + continue; + } + + /* We have a new heap block number to return to the read stream */ + hscan->xs_prefetch_block = prefetch_block; + return prefetch_block; + } + + return InvalidBlockNumber; +} + /* ---------------- * index_fetch_heap - get the scan's next heap tuple * diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index e35d6cf42..82e9ab35f 100644 ---
a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -379,7 +379,7 @@ index_rescan(IndexScanDesc scan, Assert(nkeys == scan->numberOfKeys); Assert(norderbys == scan->numberOfOrderBys); - /* Release resources (like buffer pins) from table accesses */ + /* reset read stream and release buffer pins from table accesses */ if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); @@ -468,7 +468,7 @@ index_restrpos(IndexScanDesc scan) CHECK_SCAN_PROCEDURE(amgetbatch); CHECK_SCAN_PROCEDURE(amposreset); - /* release resources (like buffer pins) from table accesses */ + /* reset read stream and release buffer pins from table accesses */ if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); diff --git a/src/backend/access/index/indexbatch.c b/src/backend/access/index/indexbatch.c index f44bf979b..fe72ca266 100644 --- a/src/backend/access/index/indexbatch.c +++ b/src/backend/access/index/indexbatch.c @@ -10,7 +10,10 @@ * approach enables efficient prefetching of table AM blocks during ordered * index scans. * - * The ring buffer loads batches in index key space order. + * The ring buffer loads batches in index key space order. This allows the + * table AM to maintain an adequate prefetch distance: its read stream + * callback is thereby able to request table blocks referenced by index pages + * that are well ahead of the current scan position's index page. * * There's three types of functions in this module: * @@ -28,6 +31,31 @@ * AMs that implement the amgetbatch interface. These manage batch * allocation, index page buffer lock release, and batch memory recycling. * + * These three layers coordinate without explicit coupling: the core lifecycle + * functions assume that table AMs use scanPos/scanBatch and prefetchPos/ + * prefetchBatch in a standardized way (see heapam_handler.c for the reference + * implementation), while table AMs assume that index AMs free and unlock + * batches according to the conventions established here. See indexam.sgml + * for the full specification of the amgetbatch/amfreebatch contract. + * + * The table AM fully controls the read stream as its own private state. + * When the scan direction changes, the table AM must immediately reset its + * read stream -- blocks already requested via prefetchPos will no longer + * match what scanPos needs to return. + * + * Crossing a batch boundary in a new scan direction is a separate process, + * handled here: table AMs are required to call tableam_util_batch_dirchange + * to leave the scan's batch ring buffer in a consistent state. The current + * implementation handles this by simply discarding most batches. The key + * invariant is that all loaded batches must be in a consistent scan direction + * order. (During cross-batch direction changes, the current scanBatch will + * have its IndexScanBatchData.dir flipped, but we have no provision for + * keeping all other loaded batches. It's not clear that it'd be useful to + * hold onto them; the scan direction is unlikely to change back. The scan + * batch direction invariant keeps things simple: it is convenient for most + * code that deals with batches to be able to assume that the common case + * where scan direction never changes is the only case.) 
+ * * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -63,6 +91,7 @@ index_batchscan_init(IndexScanDesc scan) scan->batchringbuf.scanPos.valid = false; scan->batchringbuf.markPos.valid = false; + scan->batchringbuf.prefetchPos.valid = false; scan->batchringbuf.markBatch = NULL; scan->batchringbuf.headBatch = 0; /* initial head batch */ @@ -91,6 +120,7 @@ index_batchscan_reset(IndexScanDesc scan, bool complete) Assert(scan->xs_heapfetch); batchringbuf->scanPos.valid = false; + batchringbuf->prefetchPos.valid = false; /* * When called with "complete" we must make sure that markBatch is freed, @@ -249,7 +279,13 @@ index_batchscan_mark_pos(IndexScanDesc scan) * the current scanBatch when needed. * * We just discard all batches (other than markBatch/restored scanBatch), - * except when markBatch is already the scan's current scanBatch. + * except when markBatch is already the scan's current scanBatch. We always + * invalidate prefetchPos. The read stream and related prefetching state are + * reset by table_index_fetch_reset(), called before this function. This + * approach keeps things simple for table AMs: most code that deals with + * batches is thereby able to assume that the common case where scan direction + * never changes is the only case (tableam_util_batch_dirchange takes a + * similar approach to handling a cross-batch change in scan direction). */ void index_batchscan_restore_pos(IndexScanDesc scan) @@ -260,6 +296,14 @@ index_batchscan_restore_pos(IndexScanDesc scan) IndexScanBatch markBatch = batchringbuf->markBatch; IndexScanBatch scanBatch = index_scan_batch(scan, scanPos->batch); + /* + * Restoring a mark always requires stopping prefetching. This is similar + * to the handling table AMs implement to deal with a tuple-level change + * in the scan's direction. The read stream must have already been reset + * by the caller (via table_index_fetch_reset). + */ + batchringbuf->prefetchPos.valid = false; + if (scanBatch == markBatch) { /* markBatch is already scanBatch; needn't change batchringbuf */ @@ -320,6 +364,13 @@ index_batchscan_restore_pos(IndexScanDesc scan) * point on batchringbuf will look as if our new scan direction had been used * from the start. This approach isn't particularly efficient, but it works * well enough for what ought to be a relatively rare occurrence. + * + * Caller must have reset the scan's read stream before calling here. That + * needs to happen as soon as the scan requests a tuple in whatever scan + * direction is opposite-to-current. We only deal with the case where the + * scan backs up by enough items to cross a batch boundary (when the scan + * resumes scanning in its original direction/ends before crossing a boundary, + * there isn't any need to call here). */ void tableam_util_batch_dirchange(IndexScanDesc scan) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index e75577a7e..3939391ae 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -186,7 +186,7 @@ interface. (See also, doc/src/sgml/indexam.sgml). Blocking VACUUM like this can be disruptive, so table AMs avoid it whenever possible. The heap table AM usually drops leaf page pins right away, though not during scans that use a non-MVCC snapshot. Index-only scans may also -retain pins in some cases. +retain pins in some cases, though prefetching requires dropping them. 
Opportunistic index tuple deletion performs the same page-level modifications as VACUUM, while only holding an exclusive lock. This is diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index c30d6e846..9ee20f4dc 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -145,6 +145,7 @@ int max_parallel_workers_per_gather = 2; bool enable_seqscan = true; bool enable_indexscan = true; bool enable_indexonlyscan = true; +bool enable_indexscan_prefetch = true; bool enable_bitmapscan = true; bool enable_tidscan = true; bool enable_sort = true; diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 0dbec2abb..7d619d217 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -1082,6 +1082,18 @@ read_stream_yield(ReadStream *stream) return InvalidBlockNumber; } +/* + * Does caller's read stream currently use the fast path? + * + * The read stream uses the fast path during all-cached scans (as well as + * during subsets of a scan where all buffers/pages returned are cached). + */ +bool +read_stream_uses_fast_path(ReadStream *stream) +{ + return stream->fast_path; +} + /* * Reset a read stream by releasing any queued up buffers, allowing the stream * to be used again for different blocks. This can be used to clear an diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 762b8efe6..9b878144c 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -891,6 +891,13 @@ boot_val => 'true', }, +{ name => 'enable_indexscan_prefetch', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables prefetching for index scans and index-only-scans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_indexscan_prefetch', + boot_val => 'true', +}, + { name => 'enable_material', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', short_desc => 'Enables the planner\'s use of materialization.', flags => 'GUC_EXPLAIN', diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 6e82c8e05..73ffa7f5c 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -416,6 +416,7 @@ #enable_incremental_sort = on #enable_indexscan = on #enable_indexonlyscan = on +#enable_indexscan_prefetch = on #enable_material = on #enable_memoize = on #enable_mergejoin = on diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 373429869..ab5129a03 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -5668,6 +5668,27 @@ ANY num_sync ( + enable_indexscan_prefetch (boolean) + + enable_indexscan_prefetch configuration parameter + + + + + Enables or disables prefetching for index-scan and index-only-scan + plan types. Prefetching can improve performance by reading table AM + pages ahead of when they are needed during index scans. The default + is on. + + + Index prefetching is always disabled when + is set to sync, + regardless of this setting. 
+ + + + enable_material (boolean) diff --git a/doc/src/sgml/indexam.sgml index f48da3185..69e70af40 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -167,10 +167,11 @@ typedef struct IndexAmRoutine ambeginscan_function ambeginscan; amrescan_function amrescan; amgettuple_function amgettuple; /* can be NULL */ + amgetbatch_function amgetbatch; /* can be NULL */ + amfreebatch_function amfreebatch; /* can be NULL */ amgetbitmap_function amgetbitmap; /* can be NULL */ amendscan_function amendscan; - ammarkpos_function ammarkpos; /* can be NULL */ - amrestrpos_function amrestrpos; /* can be NULL */ + amposreset_function amposreset; /* can be NULL */ /* interface functions to support parallel index scans */ amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ @@ -678,6 +679,10 @@ ambeginscan (Relation indexRelation, ambeginscan does little beyond making that call and perhaps acquiring locks; the interesting parts of index-scan startup are in amrescan. + Index access methods that use the amgetbatch interface + must also set scan->maxitemsbatch to the maximum number + of items that can appear in a single batch (typically derived from the + index page size, e.g., MaxIndexTuplesPerPage). @@ -749,6 +754,178 @@ amgettuple (IndexScanDesc scan, amgettuple field in its IndexAmRoutine struct must be set to NULL. + + + As of PostgreSQL version 19, position marking + and restoration of scans are no longer supported for the + amgettuple interface; only the + amgetbatch interface supports this feature. + + + + + +IndexScanBatch +amgetbatch (IndexScanDesc scan, + IndexScanBatch priorbatch, + ScanDirection direction); + + Return the next batch of index tuples in the given scan, moving in the + given direction (forward or backward in the index). Returns an instance of + IndexScanBatch with index tuples loaded, or + NULL if there are no more index tuples in the given + scan direction. + + + + The amgetbatch interface is an alternative to + amgettuple that returns matching index entries in batches + rather than one at a time. This enables the table access method to + optimize table block access patterns and perform I/O prefetching. + By returning all matching index entries from a single index page together, + the table AM can read ahead through the index and identify which table + blocks will be needed, allowing prefetching of table AM pages during + ordered index scans. + + + + The table AM passes the batch most recently returned by + amgetbatch for the given scan as + priorbatch (or NULL on the first call + for the scan). The index AM uses information from priorbatch + to determine which index page to read next. + + + + A batch returned by amgetbatch is associated with a + pinned index page containing at least one matching item/tuple. The buffer + pin can be held onto by the table AM as an interlock against concurrent TID + recycling by VACUUM. See + for details on buffer pin management during index scans. + + + + An IndexScanBatch that is returned by + amgetbatch is no longer managed by the access method. + It is up to the table AM caller to decide when it should be freed by + passing it to amfreebatch. Note also that + amgetbatch functions must never modify the + priorbatch parameter. The core + src/backend/access/nbtree/ and + src/backend/access/hash/ implementations provide + reference examples of the amgetbatch interface.
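In outline, a table AM consumes batches as follows (a hypothetical sketch, not the heap AM's actual code: the batch ring buffer, pin management, error handling, and prefetching are all elided, and calling through rd_indam directly is for illustration only):

#include "postgres.h"

#include "access/genam.h"
#include "access/relscan.h"
#include "utils/rel.h"

/* Sketch: drive a forward scan one batch at a time */
static void
sketch_consume_batches(IndexScanDesc scan)
{
	IndexScanBatch batch = NULL;

	for (;;)
	{
		IndexScanBatch next;

		/* priorbatch is NULL on the first call for the scan */
		next = scan->indexRelation->rd_indam->amgetbatch(scan, batch,
														 ForwardScanDirection);
		if (batch != NULL)
			tableam_util_free_batch(scan, batch);	/* prior batch done */
		if (next == NULL)
			break;				/* no further matching index pages */

		for (int i = next->firstItem; i <= next->lastItem; i++)
		{
			ItemPointer tid = &next->items[i].heapTid;

			/* fetch the table tuple for tid, test visibility, return it */
			(void) tid;
		}

		batch = next;
	}
}

Real table AMs keep multiple batches open at once (up to INDEX_SCAN_MAX_BATCHES) precisely so that the read stream callback can look ahead of the batch currently being returned to the executor.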
+ + + + The same caveats described for amgettuple apply here + too: an entry in the returned batch means only that the index contains + an entry that matches the scan keys, not that the tuple necessarily still + exists in the heap or will pass the caller's snapshot test. + + + + Index access methods using amgetbatch must set + scan->xs_recheck to indicate whether rechecking of + scan keys is required, in the same way as amgettuple + does. However, scan->xs_recheck must be set consistently + for an entire scan rather than varying on a per-tuple basis. This is a key + difference from amgettuple, which can set + scan->xs_recheck independently for each tuple it returns. + Index access methods that require granular control over + scan->xs_recheck must use the amgettuple + interface instead of amgetbatch. + + + + Similarly, the amgetbatch interface does not support + index-only scans that return data in the form of a + HeapTuple pointer. Index-only scans work by + copying IndexTuple records from index pages into a + local buffer associated with each batch. xs_itupdesc + works in the same way as already described for amgettuple. + The access method must not set the scan->xs_itup or + scan->xs_hitup fields itself. + + + + The index access method must provide either amgettuple + or amgetbatch, but not both. + When the access method provides amgetbatch, it must + also provide amfreebatch. + + + + The amgetbatch function need only be provided if the + access method supports plain index scans. If it doesn't, + the amgetbatch field in its + IndexAmRoutine struct must be set to NULL. + + + + +void +amfreebatch (IndexScanDesc scan, + IndexScanBatch batch); + + Frees a batch returned by the amgetbatch callback. + The batch argument is associated with an index page, + which will never be locked or pinned when amfreebatch + is called. + + + + amfreebatch frees memory and resources associated with + the batch. It must always release the caller's batch last, by passing it + as an argument to indexam_util_batch_release. + + + + This function is called exclusively by table access methods (via the + tableam_util_free_batch wrapper function) to indicate + that processing of the batch is complete; it should never be called within + the index access method itself. The table AM is responsible for releasing + the batch's buffer pin before calling amfreebatch. + + + + The index AM has the option of setting LP_DEAD bits in + the index page to mark dead tuples. While this is optional, implementing + it is recommended for performance, as it allows future scans to skip + known-dead index entries. Both core index access methods that currently + support amgetbatch (B-tree and hash) implement + LP_DEAD marking, though third-party index access methods + are free to choose whether to implement this feature. + The table AM may call + tableam_util_kill_scanpositem to mark dead items as + the scan progresses. If the batch contains any such dead items, the batch's + killedItems array will have been sorted and + deduplicated before amfreebatch is called, with item + offsets appearing in ascending order (that is, in index page order, which + is also batch order) and no offset appearing more than once. This sorting + makes it unnecessary for the table AM to call + tableam_util_kill_scanpositem in any particular order. + (Index access methods using amgettuple instead rely on + the kill_prior_tuple mechanism to mark dead + tuples; the src/backend/access/gist/ implementation + provides a reference example.)
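For illustration only, an amfreebatch implementation might take roughly this shape (a hypothetical sketch: the blkno and numKilled fields, the exact indexam_util_batch_release signature, and the locking details are assumptions; only the sorted/deduplicated killedItems contract and the release-batch-last requirement come from this interface):

#include "postgres.h"

#include "access/genam.h"
#include "access/relscan.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"

static void
sketchfreebatch(IndexScanDesc scan, IndexScanBatch batch)
{
	/* hypothetical: batch->numKilled counts entries in killedItems */
	if (batch->numKilled > 0)
	{
		Buffer		buf = ReadBuffer(scan->indexRelation, batch->blkno);
		Page		page;

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		page = BufferGetPage(buf);

		/*
		 * Offsets arrive sorted, deduplicated, in ascending page order.  A
		 * real implementation (like nbtree's) must first verify that the
		 * page hasn't changed since the batch was read (e.g., via its LSN);
		 * that check is omitted here.
		 */
		for (int i = 0; i < batch->numKilled; i++)
			ItemIdMarkDead(PageGetItemId(page, batch->killedItems[i]));

		MarkBufferDirtyHint(buf, true);
		UnlockReleaseBuffer(buf);
	}

	/* free any AM-private resources here, then release the batch last */
	indexam_util_batch_release(scan, batch);
}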
+ + + + The index AM may choose to retain its own buffer pins when this serves an + internal purpose (for example, maintaining a descent stack of pinned index + pages for reuse across amgetbatch calls). However, + any scheme that retains buffer pins managed by the index AM must be sure to + free the pins at an opportune point (for example when amrescan + and/or amendscan are called). It must also keep the + number of retained pins fixed and small, to avoid exhausting the backend's + buffer pin limit. + + + + The amfreebatch function need only be provided if the + access method provides amgetbatch. Otherwise it has to + remain set to NULL. + @@ -768,8 +945,8 @@ amgetbitmap (IndexScanDesc scan, itself, and therefore callers recheck both the scan conditions and the partial index predicate (if any) for recheckable tuples. That might not always be true, however. - amgetbitmap and - amgettuple cannot be used in the same index scan; there + Only one of amgetbitmap, amgettuple, + or amgetbatch can be used in any given index scan; there are other restrictions too when using amgetbitmap, as explained in . @@ -795,32 +972,52 @@ amendscan (IndexScanDesc scan); void -ammarkpos (IndexScanDesc scan); +amposreset (IndexScanDesc scan, + IndexScanBatch batch); - Mark current scan position. The access method need only support one - remembered scan position per scan. + Notify the index AM that the table AM is about to change the scan's + logical position in a way that requires the index AM to reset any state + that independently tracks the scan's progress. For example, nbtree must + reset the array keys used by ScalarArrayOpExpr qual + evaluation when the scan position changes. This callback is invoked when + the table AM is about to process a batch in a different direction than + was used when the batch was originally returned by + amgetbatch, and also when a marked scan position is + about to be restored. - The ammarkpos function need only be provided if the access - method supports ordered scans. If it doesn't, - the ammarkpos field in its IndexAmRoutine - struct may be set to NULL. + When amposreset is called due to a cross-batch + direction change, the core system will have already flipped the batch's + dir field to reflect the new scan direction + before making the call. The index AM should use this updated direction + when resetting any state that depends on knowing which way the scan is + proceeding. When called to restore a marked position, the batch's + dir is not modified; it retains the direction + from when the batch was originally returned. In both cases, the batch + passed to amposreset is the batch that will be used + to continue the scan. - -void -amrestrpos (IndexScanDesc scan); - - Restore the scan to the most recently marked position. + Index access methods that have private state which must be reset when the + scan position changes must provide an amposreset + implementation. Index AMs with no such state may set + amposreset to NULL. + The amposreset function can only be provided when the + access method supports ordered scans through the amgetbatch + interface. - The amrestrpos function need only be provided if the access - method supports ordered scans. If it doesn't, - the amrestrpos field in its IndexAmRoutine - struct may be set to NULL. + All index AMs that provide the amgetbatch interface + and set amcanbackward to true need to be + prepared for scans that request a batch in the opposite scan direction to + the one initially used to return the batch in question. 
Likewise, mark and + restore functionality is supported by all index access methods that provide + the amgetbatch interface. It is up to index AMs to + adequately account for how either case might invalidate their private scan + state. @@ -975,6 +1172,8 @@ amtranslatecmptype (CompareType cmptype, Oid opfamily, Oid opcintype); Access methods that always return entries in the natural ordering of their data (such as btree) should set amcanorder to true. + Both amgettuple and amgetbatch + scans support this capability. Currently, such access methods must use btree-compatible strategy numbers for their equality and ordering operators. @@ -987,41 +1186,49 @@ amtranslatecmptype (CompareType cmptype, Oid opfamily, Oid opcintype); an order satisfying ORDER BY index_key operator constant. Scan modifiers of that form can be passed to amrescan as described - previously. + previously. Note that amgetbatch scans do not + support ordering operators. - The amgettuple function has a direction argument, + The amgettuple and amgetbatch + functions have a direction argument, which can be either ForwardScanDirection (the normal case) or BackwardScanDirection. If the first call after amrescan specifies BackwardScanDirection, then the set of matching index entries is to be scanned back-to-front rather than in - the normal front-to-back direction, so amgettuple must return - the last matching tuple in the index, rather than the first one as it - normally would. (This will only occur for access - methods that set amcanorder to true.) After the - first call, amgettuple must be prepared to advance the scan in + the normal front-to-back direction. In this case, + amgettuple must return the last matching tuple in the + index, rather than the first one as it normally would. Similarly, + amgetbatch must return the last matching batch of items + when either the first call after amrescan specifies + BackwardScanDirection, or a subsequent call has + NULL as its priorbatch argument + (indicating a backward scan restart). (This backward-scan behavior will + only occur for access methods that set amcanorder + to true.) After the first call, both amgettuple and + amgetbatch must be prepared to advance the scan in either direction from the most recently returned entry. (But if amcanbackward is false, all subsequent calls will have the same direction as the first one.) - Access methods that support ordered scans must support marking a - position in a scan and later returning to the marked position. The same - position might be restored multiple times. However, only one position need - be remembered per scan; a new ammarkpos call overrides the - previously marked position. An access method that does not support ordered - scans need not provide ammarkpos and amrestrpos - functions in IndexAmRoutine; set those pointers to NULL - instead. + Access methods using the amgetbatch interface + support marking a position in a scan and later returning to + the marked position. When a batch is processed in a different direction + than it was originally fetched, or when a marked position is restored, the + index AM is notified via the amposreset callback (if + provided) so it can reset any private state that independently tracks the + scan's progress (such as array key state). See the + amposreset function description for details. 
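To make the contract concrete, a minimal amposreset might look as follows (a hypothetical sketch; the opaque struct and its field are invented for illustration, while nbtree's real implementation resets its ScalarArrayOpExpr array key state at this point):

#include "postgres.h"

#include "access/genam.h"
#include "access/relscan.h"

/* hypothetical AM-private scan state */
typedef struct SketchScanOpaqueData
{
	bool		arrayKeysValid;	/* array keys independently track progress */
} SketchScanOpaqueData;

static void
sketchposreset(IndexScanDesc scan, IndexScanBatch batch)
{
	SketchScanOpaqueData *so = (SketchScanOpaqueData *) scan->opaque;

	/*
	 * On a cross-batch direction change, batch->dir has already been
	 * flipped to the new direction; on a mark/restore it is unchanged.
	 * Either way, invalidate private state so that it is re-derived from
	 * the batch the scan is about to continue from.
	 */
	so->arrayKeysValid = false;
}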
- Both the scan position and the mark position (if any) must be maintained + The scan position (if any) must be maintained by the table AM and index AM consistently in the face of concurrent insertions or deletions in the index. It is OK if a freshly-inserted entry is not returned by a scan that would have found the entry if it had existed when the scan started, or for @@ -1044,12 +1251,14 @@ amtranslatecmptype (CompareType cmptype, Oid opfamily, Oid opcintype); - Instead of using amgettuple, an index scan can be done with - amgetbitmap to fetch all tuples in one call. This can be - noticeably more efficient than amgettuple because it allows - avoiding lock/unlock cycles within the access method. In principle - amgetbitmap should have the same effects as repeated - amgettuple calls, but we impose several restrictions to + Instead of using amgettuple or + amgetbatch, an index scan can be done with + amgetbitmap to fetch all tuples in one call. This can + be noticeably more efficient than with an ordered scan + because it allows efficient sequential access to table AM pages containing + matches. In principle amgetbitmap should have the + same effects as repeated amgettuple or + amgetbatch calls, but we impose several restrictions to simplify matters. First of all, amgetbitmap returns all tuples at once and marking or restoring scan positions isn't supported. Secondly, the tuples are returned in a bitmap which doesn't @@ -1066,10 +1275,62 @@ amtranslatecmptype (CompareType cmptype, Oid opfamily, Oid opcintype); Note that it is permitted for an access method to implement only - amgetbitmap and not amgettuple, or vice versa, - if its internal implementation is unsuited to one API or the other. + amgetbitmap and not amgettuple/amgetbatch, + or vice versa, if its internal implementation is unsuited to one API or the other. + + Table AM Considerations for Batch Scanning + + + This section is primarily relevant to + table access method authors. + When an index scan uses the amgetbatch interface, + the table AM is responsible for managing position state within + scan->batchringbuf and for controlling when + buffer pins on index pages are released. + + + + The scan->batchringbuf.scanPos field tracks + the current read position within the ring buffer of batches. The table AM + must advance scanPos as tuples are returned by + table_index_getnext_slot. The core code may also + modify this field during operations such as mark/restore. + + + + The scan->batchringbuf.prefetchPos field + tracks the position for I/O prefetching. It is initialized from + scanPos within the table AM's read stream + callback, and then advanced by that callback, allowing the table AM to + prefetch table blocks pointed to by items that are well ahead of the + current scan position. Initially + prefetchPos starts at + scanPos, but as the read stream ramps up it can + get far ahead — spanning multiple index pages if necessary to + maintain an optimal I/O prefetch distance for table block reads. A major + goal of the amgetbatch interface is to allow the + table AM to prefetch without being limited to items from the current + scanPos index leaf page. + + + + Both scanPos and + prefetchPos are controlled by the table AM and + core code; index access methods should not access or manipulate these + fields. See the src/backend/access/heap/ + implementation for a reference example. + + + + Buffer pins on index pages returned by amgetbatch are + managed by the table AM. See the amgetbatch and + amfreebatch descriptions in + for details.
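The pause/resume handshake between the read stream callback and the tuple-returning path can be condensed to the following sketch of the logic this patch adds to heapam_getnext_stream and heapam_batch_getnext_tid:

	/* In the read stream callback, when every ring buffer slot is in use: */
	if (index_scan_batch_full(scan))
	{
		hscan->xs_paused = true;			/* remember that we stopped early */
		return read_stream_pause(stream);	/* stop look-ahead, keep the queue */
	}

	/* Later, in the tuple-returning path, once the oldest batch is freed: */
	if (hscan->xs_paused)
	{
		read_stream_resume(hscan->xs_read_stream);	/* prefetching restarts */
		hscan->xs_paused = false;
	}

This is why INDEX_SCAN_MAX_BATCHES can stay a fixed, small constant: hitting the limit merely pauses prefetching until the scan position catches up and frees a slot, rather than requiring dynamic slot allocation.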
+ + + + @@ -1123,11 +1384,13 @@ amtranslatecmptype (CompareType cmptype, Oid opfamily, Oid opcintype); - An index scan must maintain a pin - on the index page holding the item last returned by - amgettuple, and ambulkdelete cannot delete - entries from pages that are pinned by other backends. The need - for this rule is explained below. + For amgettuple scans, the index access method must + maintain a pin on the index page holding the item last returned, and + ambulkdelete cannot delete entries from pages that + are pinned by other backends. For amgetbatch scans, + the table access method controls when pins are dropped (see + ). The need for this rule is + explained below. @@ -1173,6 +1436,31 @@ amtranslatecmptype (CompareType cmptype, Oid opfamily, Oid opcintype); it is only safe to use such scans with MVCC-compliant snapshots. + + The amgetbatch interface provides a different approach: + the table access method receives batches of TIDs and controls when index + page pins are dropped. Because amgetbatch reads + multiple index leaf pages ahead to facilitate I/O prefetching of table + blocks, it cannot practically hold pins on all those pages simultaneously. + Therefore, like amgetbitmap, I/O prefetching with + amgetbatch is only possible when an MVCC-compliant + snapshot is in use. In practice, the heap table AM (and any table AM + with similar concurrency rules) usually drops pins eagerly for MVCC + snapshot scans, but retains pins for non-MVCC snapshot scans. Index-only + scans may retain pins in some cases, while plain index scans that use an + MVCC snapshot always drop their pins eagerly. Index access methods that + implement amgetbatch do not control when pins are + dropped; that decision is delegated to the table AM. + + + + Index access methods that use amgettuple must implement + the pin-holding behavior themselves. Such index AMs are expected to hold + onto the leaf page buffer pin for non-MVCC snapshot scans, replicating + the behavior that the heap table AM would use with + amgetbatch. + + When the ampredlocks flag is not set, any scan using that index access method within a serializable transaction will acquire a diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index 77c5a763d..55b7222e9 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -1152,12 +1152,13 @@ WITH ( MODULUS numeric_literal, REM - The access method must support amgettuple (see ); at present this means GIN - cannot be used. Although it's allowed, there is little point in using - B-tree or hash indexes with an exclusion constraint, because this - does nothing that an ordinary unique constraint doesn't do better. - So in practice the access method will always be GiST or + The access method must support either amgettuple + or amgetbatch (see ); at + present this means GIN cannot be used. Although + it's allowed, there is little point in using B-tree or hash indexes + with an exclusion constraint, because this does nothing that an + ordinary unique constraint doesn't do better. So in practice the + access method will always be GiST or SP-GiST. diff --git a/doc/src/sgml/tableam.sgml b/doc/src/sgml/tableam.sgml index 9ccf5b739..8e70a6196 100644 --- a/doc/src/sgml/tableam.sgml +++ b/doc/src/sgml/tableam.sgml @@ -129,6 +129,14 @@ my_tableam_handler(PG_FUNCTION_ARGS) optional), the block number needs to provide locality. + + Table access methods can support ordered index scans using the + amgetbatch interface. 
See also + for details on interfacing with + amgetbatch index access methods, and managing the + scan's position. + + For crash safety, an AM can use postgres' WAL, or a custom implementation. diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 3dd63fd88..b5628736b 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -159,6 +159,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_incremental_sort | on enable_indexonlyscan | on enable_indexscan | on + enable_indexscan_prefetch | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -173,7 +174,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(25 rows) +(26 rows) -- There are always wait event descriptions for various types. InjectionPoint -- may be present or absent, depending on history since last postmaster start. -- 2.51.0