From 91bddbb2c246bce7de9191cb9eb8d9b35217ac9b Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Thu, 22 Jan 2026 13:07:13 -0500 Subject: [PATCH v16 06/17] Use ExecSetTupleBound hint during index scans. This gives index scans a way to avoid using a read stream during certain kinds of queries that are very unlikely to benefit from prefetching: queries whose plan involves a LIMIT node that consumes tuples from an index scan (or index-only scan) node. Testing has shown this to be particularly important with nested loop joins with a LIMIT on an inner index scan. This is typical of nested loop anti-joins and nested loop semi-joins. XXX This is still very much a WIP. Author: Peter Geoghegan Reviewed-By: Tomas Vondra --- src/include/access/relscan.h | 2 + src/include/nodes/execnodes.h | 4 ++ src/backend/access/heap/heapam_handler.c | 5 +++ src/backend/access/index/genam.c | 1 + src/backend/executor/execProcnode.c | 50 ++++++++++++++++++++++++ src/backend/executor/nodeIndexonlyscan.c | 10 +++++ src/backend/executor/nodeIndexscan.c | 13 ++++++ 7 files changed, 85 insertions(+) diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 3eb41e707..4ac218921 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -386,6 +386,8 @@ typedef struct IndexScanDescData /* parallel index scan information, in shared memory */ struct ParallelIndexScanDescData *parallel_scan; + int64 tuples_needed; + /* * Limit on distinct heap pages visited before giving up (0 = no limit). * Used by selfuncs.c to bound the cost of get_actual_variable_endpoint(). 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 593a77108..25fda210f 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1706,6 +1706,7 @@ typedef struct * ScanDesc index scan descriptor * Instrument local index scan instrumentation * SharedInfo parallel worker instrumentation (no leader entry) + * TuplesNeeded tuple bound, see ExecSetTupleBound * * ReorderQueue tuples that need reordering due to re-check * ReachedEnd have we fetched all tuples from index already? @@ -1734,6 +1735,7 @@ typedef struct IndexScanState struct IndexScanDescData *iss_ScanDesc; IndexScanInstrumentation *iss_Instrument; SharedIndexScanInstrumentation *iss_SharedInfo; + int64 iss_TuplesNeeded; /* These are needed for re-checking ORDER BY expr ordering */ pairingheap *iss_ReorderQueue; @@ -1762,6 +1764,7 @@ typedef struct IndexScanState * ScanDesc index scan descriptor * Instrument local index scan instrumentation * SharedInfo parallel worker instrumentation (no leader entry) + * TuplesNeeded tuple bound, see ExecSetTupleBound * TableSlot slot for holding tuples fetched from the table * PscanLen size of parallel index-only scan descriptor * NameCStringAttNums attnums of name typed columns to pad to NAMEDATALEN @@ -1784,6 +1787,7 @@ typedef struct IndexOnlyScanState struct IndexScanDescData *ioss_ScanDesc; IndexScanInstrumentation *ioss_Instrument; SharedIndexScanInstrumentation *ioss_SharedInfo; + int64 ioss_TuplesNeeded; TupleTableSlot *ioss_TableSlot; Size ioss_PscanLen; AttrNumber *ioss_NameCStringAttNums; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 28a144d1c..bbc02b296 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -626,9 +626,14 @@ heapam_batch_getnext(IndexScanDesc scan, ScanDirection direction, * haven't done any heap fetches yet. 
We don't want to waste any * cycles on allocating a read stream until we have a demonstrated * need for perform heap fetches. + * + * Also avoid prefetching when the core executor passes the scan a + * tuples_needed hint that indicates the scan is likely to end + * before long. */ if (!hscan->xs_read_stream && priorBatch && scan->MVCCScan && hscan->xs_blk != InvalidBlockNumber && /* for index-only scans */ + (scan->tuples_needed == -1 || scan->tuples_needed > 20) && enable_indexscan_prefetch) { Assert(!batchringbuf->prefetchPos.valid); diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index d0b68f38e..dede16cf3 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -126,6 +126,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_itupdesc = NULL; scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + scan->tuples_needed = -1; /* no limit */ scan->xs_visited_pages_limit = 0; scan->batch_index_opaque_size = 0; diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index d35976925..b548e32f7 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -72,6 +72,7 @@ */ #include "postgres.h" +#include "access/relscan.h" #include "executor/executor.h" #include "executor/instrument.h" #include "executor/nodeAgg.h" @@ -841,6 +842,12 @@ ExecShutdownNode_walker(PlanState *node, void *context) * Any negative tuples_needed value means "no limit", which should be the * default assumption when this is not called at all for a particular node. + * + * Note: for nodes like Sort, tuples_needed is a hard limit -- the node can + * stop after producing exactly that many tuples. For index scans, however, + * tuples_needed is only an approximation, because non-index quals may filter + * out some tuples. The actual number of tuples fetched from the index may + * need to exceed tuples_needed to satisfy the caller's requirements. 
+ * * Note: if this is called repeatedly on a plan tree, the exact same set * of nodes must be updated with the new limit each time; be careful that * only unchanging conditions are tested here. @@ -978,6 +985,49 @@ ExecSetTupleBound(int64 tuples_needed, PlanState *child_node) ExecSetTupleBound(tuples_needed, outerPlanState(child_node)); } + else if (IsA(child_node, IndexScanState)) + { + /* + * If it is an IndexScan, save the tuples_needed in the state so it + * can be propagated to the IndexScanDesc when the scan is started. + * + * Note: As with Sort, the index scan node is responsible for reacting + * properly to changes to this parameter. + */ + IndexScanState *isstate = (IndexScanState *) child_node; + + isstate->iss_TuplesNeeded = tuples_needed; + + /* If scan already started, update the IndexScanDesc too */ + if (isstate->iss_ScanDesc) + isstate->iss_ScanDesc->tuples_needed = tuples_needed; + } + else if (IsA(child_node, IndexOnlyScanState)) + { + /* Same comments as for IndexScan */ + IndexOnlyScanState *iosstate = (IndexOnlyScanState *) child_node; + + iosstate->ioss_TuplesNeeded = tuples_needed; + + /* If scan already started, update the IndexScanDesc too */ + if (iosstate->ioss_ScanDesc) + iosstate->ioss_ScanDesc->tuples_needed = tuples_needed; + } + else if (IsA(child_node, NestLoopState)) + { + /* + * For NestLoop joins where each outer tuple produces at most one + * output tuple, we can propagate the bound to the outer child + */ + NestLoopState *nlstate = (NestLoopState *) child_node; + JoinType jointype = nlstate->js.jointype; + + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || + nlstate->js.single_match) + { + ExecSetTupleBound(tuples_needed, outerPlanState(child_node)); + } + } /* * In principle we could descend through any plan node type that is diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index aeb4bbc3e..94ea92db3 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ 
b/src/backend/executor/nodeIndexonlyscan.c @@ -97,6 +97,9 @@ IndexOnlyNext(IndexOnlyScanState *node) node->ioss_ScanDesc = scandesc; Assert(node->ioss_ScanDesc->xs_want_itup); + /* Pass down any tuple bound */ + scandesc->tuples_needed = node->ioss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. @@ -531,6 +534,7 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) indexstate->ioss_RuntimeKeysReady = false; indexstate->ioss_RuntimeKeys = NULL; indexstate->ioss_NumRuntimeKeys = 0; + indexstate->ioss_TuplesNeeded = -1; /* * build the index scan keys from the index qualification @@ -707,6 +711,9 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, piscan); Assert(node->ioss_ScanDesc->xs_want_itup); + /* Pass down any tuple bound */ + node->ioss_ScanDesc->tuples_needed = node->ioss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. @@ -772,6 +779,9 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, piscan); Assert(node->ioss_ScanDesc->xs_want_itup); + /* Pass down any tuple bound */ + node->ioss_ScanDesc->tuples_needed = node->ioss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 10d85cfd9..468f9a0ce 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -117,6 +117,9 @@ IndexNext(IndexScanState *node) node->iss_ScanDesc = scandesc; + /* Pass down any tuple bound */ + scandesc->tuples_needed = node->iss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. 
@@ -213,6 +216,9 @@ IndexNextWithReorder(IndexScanState *node) node->iss_ScanDesc = scandesc; + /* Pass down any tuple bound */ + scandesc->tuples_needed = node->iss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. @@ -989,6 +995,7 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->iss_RuntimeKeysReady = false; indexstate->iss_RuntimeKeys = NULL; indexstate->iss_NumRuntimeKeys = 0; + indexstate->iss_TuplesNeeded = -1; /* * build the index scan keys from the index qualification @@ -1733,6 +1740,9 @@ ExecIndexScanInitializeDSM(IndexScanState *node, node->iss_NumOrderByKeys, piscan); + /* Pass down any tuple bound */ + node->iss_ScanDesc->tuples_needed = node->iss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. @@ -1797,6 +1807,9 @@ ExecIndexScanInitializeWorker(IndexScanState *node, node->iss_NumOrderByKeys, piscan); + /* Pass down any tuple bound */ + node->iss_ScanDesc->tuples_needed = node->iss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. -- 2.53.0