From 29b06356ecf9e4ee77bc1267071c43aec9274404 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Thu, 22 Jan 2026 13:07:13 -0500 Subject: [PATCH v13 09/19] Use ExecSetTupleBound hint during index scans. This gives index scans a way to avoid using a read stream during certain kinds of queries that are very unlikely to benefit from prefetching: queries whose plan involves a LIMIT node that consumes tuples from an index scan (or index-only scan) node. Testing has shown this to be particularly important with nested loop joins with a LIMIT on an inner index scan. This is typical of nested loop anti-joins and nested loop semi-joins. XXX This is still very much a WIP. Author: Peter Geoghegan Reviewed-By: Tomas Vondra --- src/include/access/relscan.h | 2 + src/include/nodes/execnodes.h | 4 ++ src/backend/access/heap/heapam_handler.c | 5 +++ src/backend/access/index/genam.c | 1 + src/backend/executor/execProcnode.c | 50 ++++++++++++++++++++++++ src/backend/executor/nodeIndexonlyscan.c | 10 +++++ src/backend/executor/nodeIndexscan.c | 13 ++++++ 7 files changed, 85 insertions(+) diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 14782b599..72d517487 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -396,6 +396,8 @@ typedef struct IndexScanDescData /* parallel index scan information, in shared memory */ struct ParallelIndexScanDescData *parallel_scan; + int64 tuples_needed; + /* * Counter to request early abort during get_actual_variable_range scans. 
* When nonzero, the scan will read at most this many leaf pages before diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 7c1b427fb..12dd533c3 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1697,6 +1697,7 @@ typedef struct * ScanDesc index scan descriptor * Instrument local index scan instrumentation * SharedInfo parallel worker instrumentation (no leader entry) + * TuplesNeeded tuple bound, see ExecSetTupleBound * * ReorderQueue tuples that need reordering due to re-check * ReachedEnd have we fetched all tuples from index already? @@ -1725,6 +1726,7 @@ typedef struct IndexScanState struct IndexScanDescData *iss_ScanDesc; IndexScanInstrumentation *iss_Instrument; SharedIndexScanInstrumentation *iss_SharedInfo; + int64 iss_TuplesNeeded; /* These are needed for re-checking ORDER BY expr ordering */ pairingheap *iss_ReorderQueue; @@ -1753,6 +1755,7 @@ typedef struct IndexScanState * ScanDesc index scan descriptor * Instrument local index scan instrumentation * SharedInfo parallel worker instrumentation (no leader entry) + * TuplesNeeded tuple bound, see ExecSetTupleBound * TableSlot slot for holding tuples fetched from the table * PscanLen size of parallel index-only scan descriptor * NameCStringAttNums attnums of name typed columns to pad to NAMEDATALEN @@ -1775,6 +1778,7 @@ typedef struct IndexOnlyScanState struct IndexScanDescData *ioss_ScanDesc; IndexScanInstrumentation *ioss_Instrument; SharedIndexScanInstrumentation *ioss_SharedInfo; + int64 ioss_TuplesNeeded; TupleTableSlot *ioss_TableSlot; Size ioss_PscanLen; AttrNumber *ioss_NameCStringAttNums; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 8d582e3d9..d2e84d51b 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -637,9 +637,14 @@ heapam_batch_getnext(IndexScanDesc scan, ScanDirection direction, * haven't done any heap fetches yet. 
We don't want to waste any * cycles on allocating a read stream until we have a demonstrated * need for perform heap fetches. + * + * Also avoid prefetching when the core executor passes the scan a + * tuples_needed hint that indicates that the scan is likely to end + * before long. */ if (!hscan->xs_read_stream && priorBatch && scan->MVCCScan && hscan->xs_blk != InvalidBlockNumber && /* for index-only scans */ + (scan->tuples_needed == -1 || scan->tuples_needed > 20) && enable_indexscan_prefetch) { Assert(!batchringbuf->prefetchPos.valid); diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index d50e3fa71..aa44a0ec2 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -126,6 +126,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_itupdesc = NULL; scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + scan->tuples_needed = -1; /* no limit */ scan->xs_read_extremal_only = 0; scan->batch_index_opaque_size = 0; diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 7e40b8525..aaed3506a 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -72,6 +72,7 @@ */ #include "postgres.h" +#include "access/relscan.h" #include "executor/executor.h" #include "executor/nodeAgg.h" #include "executor/nodeAppend.h" @@ -840,6 +841,12 @@ ExecShutdownNode_walker(PlanState *node, void *context) * Any negative tuples_needed value means "no limit", which should be the * default assumption when this is not called at all for a particular node. + * + * Note: for nodes like Sort, tuples_needed is a hard limit -- the node can + * stop after producing exactly that many tuples. For index scans, however, + * tuples_needed is only an approximation, because non-index quals may filter + * out some tuples. The actual number of tuples fetched from the index may + * need to exceed tuples_needed to satisfy the caller's requirements. 
+ * * Note: if this is called repeatedly on a plan tree, the exact same set * of nodes must be updated with the new limit each time; be careful that * only unchanging conditions are tested here. @@ -977,6 +984,49 @@ ExecSetTupleBound(int64 tuples_needed, PlanState *child_node) ExecSetTupleBound(tuples_needed, outerPlanState(child_node)); } + else if (IsA(child_node, IndexScanState)) + { + /* + * If it is an IndexScan, save the tuples_needed in the state so it + * can be propagated to the IndexScanDesc when the scan is started. + * + * Note: As with Sort, the index scan node is responsible for reacting + * properly to changes to this parameter. + */ + IndexScanState *isstate = (IndexScanState *) child_node; + + isstate->iss_TuplesNeeded = tuples_needed; + + /* If scan already started, update the IndexScanDesc too */ + if (isstate->iss_ScanDesc) + isstate->iss_ScanDesc->tuples_needed = tuples_needed; + } + else if (IsA(child_node, IndexOnlyScanState)) + { + /* Same comments as for IndexScan */ + IndexOnlyScanState *iosstate = (IndexOnlyScanState *) child_node; + + iosstate->ioss_TuplesNeeded = tuples_needed; + + /* If scan already started, update the IndexScanDesc too */ + if (iosstate->ioss_ScanDesc) + iosstate->ioss_ScanDesc->tuples_needed = tuples_needed; + } + else if (IsA(child_node, NestLoopState)) + { + /* + * For NestLoop joins where each outer tuple produces at most one + * output tuple, we can propagate the bound to the outer child + */ + NestLoopState *nlstate = (NestLoopState *) child_node; + JoinType jointype = nlstate->js.jointype; + + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || + nlstate->js.single_match) + { + ExecSetTupleBound(tuples_needed, outerPlanState(child_node)); + } + } /* * In principle we could descend through any plan node type that is diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 84bff60ce..f3fc63abe 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ 
b/src/backend/executor/nodeIndexonlyscan.c @@ -96,6 +96,9 @@ IndexOnlyNext(IndexOnlyScanState *node) node->ioss_ScanDesc = scandesc; Assert(node->ioss_ScanDesc->xs_want_itup); + /* Pass down any tuple bound */ + scandesc->tuples_needed = node->ioss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. @@ -528,6 +531,7 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) indexstate->ioss_RuntimeKeysReady = false; indexstate->ioss_RuntimeKeys = NULL; indexstate->ioss_NumRuntimeKeys = 0; + indexstate->ioss_TuplesNeeded = -1; /* * build the index scan keys from the index qualification @@ -704,6 +708,9 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, piscan); Assert(node->ioss_ScanDesc->xs_want_itup); + /* Pass down any tuple bound */ + node->ioss_ScanDesc->tuples_needed = node->ioss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. @@ -769,6 +776,9 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, piscan); Assert(node->ioss_ScanDesc->xs_want_itup); + /* Pass down any tuple bound */ + node->ioss_ScanDesc->tuples_needed = node->ioss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 67822947a..3e174cb65 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -115,6 +115,9 @@ IndexNext(IndexScanState *node) node->iss_ScanDesc = scandesc; + /* Pass down any tuple bound */ + scandesc->tuples_needed = node->iss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. 
@@ -211,6 +214,9 @@ IndexNextWithReorder(IndexScanState *node) node->iss_ScanDesc = scandesc; + /* Pass down any tuple bound */ + scandesc->tuples_needed = node->iss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. @@ -986,6 +992,7 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->iss_RuntimeKeysReady = false; indexstate->iss_RuntimeKeys = NULL; indexstate->iss_NumRuntimeKeys = 0; + indexstate->iss_TuplesNeeded = -1; /* * build the index scan keys from the index qualification @@ -1730,6 +1737,9 @@ ExecIndexScanInitializeDSM(IndexScanState *node, node->iss_NumOrderByKeys, piscan); + /* Pass down any tuple bound */ + node->iss_ScanDesc->tuples_needed = node->iss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. @@ -1794,6 +1804,9 @@ ExecIndexScanInitializeWorker(IndexScanState *node, node->iss_NumOrderByKeys, piscan); + /* Pass down any tuple bound */ + node->iss_ScanDesc->tuples_needed = node->iss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. -- 2.53.0