From b97e57174f45cfcde335684529b68d643f7506db Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Thu, 22 Jan 2026 13:07:13 -0500 Subject: [PATCH v11 08/12] Use ExecSetTupleBound hint during index scans. This gives index scans a way to avoid using a read stream during certain kinds of queries that are very unlikely to benefit from prefetching: queries whose plan involves a LIMIT node that consumes tuples from an index scan (or index-only scan) node. Testing has shown this to be particularly important with nested loop joins with a LIMIT on an inner index scan. This is typical of nested loop anti-joins, and nested loop semi-joins. XXX This is still very much a WIP. Author: Peter Geoghegan Reviewed-By: Tomas Vondra --- src/include/access/relscan.h | 2 ++ src/include/nodes/execnodes.h | 4 +++ src/backend/access/heap/heapam_handler.c | 5 +++ src/backend/access/index/genam.c | 1 + src/backend/executor/execProcnode.c | 44 ++++++++++++++++++++++++ src/backend/executor/nodeIndexonlyscan.c | 10 ++++++ src/backend/executor/nodeIndexscan.c | 13 +++++++ 7 files changed, 79 insertions(+) diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 83c1c98fe..9a63a0c5b 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -386,6 +386,8 @@ typedef struct IndexScanDescData /* parallel index scan information, in shared memory */ struct ParallelIndexScanDescData *parallel_scan; + int64 tuples_needed; + /* * Flag to request early abort during get_actual_variable_range scans. 
* Such scans must end on the rightmost (or leftmost) index page, no diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 7c1b427fb..12dd533c3 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1697,6 +1697,7 @@ typedef struct * ScanDesc index scan descriptor * Instrument local index scan instrumentation * SharedInfo parallel worker instrumentation (no leader entry) + * TuplesNeeded tuple bound, see ExecSetTupleBound * * ReorderQueue tuples that need reordering due to re-check * ReachedEnd have we fetched all tuples from index already? @@ -1725,6 +1726,7 @@ typedef struct IndexScanState struct IndexScanDescData *iss_ScanDesc; IndexScanInstrumentation *iss_Instrument; SharedIndexScanInstrumentation *iss_SharedInfo; + int64 iss_TuplesNeeded; /* These are needed for re-checking ORDER BY expr ordering */ pairingheap *iss_ReorderQueue; @@ -1753,6 +1755,7 @@ typedef struct IndexScanState * ScanDesc index scan descriptor * Instrument local index scan instrumentation * SharedInfo parallel worker instrumentation (no leader entry) + * TuplesNeeded tuple bound, see ExecSetTupleBound * TableSlot slot for holding tuples fetched from the table * PscanLen size of parallel index-only scan descriptor * NameCStringAttNums attnums of name typed columns to pad to NAMEDATALEN @@ -1775,6 +1778,7 @@ typedef struct IndexOnlyScanState struct IndexScanDescData *ioss_ScanDesc; IndexScanInstrumentation *ioss_Instrument; SharedIndexScanInstrumentation *ioss_SharedInfo; + int64 ioss_TuplesNeeded; TupleTableSlot *ioss_TableSlot; Size ioss_PscanLen; AttrNumber *ioss_NameCStringAttNums; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index a491600fa..138d6be08 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -534,9 +534,14 @@ heapam_batch_getnext(IndexScanDesc scan, ScanDirection direction, * haven't done any heap fetches yet. 
We don't want to waste any * cycles on allocating a read stream until we have a demonstrated * need for perform heap fetches. + * + * Also avoid prefetching when the core executor passes the scan a + * tuples_needed hint that indicates that the scan is likely to end + * before long. */ if (!hscan->xs_read_stream && priorBatch && scan->MVCCScan && hscan->xs_blk != InvalidBlockNumber && /* for index-only scans */ + (scan->tuples_needed == -1 || scan->tuples_needed > 20) && io_method != IOMETHOD_SYNC && enable_indexscan_prefetch) { Assert(!batchringbuf->prefetchPos.valid); diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index e8644c608..b47f6f019 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -125,6 +125,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_itupdesc = NULL; scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + scan->tuples_needed = -1; /* no limit */ scan->xs_read_extremal_only = false; return scan; diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 7e40b8525..8ae8fbebe 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -72,6 +72,7 @@ */ #include "postgres.h" +#include "access/relscan.h" #include "executor/executor.h" #include "executor/nodeAgg.h" #include "executor/nodeAppend.h" @@ -977,6 +978,49 @@ ExecSetTupleBound(int64 tuples_needed, PlanState *child_node) ExecSetTupleBound(tuples_needed, outerPlanState(child_node)); } + else if (IsA(child_node, IndexScanState)) + { + /* + * If it is an IndexScan, save the tuples_needed in the state so it + * can be propagated to the IndexScanDesc when the scan is started. + * + * Note: As with Sort, the index scan node is responsible for reacting + * properly to changes to this parameter. 
+ */ + IndexScanState *isstate = (IndexScanState *) child_node; + + isstate->iss_TuplesNeeded = tuples_needed; + + /* If scan already started, update the IndexScanDesc too */ + if (isstate->iss_ScanDesc) + isstate->iss_ScanDesc->tuples_needed = tuples_needed; + } + else if (IsA(child_node, IndexOnlyScanState)) + { + /* Same comments as for IndexScan */ + IndexOnlyScanState *iosstate = (IndexOnlyScanState *) child_node; + + iosstate->ioss_TuplesNeeded = tuples_needed; + + /* If scan already started, update the IndexScanDesc too */ + if (iosstate->ioss_ScanDesc) + iosstate->ioss_ScanDesc->tuples_needed = tuples_needed; + } + else if (IsA(child_node, NestLoopState)) + { + /* + * For NestLoop joins where each outer tuple produces at most one + * output tuple, we can propagate the bound to the outer child + */ + NestLoopState *nlstate = (NestLoopState *) child_node; + JoinType jointype = nlstate->js.jointype; + + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || + nlstate->js.single_match) + { + ExecSetTupleBound(tuples_needed, outerPlanState(child_node)); + } + } /* * In principle we could descend through any plan node type that is diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 84bff60ce..f3fc63abe 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -96,6 +96,9 @@ IndexOnlyNext(IndexOnlyScanState *node) node->ioss_ScanDesc = scandesc; Assert(node->ioss_ScanDesc->xs_want_itup); + /* Pass down any tuple bound */ + scandesc->tuples_needed = node->ioss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. 
@@ -528,6 +531,7 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) indexstate->ioss_RuntimeKeysReady = false; indexstate->ioss_RuntimeKeys = NULL; indexstate->ioss_NumRuntimeKeys = 0; + indexstate->ioss_TuplesNeeded = -1; /* * build the index scan keys from the index qualification @@ -704,6 +708,9 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, piscan); Assert(node->ioss_ScanDesc->xs_want_itup); + /* Pass down any tuple bound */ + node->ioss_ScanDesc->tuples_needed = node->ioss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. @@ -769,6 +776,9 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, piscan); Assert(node->ioss_ScanDesc->xs_want_itup); + /* Pass down any tuple bound */ + node->ioss_ScanDesc->tuples_needed = node->ioss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 67822947a..3e174cb65 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -115,6 +115,9 @@ IndexNext(IndexScanState *node) node->iss_ScanDesc = scandesc; + /* Pass down any tuple bound */ + scandesc->tuples_needed = node->iss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. @@ -211,6 +214,9 @@ IndexNextWithReorder(IndexScanState *node) node->iss_ScanDesc = scandesc; + /* Pass down any tuple bound */ + scandesc->tuples_needed = node->iss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. 
@@ -986,6 +992,7 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->iss_RuntimeKeysReady = false; indexstate->iss_RuntimeKeys = NULL; indexstate->iss_NumRuntimeKeys = 0; + indexstate->iss_TuplesNeeded = -1; /* * build the index scan keys from the index qualification @@ -1730,6 +1737,9 @@ ExecIndexScanInitializeDSM(IndexScanState *node, node->iss_NumOrderByKeys, piscan); + /* Pass down any tuple bound */ + node->iss_ScanDesc->tuples_needed = node->iss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. @@ -1794,6 +1804,9 @@ ExecIndexScanInitializeWorker(IndexScanState *node, node->iss_NumOrderByKeys, piscan); + /* Pass down any tuple bound */ + node->iss_ScanDesc->tuples_needed = node->iss_TuplesNeeded; + /* * If no run-time keys to calculate or they are ready, go ahead and pass * the scankeys to the index AM. -- 2.51.0