From ecba8752d060f19f43ed3297af5b8314e26a7767 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sat, 7 Mar 2026 11:46:19 -0800 Subject: [PATCH v0 1/2] Index scans: Show table buffer accesses separately in EXPLAIN ANALYZE This sets up a separate instrumentation stack that is used whilst an Index Scan or Index Only Scan does scanning on the table, for example due to additional data being needed. EXPLAIN ANALYZE will now show "Table Buffers" that represent such activity. The activity is also included in regular "Buffers" together with index activity and that of any child nodes. Author: Lukas Fittl Suggested-by: Andres Freund Reviewed-by: Zsolt Parragi (in an earlier version) Reviewed-by: Tomas Vondra (in an earlier version) Discussion: https://www.postgresql.org/message-id/flat/CAP53PkxrmpECzVFpeeEEHDGe6u625s%2BYkmVv5-gw3L_NDSfbiA%40mail.gmail.com#cb583a08e8e096aa1f093bb178906173 --- doc/src/sgml/perform.sgml | 13 ++++-- doc/src/sgml/ref/explain.sgml | 1 + src/backend/access/heap/heapam_indexscan.c | 16 ++++++-- src/backend/commands/explain.c | 48 ++++++++++++++++++---- src/backend/executor/execProcnode.c | 46 +++++++++++++++++++++ src/backend/executor/nodeBitmapIndexscan.c | 2 +- src/backend/executor/nodeIndexonlyscan.c | 24 ++++++++++- src/backend/executor/nodeIndexscan.c | 24 ++++++++++- src/include/executor/instrument_node.h | 5 +++ 9 files changed, 162 insertions(+), 17 deletions(-) diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 604e8578a8d..d28f4f22535 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -734,6 +734,7 @@ WHERE t1.unique1 < 10 AND t1.unique2 = t2.unique2; -> Index Scan using tenk2_unique2 on tenk2 t2 (cost=0.29..7.90 rows=1 width=244) (actual time=0.003..0.003 rows=1.00 loops=10) Index Cond: (unique2 = t1.unique2) Index Searches: 10 + Table Buffers: shared hit=10 Buffers: shared hit=24 read=6 Planning: Buffers: shared hit=15 dirtied=9 @@ -1005,7 +1006,8 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; Index Cond: (f1 @> '((0.5,2))'::polygon) Rows Removed by Index Recheck: 1 Index Searches: 1 - Buffers: shared hit=1 + Table Buffers: shared hit=1 + Buffers: shared hit=2 Planning Time: 0.039 ms Execution Time: 0.098 ms @@ -1014,7 +1016,9 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; then rejected by a recheck of the index condition. This happens because a GiST index is lossy for polygon containment tests: it actually returns the rows with polygons that overlap the target, and then we have - to do the exact containment test on those rows. + to do the exact containment test on those rows. The Table Buffers + counts indicate how many operations were performed on the table instead of + the index. This number is included in the Buffers counts. @@ -1203,13 +1207,14 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE unique1 < 100 AND unique2 > 9000 QUERY PLAN -------------------------------------------------------------------&zwsp;------------------------------------------------------------ Limit (cost=0.29..14.33 rows=2 width=244) (actual time=0.051..0.071 rows=2.00 loops=1) - Buffers: shared hit=16 + Buffers: shared hit=14 -> Index Scan using tenk1_unique2 on tenk1 (cost=0.29..70.50 rows=10 width=244) (actual time=0.051..0.070 rows=2.00 loops=1) Index Cond: (unique2 > 9000) Filter: (unique1 < 100) Rows Removed by Filter: 287 Index Searches: 1 - Buffers: shared hit=16 + Table Buffers: shared hit=11 + Buffers: shared hit=14 Planning Time: 0.077 ms Execution Time: 0.086 ms diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 5b8b521802e..71070736acb 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -509,6 +509,7 @@ EXPLAIN ANALYZE EXECUTE query(100, 200); -> Index Scan using test_pkey on test (cost=0.29..10.27 rows=99 width=8) (actual time=0.009..0.025 rows=99.00 loops=1) Index Cond: ((id > 100) AND (id < 200)) Index Searches: 1 + Table Buffers: shared hit=1 Buffers: shared hit=4 Planning Time: 0.244 ms Execution Time: 0.073 ms diff --git a/src/backend/access/heap/heapam_indexscan.c b/src/backend/access/heap/heapam_indexscan.c index fd70ad5bc2c..702419cbabf 100644 --- a/src/backend/access/heap/heapam_indexscan.c +++ b/src/backend/access/heap/heapam_indexscan.c @@ -572,11 +572,14 @@ heapam_index_fetch_tuple(Relation rel, static pg_attribute_always_inline bool heapam_index_fetch_heap_item(IndexScanDesc scan, IndexFetchHeapData *hscan, TupleTableSlot *slot, bool *heap_continue, - bool amgetbatch) + bool amgetbatch, Instrumentation *table_instr) { bool all_dead = false; bool found; + if (table_instr) + InstrPushStack(table_instr); + found = heapam_index_fetch_tuple(scan->heapRelation, hscan, &scan->xs_heaptid, scan->xs_snapshot, slot, @@ -607,6 +610,9 @@ heapam_index_fetch_heap_item(IndexScanDesc scan, IndexFetchHeapData *hscan, } } + if (table_instr) + InstrPopStack(table_instr); + return found; } @@ -1390,6 +1396,10 @@ heapam_index_getnext_slot(IndexScanDesc scan, ScanDirection direction, BlockNumber last_visited_block = InvalidBlockNumber; uint8 n_visited_pages = 0; ItemPointer tid = NULL; + Instrumentation *table_instr = NULL; + + if (scan->instrument && scan->instrument->table_instr.need_stack) + table_instr = &scan->instrument->table_instr; for (;;) { @@ -1434,7 +1444,7 @@ heapam_index_getnext_slot(IndexScanDesc scan, ScanDirection direction, scan->instrument->ntabletuplefetches++; if (!heapam_index_fetch_heap_item(scan, hscan, slot, - heap_continue, amgetbatch)) + heap_continue, amgetbatch, table_instr)) { /* * No visible tuple. If caller set a visited-pages limit @@ -1494,7 +1504,7 @@ heapam_index_getnext_slot(IndexScanDesc scan, ScanDirection direction, * next TID from the index. */ if (heapam_index_fetch_heap_item(scan, hscan, slot, heap_continue, - amgetbatch)) + amgetbatch, table_instr)) return true; } } diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index ae3258b3f5c..647be5d1286 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -144,7 +144,7 @@ static void show_instrumentation_count(const char *qlabel, int which, static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); -static void show_buffer_usage(ExplainState *es, const BufferUsage *usage); +static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void show_memory_counters(ExplainState *es, const MemoryContextCounters *mem_counters); @@ -611,7 +611,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, } if (bufusage) - show_buffer_usage(es, bufusage); + show_buffer_usage(es, bufusage, NULL); if (mem_counters) show_memory_counters(es, mem_counters); @@ -1028,7 +1028,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) if (es->buffers && peek_buffer_usage(es, &metrics->instr.bufusage)) { es->indent++; - show_buffer_usage(es, &metrics->instr.bufusage); + show_buffer_usage(es, &metrics->instr.bufusage, NULL); es->indent--; } } @@ -1042,7 +1042,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) BYTES_TO_KILOBYTES(metrics->bytesSent), es); ExplainPropertyText("Format", format, es); if (es->buffers) - show_buffer_usage(es, &metrics->instr.bufusage); + show_buffer_usage(es, &metrics->instr.bufusage, NULL); } ExplainCloseGroup("Serialization", "Serialization", true, es); @@ -2287,7 +2287,7 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->instr.bufusage); + show_buffer_usage(es, &planstate->instrument->instr.bufusage, NULL); if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->instr.walusage); @@ -2306,7 +2306,7 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->instr.bufusage); + show_buffer_usage(es, &instrument->instr.bufusage, NULL); if (es->wal) show_wal_usage(es, &instrument->instr.walusage); ExplainCloseWorker(n, es); @@ -3862,6 +3862,7 @@ show_indexscan_info(PlanState *planstate, ExplainState *es) { Plan *plan = planstate->plan; SharedIndexScanInstrumentation *SharedInfo = NULL; + Instrumentation *table_instr = NULL; uint64 nsearches = 0, ntabletuplefetches = 0; @@ -3877,6 +3878,7 @@ show_indexscan_info(PlanState *planstate, ExplainState *es) nsearches = indexstate->iss_Instrument->nsearches; SharedInfo = indexstate->iss_SharedInfo; + table_instr = &indexstate->iss_Instrument->table_instr; break; } case T_IndexOnlyScan: @@ -3886,6 +3888,7 @@ show_indexscan_info(PlanState *planstate, ExplainState *es) nsearches = indexstate->ioss_Instrument->nsearches; ntabletuplefetches = indexstate->ioss_Instrument->ntabletuplefetches; SharedInfo = indexstate->ioss_SharedInfo; + table_instr = &indexstate->ioss_Instrument->table_instr; break; } case T_BitmapIndexScan: @@ -3894,6 +3897,7 @@ show_indexscan_info(PlanState *planstate, ExplainState *es) nsearches = indexstate->biss_Instrument->nsearches; SharedInfo = indexstate->biss_SharedInfo; + table_instr = &indexstate->biss_Instrument->table_instr; break; } default: @@ -3916,6 +3920,9 @@ show_indexscan_info(PlanState *planstate, ExplainState *es) ExplainPropertyUInteger("Heap Fetches", NULL, ntabletuplefetches, es); ExplainPropertyUInteger("Index Searches", NULL, nsearches, es); + + if (es->buffers && table_instr) + show_buffer_usage(es, &table_instr->bufusage, "Table"); } /* @@ -4112,7 +4119,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage) * Show buffer usage details. This better be sync with peek_buffer_usage. */ static void -show_buffer_usage(ExplainState *es, const BufferUsage *usage) +show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title) { if (es->format == EXPLAIN_FORMAT_TEXT) { @@ -4137,6 +4144,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared || has_local || has_temp) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "Buffers:"); if (has_shared) @@ -4192,6 +4201,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared_timing || has_local_timing || has_temp_timing) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "I/O Timings:"); if (has_shared_timing) @@ -4233,6 +4244,14 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) } else { + char *buffers_title = NULL; + + if (title) + { + buffers_title = psprintf("%s Buffers", title); + ExplainOpenGroup(buffers_title, buffers_title, true, es); + } + ExplainPropertyInteger("Shared Hit Blocks", NULL, usage->shared_blks_hit, es); ExplainPropertyInteger("Shared Read Blocks", NULL, @@ -4253,8 +4272,20 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) usage->temp_blks_read, es); ExplainPropertyInteger("Temp Written Blocks", NULL, usage->temp_blks_written, es); + + if (buffers_title) + ExplainCloseGroup(buffers_title, buffers_title, true, es); + if (track_io_timing) { + char *timings_title = NULL; + + if (title) + { + timings_title = psprintf("%s I/O Timings", title); + ExplainOpenGroup(timings_title, timings_title, true, es); + } + ExplainPropertyFloat("Shared I/O Read Time", "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time), 3, es); @@ -4273,6 +4304,9 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) ExplainPropertyFloat("Temp I/O Write Time", "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time), 3, es); + + if (timings_title) + ExplainCloseGroup(timings_title, timings_title, true, es); } } } diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index ac400670fea..28f1f666a3b 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -847,6 +847,20 @@ ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker, &node->instrument->instr); + /* IndexScan/IndexOnlyScan have a separate entry to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + InstrFinalizeChild(&iss->iss_Instrument->table_instr, &node->instrument->instr); + } + else if (IsA(node, IndexOnlyScanState)) + { + IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, node); + + InstrFinalizeChild(&ioss->ioss_Instrument->table_instr, &node->instrument->instr); + } + InstrFinalizeChild(&node->instrument->instr, parent); return false; @@ -892,6 +906,38 @@ ExecFinalizeWorkerInstrumentation_walker(PlanState *node, void *context) num_workers = node->worker_instrument->num_workers; + /* + * Fold per-worker IndexScan/IndexOnlyScan table buffer stats into the + * per-worker node stats, matching what ExecFinalizeNodeInstrumentation + * does for the leader. + */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_SharedInfo) + { + int nworkers = Min(num_workers, iss->iss_SharedInfo->num_workers); + + for (int n = 0; n < nworkers; n++) + InstrAccumStack(&node->worker_instrument->instrument[n].instr, + &iss->iss_SharedInfo->winstrument[n].table_instr); + } + } + else if (IsA(node, IndexOnlyScanState)) + { + IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, node); + + if (ioss->ioss_SharedInfo) + { + int nworkers = Min(num_workers, ioss->ioss_SharedInfo->num_workers); + + for (int n = 0; n < nworkers; n++) + InstrAccumStack(&node->worker_instrument->instrument[n].instr, + &ioss->ioss_SharedInfo->winstrument[n].table_instr); + } + } + /* Accumulate this node's per-worker stats to parent's per-worker stats */ if (parent && parent->worker_instrument) { diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c index a9a3d2fb149..ff802b86446 100644 --- a/src/backend/executor/nodeBitmapIndexscan.c +++ b/src/backend/executor/nodeBitmapIndexscan.c @@ -277,7 +277,7 @@ ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags) /* Set up instrumentation of bitmap index scans if requested */ if (estate->es_instrument) - indexstate->biss_Instrument = palloc0_object(IndexScanInstrumentation); + indexstate->biss_Instrument = MemoryContextAllocZero(estate->es_query_instr->instr_cxt, sizeof(IndexScanInstrumentation)); /* Open the index relation. */ lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index b6b9dbd1075..9ff77a25a95 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -347,6 +347,7 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node) */ winstrument->nsearches += node->ioss_Instrument->nsearches; winstrument->ntabletuplefetches += node->ioss_Instrument->ntabletuplefetches; + InstrAccumStack(&winstrument->table_instr, &node->ioss_Instrument->table_instr); } /* @@ -521,7 +522,21 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) /* Set up instrumentation of index-only scans if requested */ if (estate->es_instrument) - indexstate->ioss_Instrument = palloc0_object(IndexScanInstrumentation); + { + indexstate->ioss_Instrument = MemoryContextAllocZero(estate->es_query_instr->instr_cxt, sizeof(IndexScanInstrumentation)); + + /* + * Track table and index access separately. We intentionally don't + * collect timing (even if enabled), since we don't need it, and + * IndexOnlyNext calls InstrPushStack / InstrPopStack (instead of the + * full InstrNode*) to reduce overhead. + */ + if ((estate->es_instrument & INSTRUMENT_BUFFERS) != 0) + { + InstrInitOptions(&indexstate->ioss_Instrument->table_instr, INSTRUMENT_BUFFERS); + InstrQueryRememberChild(estate->es_query_instr, &indexstate->ioss_Instrument->table_instr); + } + } /* Open the index relation. */ lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; @@ -811,4 +826,11 @@ ExecIndexOnlyScanRetrieveInstrumentation(IndexOnlyScanState *node) SharedInfo->num_workers * sizeof(IndexScanInstrumentation); node->ioss_SharedInfo = palloc(size); memcpy(node->ioss_SharedInfo, SharedInfo, size); + + /* Aggregate workers' table buffer/WAL usage into leader's entry */ + for (int i = 0; i < node->ioss_SharedInfo->num_workers; i++) + { + InstrAccumStack(&node->ioss_Instrument->table_instr, + &node->ioss_SharedInfo->winstrument[i].table_instr); + } } diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 2ac854da468..d3ae4d016c4 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -821,6 +821,7 @@ ExecEndIndexScan(IndexScanState *node) */ winstrument->nsearches += node->iss_Instrument->nsearches; Assert(node->iss_Instrument->ntabletuplefetches == 0); + InstrAccumStack(&winstrument->table_instr, &node->iss_Instrument->table_instr); } /* @@ -983,7 +984,21 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) /* Set up instrumentation of index scans if requested */ if (estate->es_instrument) - indexstate->iss_Instrument = palloc0_object(IndexScanInstrumentation); + { + indexstate->iss_Instrument = MemoryContextAllocZero(estate->es_query_instr->instr_cxt, sizeof(IndexScanInstrumentation)); + + /* + * Track table and index access separately. We intentionally don't + * collect timing (even if enabled), since we don't need it, and + * IndexNext / IndexNextWithReorder call InstrPushStack / + * InstrPopStack (instead of the full InstrNode*) to reduce overhead. + */ + if ((estate->es_instrument & INSTRUMENT_BUFFERS) != 0) + { + InstrInitOptions(&indexstate->iss_Instrument->table_instr, INSTRUMENT_BUFFERS); + InstrQueryRememberChild(estate->es_query_instr, &indexstate->iss_Instrument->table_instr); + } + } /* Open the index relation. */ lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; @@ -1839,4 +1854,11 @@ ExecIndexScanRetrieveInstrumentation(IndexScanState *node) SharedInfo->num_workers * sizeof(IndexScanInstrumentation); node->iss_SharedInfo = palloc(size); memcpy(node->iss_SharedInfo, SharedInfo, size); + + /* Aggregate workers' table buffer/WAL usage into leader's entry */ + for (int i = 0; i < node->iss_SharedInfo->num_workers; i++) + { + InstrAccumStack(&node->iss_Instrument->table_instr, + &node->iss_SharedInfo->winstrument[i].table_instr); + } } diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 78f810aabaf..bf0c4416dae 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -18,6 +18,8 @@ #ifndef INSTRUMENT_NODE_H #define INSTRUMENT_NODE_H +#include "executor/instrument.h" + /* --------------------- * Instrumentation information for aggregate function execution @@ -51,6 +53,9 @@ typedef struct IndexScanInstrumentation /* Table tuples fetched count (incremented during index-only scans) */ uint64 ntabletuplefetches; + + /* Instrumentation utilized for tracking buffer usage during table access */ + Instrumentation table_instr; } IndexScanInstrumentation; /* -- 2.47.1