From 61d4014bc4366314b41615e60e3b8c5ab855b44f Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sat, 7 Mar 2026 11:46:19 -0800 Subject: [PATCH v9 8/9] Index scans: Show table buffer accesses separately in EXPLAIN ANALYZE This sets up a separate instrumentation stack that is used whilst an Index Scan or Index Only Scan does scanning on the table, for example due to additional data being needed. EXPLAIN ANALYZE will now show "Table Buffers" that represent such activity. The activity is also included in regular "Buffers" together with index activity and that of any child nodes. Author: Lukas Fittl Suggested-by: Andres Freund Reviewed-by: Zsolt Parragi Reviewed-by: Tomas Vondra Discussion: https://www.postgresql.org/message-id/flat/CAP53PkxrmpECzVFpeeEEHDGe6u625s%2BYkmVv5-gw3L_NDSfbiA%40mail.gmail.com#cb583a08e8e096aa1f093bb178906173 --- doc/src/sgml/perform.sgml | 13 ++- doc/src/sgml/ref/explain.sgml | 1 + src/backend/commands/explain.c | 47 ++++++++-- src/backend/executor/execProcnode.c | 57 ++++++++++++ src/backend/executor/nodeIndexonlyscan.c | 25 +++++- src/backend/executor/nodeIndexscan.c | 107 ++++++++++++++++++----- src/include/executor/instrument_node.h | 5 ++ src/include/nodes/execnodes.h | 15 ++++ 8 files changed, 235 insertions(+), 35 deletions(-) diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 5f6f1db0467..9219625faf6 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -734,6 +734,7 @@ WHERE t1.unique1 < 10 AND t1.unique2 = t2.unique2; -> Index Scan using tenk2_unique2 on tenk2 t2 (cost=0.29..7.90 rows=1 width=244) (actual time=0.003..0.003 rows=1.00 loops=10) Index Cond: (unique2 = t1.unique2) Index Searches: 10 + Table Buffers: shared hit=10 Buffers: shared hit=24 read=6 Planning: Buffers: shared hit=15 dirtied=9 @@ -949,7 +950,8 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; Index Cond: (f1 @> '((0.5,2))'::polygon) Rows Removed by Index Recheck: 1 Index Searches: 1 - Buffers: shared hit=1 + Table Buffers: shared hit=1 + Buffers: shared hit=2 Planning Time: 0.039 ms Execution Time: 0.098 ms @@ -958,7 +960,9 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; then rejected by a recheck of the index condition. This happens because a GiST index is lossy for polygon containment tests: it actually returns the rows with polygons that overlap the target, and then we have - to do the exact containment test on those rows. + to do the exact containment test on those rows. The Table Buffers + counts indicate how many operations were performed on the table instead of + the index. This number is included in the Buffers counts. @@ -1147,13 +1151,14 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE unique1 < 100 AND unique2 > 9000 QUERY PLAN -------------------------------------------------------------------&zwsp;------------------------------------------------------------ Limit (cost=0.29..14.33 rows=2 width=244) (actual time=0.051..0.071 rows=2.00 loops=1) - Buffers: shared hit=16 + Buffers: shared hit=14 -> Index Scan using tenk1_unique2 on tenk1 (cost=0.29..70.50 rows=10 width=244) (actual time=0.051..0.070 rows=2.00 loops=1) Index Cond: (unique2 > 9000) Filter: (unique1 < 100) Rows Removed by Filter: 287 Index Searches: 1 - Buffers: shared hit=16 + Table Buffers: shared hit=11 + Buffers: shared hit=14 Planning Time: 0.077 ms Execution Time: 0.086 ms diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 7dee77fd366..912c96f2ff5 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -506,6 +506,7 @@ EXPLAIN ANALYZE EXECUTE query(100, 200); -> Index Scan using test_pkey on test (cost=0.29..10.27 rows=99 width=8) (actual time=0.009..0.025 rows=99.00 loops=1) Index Cond: ((id > 100) AND (id < 200)) Index Searches: 1 + Table Buffers: shared hit=1 Buffers: shared hit=4 Planning Time: 0.244 ms Execution Time: 0.073 ms diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index fa250b0196a..8dd56394cf2 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -144,7 +144,7 @@ static void show_instrumentation_count(const char *qlabel, int which, static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); -static void show_buffer_usage(ExplainState *es, const BufferUsage *usage); +static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void show_memory_counters(ExplainState *es, const MemoryContextCounters *mem_counters); @@ -605,7 +605,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, } if (bufusage) - show_buffer_usage(es, bufusage); + show_buffer_usage(es, bufusage, NULL); if (mem_counters) show_memory_counters(es, mem_counters); @@ -1022,7 +1022,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage)) { es->indent++; - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); es->indent--; } } @@ -1036,7 +1036,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) BYTES_TO_KILOBYTES(metrics->bytesSent), es); ExplainPropertyText("Format", format, es); if (es->buffers) - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); } ExplainCloseGroup("Serialization", "Serialization", true, es); @@ -1964,6 +1964,9 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_indexsearches_info(planstate, es); + + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &((IndexScanState *) planstate)->iss_InstrumentTable->instr.bufusage, "Table"); break; case T_IndexOnlyScan: show_scan_qual(((IndexOnlyScan *) plan)->indexqual, @@ -1981,6 +1984,9 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainPropertyFloat("Heap Fetches", NULL, planstate->instrument->ntuples2, 0, es); show_indexsearches_info(planstate, es); + + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &((IndexOnlyScanState *) planstate)->ioss_InstrumentTable->instr.bufusage, "Table"); break; case T_BitmapIndexScan: show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig, @@ -2282,7 +2288,7 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->instr.bufusage); + show_buffer_usage(es, &planstate->instrument->instr.bufusage, NULL); if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->instr.walusage); @@ -2301,7 +2307,7 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->instr.bufusage); + show_buffer_usage(es, &instrument->instr.bufusage, NULL); if (es->wal) show_wal_usage(es, &instrument->instr.walusage); ExplainCloseWorker(n, es); @@ -4101,7 +4107,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage) * Show buffer usage details. This better be sync with peek_buffer_usage. */ static void -show_buffer_usage(ExplainState *es, const BufferUsage *usage) +show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title) { if (es->format == EXPLAIN_FORMAT_TEXT) { @@ -4126,6 +4132,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared || has_local || has_temp) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "Buffers:"); if (has_shared) @@ -4181,6 +4189,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared_timing || has_local_timing || has_temp_timing) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "I/O Timings:"); if (has_shared_timing) @@ -4222,6 +4232,14 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) } else { + char *buffers_title = NULL; + + if (title) + { + buffers_title = psprintf("%s Buffers", title); + ExplainOpenGroup(buffers_title, buffers_title, true, es); + } + ExplainPropertyInteger("Shared Hit Blocks", NULL, usage->shared_blks_hit, es); ExplainPropertyInteger("Shared Read Blocks", NULL, @@ -4242,8 +4260,20 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) usage->temp_blks_read, es); ExplainPropertyInteger("Temp Written Blocks", NULL, usage->temp_blks_written, es); + + if (buffers_title) + ExplainCloseGroup(buffers_title, buffers_title, true, es); + if (track_io_timing) { + char *timings_title = NULL; + + if (title) + { + timings_title = psprintf("%s I/O Timings", title); + ExplainOpenGroup(timings_title, timings_title, true, es); + } + ExplainPropertyFloat("Shared I/O Read Time", "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time), 3, es); @@ -4262,6 +4292,9 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) ExplainPropertyFloat("Temp I/O Write Time", "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time), 3, es); + + if (timings_title) + ExplainCloseGroup(timings_title, timings_title, true, es); } } } diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 828a1fe3b1d..eeebe2ce64f 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -415,9 +415,32 @@ ExecInitNode(Plan *node, EState *estate, int eflags) /* Set up instrumentation for this node if requested */ if (estate->es_instrument) + { result->instrument = InstrAllocNode(estate->es_instrument, result->async_capable); + /* + * IndexScan / IndexOnlyScan track table and index access separately. + * + * We intentionally don't collect timing for them (even if enabled), + * since we don't need it, and the executor nodes call InstrPushStack + * / InstrPopStack (instead of the full InstrNode*) to reduce + * overhead. + */ + if (IsA(result, IndexScanState) && (estate->es_instrument & INSTRUMENT_BUFFERS) != 0) + { + IndexScanState *iss = castNode(IndexScanState, result); + + iss->iss_InstrumentTable = InstrAllocNode(INSTRUMENT_BUFFERS, false); + } + else if (IsA(result, IndexOnlyScanState) && (estate->es_instrument & INSTRUMENT_BUFFERS) != 0) + { + IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, result); + + ioss->ioss_InstrumentTable = InstrAllocNode(INSTRUMENT_BUFFERS, false); + } + } + return result; } @@ -837,8 +860,26 @@ ExecRememberNodeInstrumentation_walker(PlanState *node, void *context) return false; if (node->instrument) + { InstrQueryRememberNode(parent, node->instrument); + /* IndexScan/IndexOnlyScan have a separate entry to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_InstrumentTable) + InstrQueryRememberNode(parent, iss->iss_InstrumentTable); + } + else if (IsA(node, IndexOnlyScanState)) + { + IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, node); + + if (ioss->ioss_InstrumentTable) + InstrQueryRememberNode(parent, ioss->ioss_InstrumentTable); + } + } + return planstate_tree_walker(node, ExecRememberNodeInstrumentation_walker, context); } @@ -880,6 +921,22 @@ ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) if (!node->instrument) return false; + /* IndexScan/IndexOnlyScan have a separate entry to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_InstrumentTable) + iss->iss_InstrumentTable = InstrFinalizeNode(iss->iss_InstrumentTable, &node->instrument->instr); + } + else if (IsA(node, IndexOnlyScanState)) + { + IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, node); + + if (ioss->ioss_InstrumentTable) + ioss->ioss_InstrumentTable = InstrFinalizeNode(ioss->ioss_InstrumentTable, &node->instrument->instr); + } + node->instrument = InstrFinalizeNode(node->instrument, parent); return false; diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index c8db357e69f..3f94671b55c 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -163,11 +163,22 @@ IndexOnlyNext(IndexOnlyScanState *node) ItemPointerGetBlockNumber(tid), &node->ioss_VMBuffer)) { + bool found; + /* * Rats, we have to visit the heap to check visibility. */ InstrCountTuples2(node, 1); - if (!index_fetch_heap(scandesc, node->ioss_TableSlot)) + + if (node->ioss_InstrumentTable) + InstrPushStack(&node->ioss_InstrumentTable->instr); + + found = index_fetch_heap(scandesc, node->ioss_TableSlot); + + if (node->ioss_InstrumentTable) + InstrPopStack(&node->ioss_InstrumentTable->instr); + + if (!found) continue; /* no visible tuple, try next index entry */ ExecClearTuple(node->ioss_TableSlot); @@ -434,6 +445,10 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node) * which will have a new IndexOnlyScanState and zeroed stats. */ winstrument->nsearches += node->ioss_Instrument.nsearches; + if (node->ioss_InstrumentTable) + { + InstrAccumStack(&winstrument->worker_table_instr, &node->ioss_InstrumentTable->instr); + } } /* @@ -889,4 +904,12 @@ ExecIndexOnlyScanRetrieveInstrumentation(IndexOnlyScanState *node) SharedInfo->num_workers * sizeof(IndexScanInstrumentation); node->ioss_SharedInfo = palloc(size); memcpy(node->ioss_SharedInfo, SharedInfo, size); + + /* Aggregate workers' table buffer/WAL usage into leader's entry */ + if (node->ioss_InstrumentTable) + for (int i = 0; i < node->ioss_SharedInfo->num_workers; i++) + { + InstrAccumStack(&node->ioss_InstrumentTable->instr, + &node->ioss_SharedInfo->winstrument[i].worker_table_instr); + } } diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index bd83e4712b3..281b92c2299 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -85,7 +85,9 @@ IndexNext(IndexScanState *node) ExprContext *econtext; ScanDirection direction; IndexScanDesc scandesc; + ItemPointer tid; TupleTableSlot *slot; + bool found; /* * extract necessary information from index scan node @@ -130,8 +132,24 @@ IndexNext(IndexScanState *node) /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { + if (node->iss_InstrumentTable) + InstrPushStack(&node->iss_InstrumentTable->instr); + + for (;;) + { + found = index_fetch_heap(scandesc, slot); + if (found || !scandesc->xs_heap_continue) + break; + } + + if (node->iss_InstrumentTable) + InstrPopStack(&node->iss_InstrumentTable->instr); + + if (unlikely(!found)) + continue; + CHECK_FOR_INTERRUPTS(); /* @@ -259,36 +277,67 @@ IndexNextWithReorder(IndexScanState *node) } /* - * Fetch next tuple from the index. + * Fetch next valid tuple from the index. */ -next_indextuple: - if (!index_getnext_slot(scandesc, ForwardScanDirection, slot)) + for (;;) { + ItemPointer tid; + bool found; + + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid(scandesc, ForwardScanDirection); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + { + /* + * No more tuples from the index. But we still need to drain + * any remaining tuples from the queue before we're done. + */ + node->iss_ReachedEnd = true; + break; + } + + Assert(ItemPointerEquals(tid, &scandesc->xs_heaptid)); + + if (node->iss_InstrumentTable) + InstrPushStack(&node->iss_InstrumentTable->instr); + + for (;;) + { + found = index_fetch_heap(scandesc, slot); + if (found || !scandesc->xs_heap_continue) + break; + } + + if (node->iss_InstrumentTable) + InstrPopStack(&node->iss_InstrumentTable->instr); + /* - * No more tuples from the index. But we still need to drain any - * remaining tuples from the queue before we're done. + * If the index was lossy, we have to recheck the index quals and + * ORDER BY expressions using the fetched tuple. */ - node->iss_ReachedEnd = true; - continue; - } - - /* - * If the index was lossy, we have to recheck the index quals and - * ORDER BY expressions using the fetched tuple. - */ - if (scandesc->xs_recheck) - { - econtext->ecxt_scantuple = slot; - if (!ExecQualAndReset(node->indexqualorig, econtext)) + if (found && scandesc->xs_recheck) { - /* Fails recheck, so drop it and loop back for another */ - InstrCountFiltered2(node, 1); - /* allow this loop to be cancellable */ - CHECK_FOR_INTERRUPTS(); - goto next_indextuple; + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->indexqualorig, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + continue; + } } + + if (found) + break; } + /* No more index entries, re-run to clear the reorder queue */ + if (node->iss_ReachedEnd) + continue; + if (scandesc->xs_recheckorderby) { econtext->ecxt_scantuple = slot; @@ -814,6 +863,10 @@ ExecEndIndexScan(IndexScanState *node) * which will have a new IndexOnlyScanState and zeroed stats. */ winstrument->nsearches += node->iss_Instrument.nsearches; + if (node->iss_InstrumentTable) + { + InstrAccumStack(&winstrument->worker_table_instr, &node->iss_InstrumentTable->instr); + } } /* @@ -1822,4 +1875,12 @@ ExecIndexScanRetrieveInstrumentation(IndexScanState *node) SharedInfo->num_workers * sizeof(IndexScanInstrumentation); node->iss_SharedInfo = palloc(size); memcpy(node->iss_SharedInfo, SharedInfo, size); + + /* Aggregate workers' table buffer/WAL usage into leader's entry */ + if (node->iss_InstrumentTable) + for (int i = 0; i < node->iss_SharedInfo->num_workers; i++) + { + InstrAccumStack(&node->iss_InstrumentTable->instr, + &node->iss_SharedInfo->winstrument[i].worker_table_instr); + } } diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 8847d7f94fa..d6dd46692ef 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -18,6 +18,8 @@ #ifndef INSTRUMENT_NODE_H #define INSTRUMENT_NODE_H +#include "executor/instrument.h" + /* --------------------- * Instrumentation information for aggregate function execution @@ -48,6 +50,9 @@ typedef struct IndexScanInstrumentation { /* Index search count (incremented with pgstat_count_index_scan call) */ uint64 nsearches; + + /* Used for passing i(o)ss_InstrumentTable data from parallel workers */ + Instrumentation worker_table_instr; } IndexScanInstrumentation; /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index fbf13683581..cd6736acc80 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1737,6 +1737,13 @@ typedef struct IndexScanState IndexScanInstrumentation iss_Instrument; SharedIndexScanInstrumentation *iss_SharedInfo; + /* + * Instrumentation utilized for tracking table access. This is separate + * from iss_Instrument since it needs to be allocated in the right context + * and IndexScanInstrumentation shouldn't contain pointers. + */ + NodeInstrumentation *iss_InstrumentTable; + /* These are needed for re-checking ORDER BY expr ordering */ pairingheap *iss_ReorderQueue; bool iss_ReachedEnd; @@ -1787,6 +1794,14 @@ typedef struct IndexOnlyScanState struct IndexScanDescData *ioss_ScanDesc; IndexScanInstrumentation ioss_Instrument; SharedIndexScanInstrumentation *ioss_SharedInfo; + + /* + * Instrumentation utilized for tracking table access. This is separate + * from ioss_Instrument since it needs to be allocated in the right + * context and IndexScanInstrumentation shouldn't contain pointers. + */ + NodeInstrumentation *ioss_InstrumentTable; + TupleTableSlot *ioss_TableSlot; Buffer ioss_VMBuffer; Size ioss_PscanLen; -- 2.47.1