From 78347b717da8b95c9eb917f602045f4b29bfa215 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Mon, 6 Apr 2026 01:20:45 -0700 Subject: [PATCH v16 06/10] Optimize measuring WAL/buffer usage through stack-based instrumentation Previously, in order to determine the buffer/WAL usage of a given code section, we utilized continuously incrementing global counters that get updated when the actual activity (e.g. shared block read) occurred, and then calculated a diff when the code section ended. This resulted in a bottleneck for executor node instrumentation specifically, with the function BufferUsageAccumDiff showing up in profiles and in some cases adding up to 10% overhead to an EXPLAIN (ANALYZE, BUFFERS) run. Instead, introduce a stack-based mechanism, where the actual activity writes into the current stack entry. In the case of executor nodes, this means that each node gets its own stack entry that is pushed at InstrStartNode, and popped at InstrEndNode. Stack entries are zero initialized (avoiding the diff mechanism) and get added to their parent entry when they are finalized, i.e. no more modifications can occur. To correctly handle abort situations, any use of instrumentation stacks must involve either a top-level QueryInstrumentation struct, and its associated InstrQueryStart/InstrQueryStop helpers (which use resource owners to handle aborts), or the Instrumentation struct itself with dedicated PG_TRY/PG_FINALLY calls that ensure the stack is in a consistent state after an abort. In tests, the stack-based instrumentation mechanism reduces the overhead of EXPLAIN (ANALYZE, BUFFERS ON, TIMING OFF) for a large COUNT(*) query from about 50% to 22% on top of the actual runtime. This also drops the global pgBufferUsage, any callers interested in measuring buffer activity should instead utilize InstrStart/InstrStop. The related global pgWalUsage is kept for now due to its use in pgstat to track aggregate WAL activity and heap_page_prune_and_freeze for measuring FPIs. Author: Lukas Fittl Reviewed-by: Zsolt Parragi Reviewed-by: Heikki Linnakangas Discussion: https://www.postgresql.org/message-id/flat/CAP53PkxrmpECzVFpeeEEHDGe6u625s%2BYkmVv5-gw3L_NDSfbiA%40mail.gmail.com#cb583a08e8e096aa1f093bb178906173 --- .../pg_stat_statements/pg_stat_statements.c | 6 +- src/backend/access/brin/brin.c | 10 +- src/backend/access/gin/gininsert.c | 10 +- src/backend/access/heap/vacuumlazy.c | 12 +- src/backend/access/nbtree/nbtsort.c | 10 +- src/backend/commands/analyze.c | 12 +- src/backend/commands/explain.c | 10 +- src/backend/commands/explain_dr.c | 2 + src/backend/commands/prepare.c | 10 +- src/backend/commands/repack.c | 2 +- src/backend/commands/tablecmds.c | 5 +- src/backend/commands/vacuumparallel.c | 10 +- src/backend/executor/README.instrument | 237 ++++++++++ src/backend/executor/execMain.c | 94 +++- src/backend/executor/execParallel.c | 32 +- src/backend/executor/execPartition.c | 5 +- src/backend/executor/execProcnode.c | 106 ++++- src/backend/executor/execUtils.c | 13 +- src/backend/executor/instrument.c | 429 ++++++++++++++---- src/backend/replication/logical/worker.c | 2 +- src/backend/storage/buffer/bufmgr.c | 6 +- src/backend/utils/activity/pgstat_io.c | 6 +- src/include/executor/executor.h | 6 +- src/include/executor/instrument.h | 199 +++++++- src/include/nodes/execnodes.h | 2 + src/include/utils/resowner.h | 1 + src/tools/pgindent/typedefs.list | 2 + 27 files changed, 1045 insertions(+), 194 deletions(-) create mode 100644 src/backend/executor/README.instrument diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 5da71c9be16..a9cd1150ebb 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -924,12 +924,11 @@ pgss_planner(Query *parse, } PG_FINALLY(); { + InstrStopFinalize(&instr); nesting_level--; } PG_END_TRY(); - InstrStop(&instr); - pgss_store(query_string, parse->queryId, parse->stmt_location, @@ -1140,6 +1139,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, } PG_FINALLY(); { + InstrStopFinalize(&instr); nesting_level--; } PG_END_TRY(); @@ -1154,8 +1154,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, * former value, which'd otherwise be a good idea. */ - InstrStop(&instr); - /* * Track the total number of rows retrieved or affected by the utility * statements of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index bdb30752e09..3a5176c76c7 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -2434,8 +2434,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * and PARALLEL_KEY_BUFFER_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(WalUsage), pcxt->nworkers)); @@ -2887,6 +2887,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + QueryInstrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2936,7 +2937,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2951,7 +2952,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 9d83a495775..0d80f72a0b0 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -991,8 +991,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * and PARALLEL_KEY_BUFFER_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(WalUsage), pcxt->nworkers)); @@ -2118,6 +2118,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + QueryInstrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2186,7 +2187,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2201,7 +2202,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 6173e53c4ad..dd285863062 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -638,7 +638,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params, TimestampTz starttime = 0; PgStat_Counter startreadtime = 0, startwritetime = 0; - Instrumentation *instr = NULL; + QueryInstrumentation *instr = NULL; ErrorContextCallback errcallback; char **indnames = NULL; Size dead_items_max_bytes = 0; @@ -654,8 +654,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params, startreadtime = pgStatBlockReadTime; startwritetime = pgStatBlockWriteTime; } - instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); - InstrStart(instr); + instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrQueryStart(instr); } /* Used for instrumentation and stats report */ @@ -997,7 +997,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params, { TimestampTz endtime = GetCurrentTimestamp(); - InstrStop(instr); + InstrQueryStopFinalize(instr); if (verbose || params->log_vacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, @@ -1013,8 +1013,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; - BufferUsage bufferusage = instr->bufusage; - WalUsage walusage = instr->walusage; + BufferUsage bufferusage = instr->instr.bufusage; + WalUsage walusage = instr->instr.walusage; TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 756dfa3dcf4..2d7b7cef912 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1466,8 +1466,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * and PARALLEL_KEY_BUFFER_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(WalUsage), pcxt->nworkers)); @@ -1753,6 +1753,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + QueryInstrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -1828,7 +1829,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) } /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Perform sorting of spool, and possibly a spool2 */ sortmem = maintenance_work_mem / btshared->scantuplesortstates; @@ -1838,7 +1839,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); #ifdef BTREE_BUILD_STATS diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 8472fc0c280..10f8a2dc81c 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -309,7 +309,7 @@ do_analyze_rel(Relation onerel, const VacuumParams *params, Oid save_userid; int save_sec_context; int save_nestlevel; - Instrumentation *instr = NULL; + QueryInstrumentation *instr = NULL; PgStat_Counter startreadtime = 0; PgStat_Counter startwritetime = 0; @@ -361,8 +361,8 @@ do_analyze_rel(Relation onerel, const VacuumParams *params, pg_rusage_init(&ru0); - instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); - InstrStart(instr); + instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrQueryStart(instr); } /* Used for instrumentation and stats report */ @@ -743,7 +743,7 @@ do_analyze_rel(Relation onerel, const VacuumParams *params, { TimestampTz endtime = GetCurrentTimestamp(); - InstrStop(instr); + InstrQueryStopFinalize(instr); if (verbose || params->log_analyze_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, @@ -757,8 +757,8 @@ do_analyze_rel(Relation onerel, const VacuumParams *params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; - BufferUsage bufusage = instr->bufusage; - WalUsage walusage = instr->walusage; + BufferUsage bufusage = instr->instr.bufusage; + WalUsage walusage = instr->instr.walusage; total_blks_hit = bufusage.shared_blks_hit + bufusage.local_blks_hit; diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index deaaba6f900..e2b1d343cca 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -324,7 +324,7 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, QueryEnvironment *queryEnv) { PlannedStmt *plan; - Instrumentation plan_instr = {0}; + QueryInstrumentation *plan_instr = NULL; int instrument_options = INSTRUMENT_TIMER; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; @@ -333,7 +333,7 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, if (es->buffers) instrument_options |= INSTRUMENT_BUFFERS; - InstrInitOptions(&plan_instr, instrument_options); + plan_instr = InstrQueryAlloc(instrument_options); if (es->memory) { @@ -351,12 +351,12 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - InstrStart(&plan_instr); + InstrQueryStart(plan_instr); /* plan the query */ plan = pg_plan_query(query, queryString, cursorOptions, params, es); - InstrStop(&plan_instr); + InstrQueryStopFinalize(plan_instr); if (es->memory) { @@ -366,7 +366,7 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, /* run it (if needed) and produce output */ ExplainOnePlan(plan, into, es, queryString, params, queryEnv, - &plan_instr.total, (es->buffers ? &plan_instr.bufusage : NULL), + &plan_instr->instr.total, (es->buffers ? &plan_instr->instr.bufusage : NULL), es->memory ? &mem_counters : NULL); } diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c index df5ae5f4569..836395d6992 100644 --- a/src/backend/commands/explain_dr.c +++ b/src/backend/commands/explain_dr.c @@ -236,6 +236,8 @@ serializeAnalyzeShutdown(DestReceiver *self) { SerializeDestReceiver *receiver = (SerializeDestReceiver *) self; + InstrFinalizeChild(&receiver->metrics.instr, instr_stack.current); + if (receiver->finfos) pfree(receiver->finfos); receiver->finfos = NULL; diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index bf9f2eb6149..ee811357588 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -581,7 +581,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, ListCell *p; ParamListInfo paramLI = NULL; EState *estate = NULL; - Instrumentation plan_instr = {0}; + QueryInstrumentation *plan_instr = NULL; int instrument_options = INSTRUMENT_TIMER; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; @@ -590,7 +590,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, if (es->buffers) instrument_options |= INSTRUMENT_BUFFERS; - InstrInitOptions(&plan_instr, instrument_options); + plan_instr = InstrQueryAlloc(instrument_options); if (es->memory) { @@ -602,7 +602,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - InstrStart(&plan_instr); + InstrQueryStart(plan_instr); /* Look it up in the hash table */ entry = FetchPreparedStatement(execstmt->name, true); @@ -637,7 +637,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, cplan = GetCachedPlan(entry->plansource, paramLI, CurrentResourceOwner, pstate->p_queryEnv); - InstrStop(&plan_instr); + InstrQueryStopFinalize(plan_instr); if (es->memory) { @@ -654,7 +654,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, if (pstmt->commandType != CMD_UTILITY) ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv, - &plan_instr.total, (es->buffers ? &plan_instr.bufusage : NULL), + &plan_instr->instr.total, (es->buffers ? &plan_instr->instr.bufusage : NULL), es->memory ? &mem_counters : NULL); else ExplainOneUtility(pstmt->utilityStmt, into, es, pstate, paramLI); diff --git a/src/backend/commands/repack.c b/src/backend/commands/repack.c index 20dad22c4b7..b2ff5d2ff44 100644 --- a/src/backend/commands/repack.c +++ b/src/backend/commands/repack.c @@ -2890,7 +2890,7 @@ initialize_change_context(ChangeContext *chgcxt, chgcxt->cc_estate = CreateExecutorState(); chgcxt->cc_rri = (ResultRelInfo *) palloc(sizeof(ResultRelInfo)); - InitResultRelInfo(chgcxt->cc_rri, relation, 0, 0, 0); + InitResultRelInfo(chgcxt->cc_rri, relation, 0, 0, 0, NULL); ExecOpenIndices(chgcxt->cc_rri, false); /* diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index eec09ba1ded..f86e4ac67cd 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -2139,7 +2139,7 @@ ExecuteTruncateGuts(List *explicit_rels, rel, 0, /* dummy rangetable index */ NULL, - 0); + 0, NULL); estate->es_opened_result_relations = lappend(estate->es_opened_result_relations, resultRelInfo); resultRelInfo++; @@ -6338,7 +6338,8 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap) oldrel, 0, /* dummy rangetable index */ NULL, - estate->es_instrument); + estate->es_instrument, + estate->es_query_instr); MemoryContextSwitchTo(oldcontext); } diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index 979c2be4abd..82bfbc6d492 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -369,8 +369,8 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgBufferUsage or - * pgWalUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(BufferUsage), pcxt->nworkers)); @@ -1202,6 +1202,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) PVIndStats *indstats; PVShared *shared; TidStore *dead_items; + QueryInstrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; int nindexes; @@ -1305,7 +1306,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) error_context_stack = &errcallback; /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Process indexes to perform vacuum/cleanup */ parallel_vacuum_process_safe_indexes(&pvs); @@ -1313,7 +1314,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution */ buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report any remaining cost-based vacuum delay time */ diff --git a/src/backend/executor/README.instrument b/src/backend/executor/README.instrument new file mode 100644 index 00000000000..7df837dbc77 --- /dev/null +++ b/src/backend/executor/README.instrument @@ -0,0 +1,237 @@ +src/backend/executor/README.instrument + +Instrumentation +=============== + +The instrumentation subsystem measures time, buffer usage and WAL activity +during query execution and other similar activities. It is used by +EXPLAIN ANALYZE, pg_stat_statements, and other consumers that need +activity and/or timing metrics over a section of code. + +The design has two central goals: + +* Make it cheap to measure activity in a section of code, even when + that section is called many times and the aggregate is what is used + (as is the case with per-node instrumentation in the executor) + +* Ensure nested instrumentation accurately measures activity/timing, + even when execution is aborted due to errors being thrown. + +The key data structures are defined in src/include/executor/instrument.h +and the implementation lives in src/backend/executor/instrument.c. + + +Instrumentation Options +----------------------- + +Callers specify what to measure with a bitmask of InstrumentOption flags: + + INSTRUMENT_ROWS -- row counts only (used with NodeInstrumentation) + INSTRUMENT_TIMER -- wall-clock timing and row counts + INSTRUMENT_BUFFERS -- buffer hit/read/dirtied/written counts and I/O time + INSTRUMENT_WAL -- WAL records, FPI, bytes + +INSTRUMENT_BUFFERS and INSTRUMENT_WAL utilize the instrumentation stack +(described below) for efficient handling of counter values. + + +Struct Hierarchy +---------------- + +There are the following instrumentation structs, each specialized for a +different scope: + +Instrumentation Base struct. Holds timing and buffer/WAL counters. + +QueryInstrumentation Extends Instrumentation for query-level tracking. When + stack-based tracking is enabled, it owns a dedicated + MemoryContext and uses the ResourceOwner mechanism for + abort cleanup. + +NodeInstrumentation Extends Instrumentation for per-plan-node statistics + (startup time, tuple counts, loop counts, etc). + +TriggerInstrumentation Extends Instrumentation with a firing count. + + +Stack-based instrumentation +=========================== + +For tracking WAL or buffer usage counters, the specialized stack-based +instrumentation is used. + +A simple approach to measuring buffer/WAL activity in a code section could be +to have a set of global counters, snapshot all the counters at the start, and +diff them at the end. But, this is expensive in practice: BufferUsage alone +has many fields, and the diff must be computed for every InstrStartNode / +InstrStopNode cycle. + +An alternative is to write counter updates directly into the struct that +should receive them, avoiding the diff. But that has two complexities: Low-level +code such as the buffer manager, has no direct pointers to higher level +structs, such as plan nodes tracking buffer usage. And instrumentation is often +nested: We might both be interested in the aggregate buffer usage of a query, and +the individual per-node details. Stack-based instrumentation solves for that: + +At all times, there is a stack that tracks which Instrumentation is currently +active. The stack is represented by instr_stack, a per-backend global +that holds a dynamic array of Instrumentation pointers. The field +instr_stack.current always points to the current stack entry that should +be updated when activity occurs. When the stack array is empty, the +current stack points to instr_top. + +For example, if a backend has two portals open, the overall nesting of +Instrumentation and their respective InstrStart/InstrStop calls creates a +tree-like structure like this: + + Session (instr_top) + | + +-- Query A (QueryInstrumentation) + | | + | +-- NestLoop (NodeInstrumentation) + | | + | +-- Seq Scan A (NodeInstrumentation) + | +-- Seq Scan B (NodeInstrumentation) + | + +-- Query B (QueryInstrumentation) + | + +-- Seq Scan C (NodeInstrumentation) + +While executing Seq Scan B, the stack looks like: + + instr_top (implicit bottom, not in the entries array) + 0: Query A + 1: NestLoop + 2: Seq Scan B <-- instr_stack.current + +When no query is running, the stack is empty (stack_size == 0) and +instr_stack.current points to instr_top. + +Any buffer or WAL counter update (via the INSTR_BUFUSAGE_* and +INSTR_WALUSAGE_* macros in the buffer manager, WAL insertion code, etc.) +writes directly into instr_stack.current. Each instrumentation node starts +zeroed, so the values it accumulates while on top of the stack represent +exactly the activity that occurred during that time. + +Every Instrumentation node (except for instr_top) has a target, or parent, it +will be accumulated into, which is typically the Instrumentation that was the +current stack entry when it was created. + +For example, when Seq Scan A gets finalized in regular execution via ExecutorFinish, +its instrumentation data gets added to the immediate parent in +the execution tree, the NestLoop, which will then get added to Query A's +QueryInstrumentation, which then accumulates to the parent. + +While we can typically think of this as a tree, the NodeInstrumentation +underneath a particular QueryInstrumentation could behave differently -- +for example, it could propagate directly to the QueryInstrumentation, in +order to not show cumulative numbers in EXPLAIN ANALYZE. + +Note these relationships are partially implicit, especially when it comes +to NodeInstrumentation. Each QueryInstrumentation maintains a list of its +unfinalized child nodes. The parent of a QueryInstrumentation itself is +determined by the stack (see below): when a query is finalized or cleaned +up on abort, its counters are accumulated to whatever entry is then current +on the stack, which is typically instr_top. + + +Finalization and Abort Safety +============================= + +Finalization is the process of rolling up a node's buffer/WAL counters to +its parent. In normal execution, nodes are pushed onto the stack when they +start and popped when they stop; at finalization time their accumulated +counters are added to the parent. + +Due to the use of longjmp for error handling, functions can exit abruptly +without executing their normal cleanup code. On abort, two things need +to happen: + +1. The stack is reset to the level saved at the start of the aborting + (sub-)transaction level. This ensures that we don't later try to update + counters on a freed stack entry. We also need to ensure that the stack + entry that was current before a particular Instrumentation started, is + current again after it stops. + +2. Finalize all affected Instrumentation nodes, rolling up their counters + to the innermost surviving Instrumentation, so that data is not lost. + +For example, if Seq Scan B aborts while the stack is: + + instr_top (implicit bottom) + 0: Query A + 1: NestLoop + 2: Seq Scan B + +The abort handler for Query A accumulates all unfinalized children (Seq +Scan A, Seq Scan B, NestLoop) directly into Query A's counters, then +unwinds the instrumentation stack and accumulates Query A's counters to +instr_top. + +Note that on abort the children do not accumulate through each other (Seq +Scan B -> NestLoop -> Query A); they all accumulate directly to their +parent QueryInstrumentation. This means the order in which children are +released does not matter -- this is important because ResourceOwner cleanup +does not guarantee a particular release order. The per-node breakdown is lost, +but the instrumentation active when the query was started (instr_top in the +above example) survives the abort, and its counters include the activity. + +If multiple QueryInstrumentations are active on the stack (e.g. nested +portals), the abort handler of each uses InstrStopFinalize() to accumulate +the statistics to the parent entry of either the entry being released, or a +previously released entry if it was higher up in the stack, so they compose +correctly regardless of release order. + +There are two mechanisms for achieving abort safety: + +* Resource Owner (QueryInstrumentation): registers with the current + ResourceOwner at start. On transaction abort, the resource owner system + calls the release callback, which walks unfinalized child entries, + accumulates their data, unwinds the stack, and destroys the dedicated + memory context (freeing the QueryInstrumentation and all child + allocations as a unit). This is the recommended approach when the + instrumented code already has an appropriate resource owner (e.g. it + runs inside a portal). The query executor uses this path. + +* PG_FINALLY (base Instrumentation): when no suitable resource owner + exists, or when the caller wants to inspect the instrumentation data + even after an error, the base Instrumentation can be used with a + PG_TRY/PG_FINALLY block that calls InstrStopFinalize(). + +Both mechanisms add overhead, so neither is suitable for high-frequency +instrumentation like per-node measurements in the executor. Instead, +plan node and trigger children rely on their parent QueryInstrumentation +for abort safety: they are allocated in the parent's memory context and +registered in its unfinalized-entries list, so the parent's abort handler +recovers their data automatically. In normal execution, children are +finalized explicitly by the caller. + +Parallel Query +-------------- + +Parallel workers get their own QueryInstrumentation so they can measure +buffer and WAL activity independently, then copy the totals into dynamic +shared memory at worker shutdown. The leader accumulates these into its +own stack. + +When per-node instrumentation is active, parallel workers skip per-node +finalization at shutdown to avoid double-counting; the per-node data is +aggregated separately through InstrAggNode(). + + +Memory Handling +=============== + +Instrumentation objects that use the stack must survive until finalization +runs, including the abort case. To ensure this, QueryInstrumentation +creates a dedicated "Instrumentation" MemoryContext (instr_cxt) as a child +of TopMemoryContext. All child instrumentation (nodes, triggers) should be +allocated in this context. + +On successful completion, instr_cxt is reparented to CurrentMemoryContext +so its lifetime is tied to the caller's context. On abort, the +ResourceOwner cleanup frees it after accumulating the instrumentation data +to the current stack entry after resetting the stack. + +When the stack is not needed (timer/rows only), Instrumentation allocations +happen in CurrentMemoryContext instead of TopMemoryContext. diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index f71f668883c..44d4fea76eb 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -78,6 +78,7 @@ ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL; /* decls for local routines only used within this module */ static void InitPlan(QueryDesc *queryDesc, int eflags); static void CheckValidRowMarkRel(Relation rel, RowMarkType markType); +static void ExecFinalizeTriggerInstrumentation(EState *estate); static void ExecPostprocessPlan(EState *estate); static void ExecEndPlan(PlanState *planstate, EState *estate); static void ExecutePlan(QueryDesc *queryDesc, @@ -254,10 +255,18 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) * Set up query-level instrumentation if extensions have requested it via * totaltime_options. Ensure an extension has not allocated totaltime * itself. + * + * Alternatively, also set it up when running EXPLAIN (ANALYZE), as we + * utilize totaltime as the parent for node and trigger instrumentation. */ Assert(queryDesc->totaltime == NULL); - if (queryDesc->totaltime_options) - queryDesc->totaltime = InstrAlloc(queryDesc->totaltime_options); + if (queryDesc->totaltime_options || queryDesc->instrument_options) + { + estate->es_query_instr = InstrQueryAlloc(queryDesc->instrument_options | + queryDesc->totaltime_options); + + queryDesc->totaltime = &estate->es_query_instr->instr; + } /* * Set up an AFTER-trigger statement context, unless told not to, or @@ -340,9 +349,9 @@ standard_ExecutorRun(QueryDesc *queryDesc, */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - /* Allow instrumentation of Executor overall runtime */ - if (queryDesc->totaltime) - InstrStart(queryDesc->totaltime); + /* Start up instrumentation for this execution run */ + if (estate->es_query_instr) + InstrQueryStart(estate->es_query_instr); /* * extract information from the query descriptor and the query feature. @@ -393,8 +402,8 @@ standard_ExecutorRun(QueryDesc *queryDesc, if (sendTuples) dest->rShutdown(dest); - if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + if (estate->es_query_instr) + InstrQueryStop(estate->es_query_instr); MemoryContextSwitchTo(oldcontext); } @@ -443,8 +452,8 @@ standard_ExecutorFinish(QueryDesc *queryDesc) oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); /* Allow instrumentation of Executor overall runtime */ - if (queryDesc->totaltime) - InstrStart(queryDesc->totaltime); + if (estate->es_query_instr) + InstrQueryStart(estate->es_query_instr); /* Run ModifyTable nodes to completion */ ExecPostprocessPlan(estate); @@ -453,8 +462,29 @@ standard_ExecutorFinish(QueryDesc *queryDesc) if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS)) AfterTriggerEndQuery(estate); - if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + if (estate->es_query_instr) + { + /* + * Accumulate per-node and trigger statistics to their respective + * parent instrumentation stacks. + * + * We skip this in parallel workers because their per-node stats are + * reported individually via ExecParallelReportInstrumentation, and + * the leader's own ExecFinalizeNodeInstrumentation handles + * propagation. If we accumulated here, the leader would + * double-count: worker parent nodes would already include their + * children's stats, and then the leader's accumulation would add the + * children again. + */ + if (!IsParallelWorker() && estate->es_instrument) + { + ExecFinalizeNodeInstrumentation(queryDesc->planstate); + + ExecFinalizeTriggerInstrumentation(estate); + } + + InstrQueryStopFinalize(estate->es_query_instr); + } MemoryContextSwitchTo(oldcontext); @@ -1272,7 +1302,8 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, Relation resultRelationDesc, Index resultRelationIndex, ResultRelInfo *partition_root_rri, - int instrument_options) + int instrument_options, + QueryInstrumentation *qinstr) { MemSet(resultRelInfo, 0, sizeof(ResultRelInfo)); resultRelInfo->type = T_ResultRelInfo; @@ -1293,8 +1324,8 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, palloc0_array(FmgrInfo, n); resultRelInfo->ri_TrigWhenExprs = (ExprState **) palloc0_array(ExprState *, n); - if (instrument_options) - resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(n, instrument_options); + if (qinstr) + resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(qinstr, instrument_options, n); } else { @@ -1367,6 +1398,10 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, * also provides a way for EXPLAIN ANALYZE to report the runtimes of such * triggers.) So we make additional ResultRelInfo's as needed, and save them * in es_trig_target_relations. + * + * Note: if new relation lists are searched here, they must also be added to + * ExecFinalizeTriggerInstrumentation so that trigger instrumentation data + * is properly accumulated. */ ResultRelInfo * ExecGetTriggerResultRel(EState *estate, Oid relid, @@ -1433,7 +1468,8 @@ ExecGetTriggerResultRel(EState *estate, Oid relid, rel, 0, /* dummy rangetable index */ rootRelInfo, - estate->es_instrument); + estate->es_instrument, + estate->es_query_instr); estate->es_trig_target_relations = lappend(estate->es_trig_target_relations, rInfo); MemoryContextSwitchTo(oldcontext); @@ -1496,7 +1532,8 @@ ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo) /* dummy rangetable index */ InitResultRelInfo(rInfo, ancRel, 0, NULL, - estate->es_instrument); + estate->es_instrument, + estate->es_query_instr); ancResultRels = lappend(ancResultRels, rInfo); } ancResultRels = lappend(ancResultRels, rootRelInfo); @@ -1509,6 +1546,30 @@ ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo) return resultRelInfo->ri_ancestorResultRels; } +static void +ExecFinalizeTriggerInstrumentation(EState *estate) +{ + List *rels = NIL; + + rels = list_concat(rels, estate->es_tuple_routing_result_relations); + rels = list_concat(rels, estate->es_opened_result_relations); + rels = list_concat(rels, estate->es_trig_target_relations); + + foreach_node(ResultRelInfo, rInfo, rels) + { + TriggerInstrumentation *ti = rInfo->ri_TrigInstrument; + + if (ti == NULL || rInfo->ri_TrigDesc == NULL) + continue; + + for (int nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++) + { + if (ti[nt].instr.need_stack) + InstrAccumStack(&estate->es_query_instr->instr, &ti[nt].instr); + } + } +} + /* ---------------------------------------------------------------- * ExecPostprocessPlan * @@ -3066,6 +3127,7 @@ EvalPlanQualStart(EPQState *epqstate, Plan *planTree) /* es_trig_target_relations must NOT be copied */ rcestate->es_top_eflags = parentestate->es_top_eflags; rcestate->es_instrument = parentestate->es_instrument; + rcestate->es_query_instr = parentestate->es_query_instr; /* es_auxmodifytables must NOT be copied */ /* diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 1a5ec0c305f..f32aa660294 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -719,7 +719,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, * * If EXPLAIN is not in use and there are no extensions loaded that care, * we could skip this. But we have no way of knowing whether anyone's - * looking at pgBufferUsage, so do it unconditionally. + * looking at instrumentation, so do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(BufferUsage), pcxt->nworkers)); @@ -1100,14 +1100,28 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, instrument = GetInstrumentationArray(instrumentation); instrument += i * instrumentation->num_workers; for (n = 0; n < instrumentation->num_workers; ++n) + { InstrAggNode(planstate->instrument, &instrument[n]); + /* + * Also add worker WAL usage to the global pgWalUsage counter. + * + * When per-node instrumentation is active, parallel workers skip + * ExecFinalizeNodeInstrumentation (to avoid double-counting in + * EXPLAIN), so per-node WAL activity is not rolled up into the + * query-level stats that InstrAccumParallelQuery receives. Without + * this, pgWalUsage would under-report WAL generated by parallel + * workers when instrumentation is active. + */ + WalUsageAdd(&pgWalUsage, &instrument[n].instr.walusage); + } + /* * Also store the per-worker detail. * - * Worker instrumentation should be allocated in the same context as the - * regular instrumentation information, which is the per-query context. - * Switch into per-query memory context. + * Ensure worker instrumentation is allocated in the per-query context. We + * don't need to place this in the instrumentation context since no more + * stack-based instrumentation work is being done. */ oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt); ibytes = mul_size(instrumentation->num_workers, sizeof(NodeInstrumentation)); @@ -1257,9 +1271,13 @@ ExecParallelCleanup(ParallelExecutorInfo *pei) { /* Accumulate instrumentation, if any. */ if (pei->instrumentation) + { ExecParallelRetrieveInstrumentation(pei->planstate, pei->instrumentation); + ExecFinalizeWorkerInstrumentation(pei->planstate); + } + /* Accumulate JIT instrumentation, if any. */ if (pei->jit_instrumentation) ExecParallelRetrieveJitInstrumentation(pei->planstate, @@ -1491,6 +1509,7 @@ void ParallelQueryMain(dsm_segment *seg, shm_toc *toc) { FixedParallelExecutorState *fpes; + QueryInstrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; DestReceiver *receiver; @@ -1551,7 +1570,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) * leader, which also doesn't count buffer accesses and WAL activity that * occur during executor startup. */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Run the plan. If we specified a tuple bound, be careful not to demand @@ -1567,7 +1586,8 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution. */ buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report instrumentation data if any instrumentation options are set. */ diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index d96d4f9947b..6888fbe4278 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -586,7 +586,8 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, partrel, 0, rootResultRelInfo, - estate->es_instrument); + estate->es_instrument, + estate->es_query_instr); /* * Verify result relation is a valid target for an INSERT. An UPDATE of a @@ -1381,7 +1382,7 @@ ExecInitPartitionDispatchInfo(EState *estate, { ResultRelInfo *rri = makeNode(ResultRelInfo); - InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0); + InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0, NULL); proute->nonleaf_partitions[dispatchidx] = rri; } else diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 7c4c66e323f..5ca8d91344b 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -122,6 +122,8 @@ static TupleTableSlot *ExecProcNodeFirst(PlanState *node); static bool ExecShutdownNode_walker(PlanState *node, void *context); +static bool ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context); +static bool ExecFinalizeWorkerInstrumentation_walker(PlanState *node, void *context); /* ------------------------------------------------------------------------ @@ -413,7 +415,8 @@ ExecInitNode(Plan *node, EState *estate, int eflags) /* Set up instrumentation for this node if requested */ if (estate->es_instrument) - result->instrument = InstrAllocNode(estate->es_instrument, + result->instrument = InstrAllocNode(estate->es_query_instr, + estate->es_instrument, result->async_capable); return result; @@ -768,10 +771,10 @@ ExecShutdownNode_walker(PlanState *node, void *context) * at least once already. We don't expect much CPU consumption during * node shutdown, but in the case of Gather or Gather Merge, we may shut * down workers at this stage. If so, their buffer usage will get - * propagated into pgBufferUsage at this point, and we want to make sure - * that it gets associated with the Gather node. We skip this if the node - * has never been executed, so as to avoid incorrectly making it appear - * that it has. + * propagated into the current instrumentation stack entry at this point, + * and we want to make sure that it gets associated with the Gather node. + * We skip this if the node has never been executed, so as to avoid + * incorrectly making it appear that it has. */ if (node->instrument && node->instrument->running) InstrStartNode(node->instrument); @@ -809,6 +812,99 @@ ExecShutdownNode_walker(PlanState *node, void *context) return false; } +/* + * ExecFinalizeNodeInstrumentation + * + * Accumulate instrumentation stats from all execution nodes to their respective + * parents (or the original parent instrumentation). + * + * This must run after the cleanup done by ExecShutdownNode, and not rely on any + * resources cleaned up by it. We also expect shutdown actions to have occurred, + * e.g. parallel worker instrumentation to have been added to the leader. + */ +void +ExecFinalizeNodeInstrumentation(PlanState *node) +{ + (void) ExecFinalizeNodeInstrumentation_walker(node, instr_stack.current); +} + +static bool +ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) +{ + Instrumentation *parent = (Instrumentation *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + Assert(node->instrument != NULL); + + /* + * Recurse into children first (bottom-up accumulation), and accumulate to + * this node's instrumentation as the parent context. + */ + planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker, + &node->instrument->instr); + + InstrFinalizeChild(&node->instrument->instr, parent); + + return false; +} + +/* + * ExecFinalizeWorkerInstrumentation + * + * Accumulate per-worker instrumentation stats from child nodes into their + * parents, mirroring what ExecFinalizeNodeInstrumentation does for the + * leader's own stats. Without this, per-worker buffer/WAL stats shown by + * EXPLAIN (ANALYZE, VERBOSE) would only reflect each node's own direct + * activity, not including children. + * + * This must run after ExecParallelRetrieveInstrumentation has populated + * worker_instrument for all nodes in the parallel subtree. + */ +void +ExecFinalizeWorkerInstrumentation(PlanState *node) +{ + (void) ExecFinalizeWorkerInstrumentation_walker(node, NULL); +} + +static bool +ExecFinalizeWorkerInstrumentation_walker(PlanState *node, void *context) +{ + PlanState *parent = (PlanState *) context; + int num_workers; + + if (node == NULL) + return false; + + /* + * Recurse into children first (bottom-up accumulation), passing this node + * as parent context if it has worker_instrument, otherwise pass through + * the previous parent. + */ + planstate_tree_walker(node, ExecFinalizeWorkerInstrumentation_walker, + node->worker_instrument ? (void *) node : context); + + if (!node->worker_instrument) + return false; + + num_workers = node->worker_instrument->num_workers; + + /* Accumulate this node's per-worker stats to parent's per-worker stats */ + if (parent && parent->worker_instrument) + { + int parent_workers = parent->worker_instrument->num_workers; + + for (int n = 0; n < Min(num_workers, parent_workers); n++) + InstrAccumStack(&parent->worker_instrument->instrument[n].instr, + &node->worker_instrument->instrument[n].instr); + } + + return false; +} + /* * ExecSetTupleBound * diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 1eb6b9f1f40..8db2b70e5fe 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -151,6 +151,7 @@ CreateExecutorState(void) estate->es_top_eflags = 0; estate->es_instrument = 0; + estate->es_query_instr = NULL; estate->es_finished = false; estate->es_exprcontexts = NIL; @@ -227,6 +228,15 @@ FreeExecutorState(EState *estate) estate->es_partition_directory = NULL; } + /* + * Make sure the instrumentation context gets freed. This usually gets + * re-parented under the per-query context in InstrQueryStopFinalize, but + * that won't happen during EXPLAIN (BUFFERS) since ExecutorFinish never + * gets called, so we would otherwise leak it in TopMemoryContext. + */ + if (estate->es_query_instr && estate->es_query_instr->instr.need_stack) + MemoryContextDelete(estate->es_query_instr->instr_cxt); + /* * Free the per-query memory context, thereby releasing all working * memory, including the EState node itself. @@ -913,7 +923,8 @@ ExecInitResultRelation(EState *estate, ResultRelInfo *resultRelInfo, resultRelationDesc, rti, NULL, - estate->es_instrument); + estate->es_instrument, + estate->es_query_instr); if (estate->es_result_relations == NULL) estate->es_result_relations = (ResultRelInfo **) diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index dd08fc99fb2..ef1a94800f3 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -20,31 +20,53 @@ #include "nodes/execnodes.h" #include "portability/instr_time.h" #include "utils/guc_hooks.h" +#include "utils/memutils.h" +#include "utils/resowner.h" -BufferUsage pgBufferUsage; -static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; -static WalUsage save_pgWalUsage; +Instrumentation instr_top; +InstrStackState instr_stack = { + .stack_space = 0, + .stack_size = 0, + .entries = NULL, + .current = &instr_top, +}; -static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); -static void WalUsageAdd(WalUsage *dst, WalUsage *add); +void +InstrStackGrow(void) +{ + int space = instr_stack.stack_space; + + Assert(instr_stack.stack_size >= instr_stack.stack_space); + + if (instr_stack.entries == NULL) + { + space = 10; /* Allocate sufficient initial space for + * typical activity */ + instr_stack.entries = MemoryContextAlloc(TopMemoryContext, + sizeof(Instrumentation *) * space); + } + else + { + space *= 2; + instr_stack.entries = repalloc_array(instr_stack.entries, Instrumentation *, space); + } + /* Update stack space after allocation succeeded to protect against OOMs */ + instr_stack.stack_space = space; +} /* General purpose instrumentation handling */ -Instrumentation * -InstrAlloc(int instrument_options) +static inline bool +InstrNeedStack(int instrument_options) { - Instrumentation *instr = palloc0_object(Instrumentation); - - InstrInitOptions(instr, instrument_options); - return instr; + return (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL)) != 0; } void InstrInitOptions(Instrumentation *instr, int instrument_options) { - instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; - instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0; + instr->need_stack = InstrNeedStack(instrument_options); instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; } @@ -59,12 +81,8 @@ InstrStart(Instrumentation *instr) INSTR_TIME_SET_CURRENT_FAST(instr->starttime); } - /* save buffer usage totals at start, if needed */ - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; - - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + if (instr->need_stack) + InstrPushStack(instr); } /* @@ -88,14 +106,9 @@ InstrStopCommon(Instrumentation *instr, instr_time *accum_time) INSTR_TIME_SET_ZERO(instr->starttime); } - /* Add delta of buffer usage since InstrStart to the totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); - - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + /* pop the stack, unless InstrStopFinalize previously cleaned up */ + if (instr->on_stack) + InstrPopStack(instr); } void @@ -104,16 +117,279 @@ InstrStop(Instrumentation *instr) InstrStopCommon(instr, &instr->total); } +/* + * Stops instrumentation, finalizes the stack entry and accumulates to its parent. + * + * Note that this intentionally allows passing a stack that is not the current + * top, as can happen with PG_FINALLY, or resource owners, which don't have a + * guaranteed cleanup order. + */ +void +InstrStopFinalize(Instrumentation *instr) +{ + /* + * If our current node is on the stack, make sure we reset the stack to + * the parent of whichever of the released stack entries has the lowest + * index + */ + if (instr->on_stack) + { + int idx = -1; + + for (int i = instr_stack.stack_size - 1; i >= 0; i--) + { + if (instr_stack.entries[i] == instr) + { + idx = i; + break; + } + } + + if (idx < 0) + elog(ERROR, "instrumentation entry not found on stack"); + + /* Clear on_stack for any intermediate entries we're skipping over */ + for (int i = instr_stack.stack_size - 1; i > idx; i--) + instr_stack.entries[i]->on_stack = false; + + while (instr_stack.stack_size > idx + 1) + instr_stack.stack_size--; + } + + InstrStop(instr); + + /* + * Accumulate all instrumentation to the currently active instrumentation, + * so that callers get a complete picture of activity, even after an abort + */ + InstrAccumStack(instr_stack.current, instr); +} + +/* + * Finalize child instrumentation by accumulating buffer/WAL usage to the + * provided instrumentation, which may be the current entry, or one the caller + * treats as a parent and will add to the totals later. + * + * Also deletes the unfinalized entry to avoid double counting in an abort + * situation, e.g. during executor finish. + */ +void +InstrFinalizeChild(Instrumentation *instr, Instrumentation *parent) +{ + if (instr->need_stack) + { + if (!dlist_node_is_detached(&instr->unfinalized_entry)) + dlist_delete_thoroughly(&instr->unfinalized_entry); + + InstrAccumStack(parent, instr); + } +} + + +/* Query instrumentation handling */ + +/* + * Use ResourceOwner mechanism to correctly reset instr_stack on abort. + */ +static void ResOwnerReleaseInstrumentation(Datum res); +static const ResourceOwnerDesc instrumentation_resowner_desc = +{ + .name = "instrumentation", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_INSTRUMENTATION, + .ReleaseResource = ResOwnerReleaseInstrumentation, + .DebugPrint = NULL, /* default message is fine */ +}; + +static inline void +ResourceOwnerRememberInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr) +{ + ResourceOwnerRemember(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc); +} + +static inline void +ResourceOwnerForgetInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr) +{ + ResourceOwnerForget(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc); +} + +static void +ResOwnerReleaseInstrumentation(Datum res) +{ + QueryInstrumentation *qinstr = (QueryInstrumentation *) DatumGetPointer(res); + MemoryContext instr_cxt = qinstr->instr_cxt; + dlist_mutable_iter iter; + + /* Accumulate data from all unfinalized child entries (nodes, triggers) */ + dlist_foreach_modify(iter, &qinstr->unfinalized_entries) + { + Instrumentation *child = dlist_container(Instrumentation, unfinalized_entry, iter.cur); + + InstrAccumStack(&qinstr->instr, child); + } + + /* Ensure the stack is reset as expected, and we accumulate to the parent */ + InstrStopFinalize(&qinstr->instr); + + /* + * Destroy the dedicated instrumentation context, which frees the + * QueryInstrumentation and all child allocations. + */ + MemoryContextDelete(instr_cxt); +} + +QueryInstrumentation * +InstrQueryAlloc(int instrument_options) +{ + QueryInstrumentation *instr; + MemoryContext instr_cxt; + + /* + * When the instrumentation stack is used, create a dedicated memory + * context for this query's instrumentation allocations. This context is a + * child of TopMemoryContext so it survives transaction abort — + * ResourceOwner release needs to access it. + * + * For simpler cases (timer/rows only), use the current memory context. + * + * All child instrumentation allocations (nodes, triggers, etc) must be + * allocated within this context to ensure correct clean up on abort. + */ + if (InstrNeedStack(instrument_options)) + instr_cxt = AllocSetContextCreate(TopMemoryContext, + "Instrumentation", + ALLOCSET_SMALL_SIZES); + else + instr_cxt = CurrentMemoryContext; + + instr = MemoryContextAllocZero(instr_cxt, sizeof(QueryInstrumentation)); + instr->instrument_options = instrument_options; + instr->instr_cxt = instr_cxt; + + InstrInitOptions(&instr->instr, instrument_options); + dlist_init(&instr->unfinalized_entries); + + return instr; +} + +void +InstrQueryStart(QueryInstrumentation *qinstr) +{ + InstrStart(&qinstr->instr); + + if (qinstr->instr.need_stack) + { + Assert(CurrentResourceOwner != NULL); + qinstr->owner = CurrentResourceOwner; + + ResourceOwnerEnlarge(qinstr->owner); + ResourceOwnerRememberInstrumentation(qinstr->owner, qinstr); + } +} + +void +InstrQueryStop(QueryInstrumentation *qinstr) +{ + InstrStop(&qinstr->instr); + + if (qinstr->instr.need_stack) + { + Assert(qinstr->owner != NULL); + ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr); + qinstr->owner = NULL; + } +} + +void +InstrQueryStopFinalize(QueryInstrumentation *qinstr) +{ + InstrStopFinalize(&qinstr->instr); + + if (!qinstr->instr.need_stack) + { + Assert(qinstr->owner == NULL); + return; + } + + Assert(qinstr->owner != NULL); + ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr); + qinstr->owner = NULL; + + /* + * Reparent the dedicated instrumentation context under the current memory + * context, so that its lifetime is now tied to the caller's context + * rather than TopMemoryContext. + */ + MemoryContextSetParent(qinstr->instr_cxt, CurrentMemoryContext); +} + +/* + * Register a child Instrumentation entry for abort processing. + * + * On abort, ResOwnerReleaseInstrumentation will walk the parent's list to + * recover buffer/WAL data from entries that were never finalized, in order for + * aggregate totals to be accurate despite the query erroring out. + */ +void +InstrQueryRememberChild(QueryInstrumentation *parent, Instrumentation *child) +{ + if (child->need_stack) + dlist_push_head(&parent->unfinalized_entries, &child->unfinalized_entry); +} + +/* start instrumentation during parallel executor startup */ +QueryInstrumentation * +InstrStartParallelQuery(void) +{ + QueryInstrumentation *qinstr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + + InstrQueryStart(qinstr); + return qinstr; +} + +/* report usage after parallel executor shutdown */ +void +InstrEndParallelQuery(QueryInstrumentation *qinstr, BufferUsage *bufusage, WalUsage *walusage) +{ + InstrQueryStopFinalize(qinstr); + memcpy(bufusage, &qinstr->instr.bufusage, sizeof(BufferUsage)); + memcpy(walusage, &qinstr->instr.walusage, sizeof(WalUsage)); +} + +/* + * Accumulate work done by parallel workers in the leader's stats. + * + * Note that what gets added here effectively depends on whether per-node + * instrumentation is active. If it's active the parallel worker intentionally + * skips ExecFinalizeNodeInstrumentation on executor shutdown, because it would + * cause double counting. Instead, this only accumulates any extra activity + * outside of nodes. + * + * Otherwise this is responsible for making sure that the complete query + * activity is accumulated. + */ +void +InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +{ + BufferUsageAdd(&instr_stack.current->bufusage, bufusage); + WalUsageAdd(&instr_stack.current->walusage, walusage); + + WalUsageAdd(&pgWalUsage, walusage); +} + /* Node instrumentation handling */ /* Allocate new node instrumentation structure */ NodeInstrumentation * -InstrAllocNode(int instrument_options, bool async_mode) +InstrAllocNode(QueryInstrumentation *qinstr, int instrument_options, + bool async_mode) { - NodeInstrumentation *instr = palloc_object(NodeInstrumentation); + NodeInstrumentation *instr = MemoryContextAlloc(qinstr->instr_cxt, sizeof(NodeInstrumentation)); InstrInitNode(instr, instrument_options, async_mode); + InstrQueryRememberChild(qinstr, &instr->instr); + return instr; } @@ -133,6 +409,7 @@ InstrStartNode(NodeInstrumentation *instr) InstrStart(&instr->instr); } + /* Exit from a plan node */ inline void InstrStopNode(NodeInstrumentation *instr, double nTuples) @@ -203,8 +480,8 @@ InstrEndLoop(NodeInstrumentation *instr) if (!instr->running) return; - if (!INSTR_TIME_IS_ZERO(instr->instr.starttime)) - elog(ERROR, "InstrEndLoop called on running node"); + /* Ensure InstrNodeStop was called */ + Assert(INSTR_TIME_IS_ZERO(instr->instr.starttime)); /* Accumulate per-cycle statistics into totals */ INSTR_TIME_ADD(instr->startup, instr->firsttuple); @@ -237,22 +514,30 @@ InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add) dst->nfiltered1 += add->nfiltered1; dst->nfiltered2 += add->nfiltered2; - if (dst->instr.need_bufusage) - BufferUsageAdd(&dst->instr.bufusage, &add->instr.bufusage); - - if (dst->instr.need_walusage) - WalUsageAdd(&dst->instr.walusage, &add->instr.walusage); + if (dst->instr.need_stack) + InstrAccumStack(&dst->instr, &add->instr); } /* Trigger instrumentation handling */ TriggerInstrumentation * -InstrAllocTrigger(int n, int instrument_options) +InstrAllocTrigger(QueryInstrumentation *qinstr, int instrument_options, int n) { - TriggerInstrumentation *tginstr = palloc0_array(TriggerInstrumentation, n); + TriggerInstrumentation *tginstr; int i; + /* + * Allocate in the query's dedicated instrumentation context so all + * instrumentation data is grouped together and cleaned up as a unit. + */ + Assert(qinstr != NULL && qinstr->instr_cxt != NULL); + tginstr = MemoryContextAllocZero(qinstr->instr_cxt, + n * sizeof(TriggerInstrumentation)); + for (i = 0; i < n; i++) + { InstrInitOptions(&tginstr[i].instr, instrument_options); + InstrQueryRememberChild(qinstr, &tginstr[i].instr); + } return tginstr; } @@ -266,38 +551,30 @@ InstrStartTrigger(TriggerInstrumentation *tginstr) void InstrStopTrigger(TriggerInstrumentation *tginstr, int64 firings) { + /* + * This trigger may be called again, so we don't finalize instrumentation + * here. Accumulation to the parent happens at ExecutorFinish through + * ExecFinalizeTriggerInstrumentation. + */ InstrStop(&tginstr->instr); tginstr->firings += firings; } -/* note current values during parallel executor startup */ void -InstrStartParallelQuery(void) +InstrAccumStack(Instrumentation *dst, Instrumentation *add) { - save_pgBufferUsage = pgBufferUsage; - save_pgWalUsage = pgWalUsage; -} + Assert(dst != NULL); + Assert(add != NULL); -/* report usage after parallel executor shutdown */ -void -InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) -{ - memset(bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage); - memset(walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage); -} + if (!add->need_stack) + return; -/* accumulate work done by workers in leader's stats */ -void -InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) -{ - BufferUsageAdd(&pgBufferUsage, bufusage); - WalUsageAdd(&pgWalUsage, walusage); + BufferUsageAdd(&dst->bufusage, &add->bufusage); + WalUsageAdd(&dst->walusage, &add->walusage); } /* dst += add */ -static void +void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) { dst->shared_blks_hit += add->shared_blks_hit; @@ -318,39 +595,9 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time); } -/* dst += add - sub */ +/* dst += add */ void -BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, - const BufferUsage *sub) -{ - dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit; - dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read; - dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied; - dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written; - dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit; - dst->local_blks_read += add->local_blks_read - sub->local_blks_read; - dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied; - dst->local_blks_written += add->local_blks_written - sub->local_blks_written; - dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; - dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time, - add->shared_blk_read_time, sub->shared_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time, - add->shared_blk_write_time, sub->shared_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time, - add->local_blk_read_time, sub->local_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time, - add->local_blk_write_time, sub->local_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time, - add->temp_blk_read_time, sub->temp_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time, - add->temp_blk_write_time, sub->temp_blk_write_time); -} - -/* helper functions for WAL usage accumulation */ -static void -WalUsageAdd(WalUsage *dst, WalUsage *add) +WalUsageAdd(WalUsage *dst, const WalUsage *add) { dst->wal_bytes += add->wal_bytes; dst->wal_records += add->wal_records; diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index b38170f0fbe..a829ddf5acb 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -904,7 +904,7 @@ create_edata_for_relation(LogicalRepRelMapEntry *rel) * Use Relation opened by logicalrep_rel_open() instead of opening it * again. */ - InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, 0); + InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, 0, NULL); /* * We put the ResultRelInfo in the es_opened_result_relations list, even diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 3e1c39160db..cf4f4246ca2 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1266,9 +1266,9 @@ PinBufferForBlock(Relation rel, if (rel) { /* - * While pgBufferUsage's "read" counter isn't bumped unless we reach - * WaitReadBuffers() (so, not for hits, and not for buffers that are - * zeroed instead), the per-relation stats always count them. + * While the current buffer usage "read" counter isn't bumped unless + * we reach WaitReadBuffers() (so, not for hits, and not for buffers + * that are zeroed instead), the per-relation stats always count them. */ pgstat_count_buffer_read(rel); } diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index e3829d7fe7c..e7fc7f071d8 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -114,9 +114,9 @@ pgstat_prepare_io_time(bool track_io_guc) * pg_stat_database only counts block read and write times, these are done for * IOOP_READ, IOOP_WRITE and IOOP_EXTEND. * - * pgBufferUsage is used for EXPLAIN. pgBufferUsage has write and read stats - * for shared, local and temporary blocks. pg_stat_io does not track the - * activity of temporary blocks, so these are ignored here. + * Executor instrumentation is used for EXPLAIN. Buffer usage tracked there has + * write and read stats for shared, local and temporary blocks. pg_stat_io + * does not track the activity of temporary blocks, so these are ignored here. */ void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 491c4886506..78961ae058b 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -233,6 +233,7 @@ ExecGetJunkAttribute(TupleTableSlot *slot, AttrNumber attno, bool *isNull) /* * prototypes from functions in execMain.c */ +typedef struct QueryInstrumentation QueryInstrumentation; extern void ExecutorStart(QueryDesc *queryDesc, int eflags); extern void standard_ExecutorStart(QueryDesc *queryDesc, int eflags); extern void ExecutorRun(QueryDesc *queryDesc, @@ -254,7 +255,8 @@ extern void InitResultRelInfo(ResultRelInfo *resultRelInfo, Relation resultRelationDesc, Index resultRelationIndex, ResultRelInfo *partition_root_rri, - int instrument_options); + int instrument_options, + QueryInstrumentation *qinstr); extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid, ResultRelInfo *rootRelInfo); extern List *ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo); @@ -301,6 +303,8 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function); extern Node *MultiExecProcNode(PlanState *node); extern void ExecEndNode(PlanState *node); extern void ExecShutdownNode(PlanState *node); +extern void ExecFinalizeNodeInstrumentation(PlanState *node); +extern void ExecFinalizeWorkerInstrumentation(PlanState *node); extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node); diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 4430c222493..6ee4ce2b521 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -13,6 +13,7 @@ #ifndef INSTRUMENT_H #define INSTRUMENT_H +#include "lib/ilist.h" #include "portability/instr_time.h" @@ -68,29 +69,92 @@ typedef enum InstrumentOption } InstrumentOption; /* - * General purpose instrumentation that can capture time and WAL/buffer usage + * Instrumentation base class for capturing time and WAL/buffer usage * - * Initialized through InstrAlloc, followed by one or more calls to a pair of - * InstrStart/InstrStop (activity is measured in between). + * If used directly: + * - Allocate on the stack and zero initialize the struct + * - Call InstrInitOptions to set instrumentation options + * - Call InstrStart before the activity you want to measure + * - Call InstrStop / InstrStopFinalize after the activity to capture totals + * + * InstrStart/InstrStop may be called multiple times. The last stop call must + * be to InstrStopFinalize to ensure parent stack entries get the accumulated + * totals. If there is risk of transaction aborts you must call + * InstrStopFinalize in a PG_TRY/PG_FINALLY block to avoid corrupting the + * instrumentation stack. + * + * In a query context use QueryInstrumentation instead, which handles aborts + * using the resource owner logic. */ typedef struct Instrumentation { /* Parameters set at creation: */ bool need_timer; /* true if we need timer data */ - bool need_bufusage; /* true if we need buffer usage data */ - bool need_walusage; /* true if we need WAL usage data */ + bool need_stack; /* true if we need WAL/buffer usage data */ /* Internal state keeping: */ + bool on_stack; /* true if currently on instr_stack */ instr_time starttime; /* start time of last InstrStart */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics: */ instr_time total; /* total runtime */ BufferUsage bufusage; /* total buffer usage */ WalUsage walusage; /* total WAL usage */ + /* Abort handling: link in parent QueryInstrumentation's unfinalized list */ + dlist_node unfinalized_entry; } Instrumentation; +/* + * Query-related instrumentation tracking. + * + * Usage: + * - Allocate on the heap using InstrQueryAlloc (required for abort handling) + * - Call InstrQueryStart before the activity you want to measure + * - Call InstrQueryStop / InstrQueryStopFinalize afterwards to capture totals + * + * InstrQueryStart/InstrQueryStop may be called multiple times. The last stop + * call must be to InstrQueryStopFinalize to ensure parent stack entries get + * the accumulated totals. + * + * Uses resource owner mechanism for handling aborts, as such, the caller + * *must* not exit out of the top level transaction after having called + * InstrQueryStart, without first calling InstrQueryStop or + * InstrQueryStopFinalize. In the case of a transaction abort, logic equivalent + * to InstrQueryStopFinalize will be called automatically. + */ +struct ResourceOwnerData; +typedef struct QueryInstrumentation +{ + Instrumentation instr; + + /* Original instrument_options flags used to create this instrumentation */ + int instrument_options; + + /* Resource owner used for cleanup for aborts between InstrStart/InstrStop */ + struct ResourceOwnerData *owner; + + /* + * Dedicated memory context for all instrumentation allocations belonging + * to this query (node instrumentation, trigger instrumentation, etc.). + * Initially a child of TopMemoryContext so it survives transaction abort + * for ResourceOwner cleanup, which is then reassigned to the current + * memory context on InstrQueryStopFinalize. + */ + MemoryContext instr_cxt; + + /* + * Child entries that need to be cleaned up on abort, since they are not + * registered as a resource owner themselves. Contains both node and + * trigger instrumentation entries linked via instr.unfinalized_entry. + */ + dlist_head unfinalized_entries; +} QueryInstrumentation; + /* * Specialized instrumentation for per-node execution statistics + * + * Relies on an outer QueryInstrumentation having been set up to handle the + * stack used for WAL/buffer usage statistics, and relies on it for managing + * aborts. Solely intended for the executor and anyone reporting about its + * activities (e.g. EXPLAIN ANALYZE). */ typedef struct NodeInstrumentation { @@ -111,6 +175,10 @@ typedef struct NodeInstrumentation double nfiltered2; /* # of tuples removed by "other" quals */ } NodeInstrumentation; +/* + * Care must be taken with any pointers contained within this struct, as this + * gets copied across processes during parallel query execution. + */ typedef struct WorkerNodeInstrumentation { int num_workers; /* # of structures that follow */ @@ -124,15 +192,105 @@ typedef struct TriggerInstrumentation * was fired */ } TriggerInstrumentation; -extern PGDLLIMPORT BufferUsage pgBufferUsage; +/* + * Dynamic array-based stack for tracking current WAL/buffer usage context. + * + * When the stack is empty, 'current' points to instr_top which accumulates + * session-level totals. + */ +typedef struct InstrStackState +{ + int stack_space; /* allocated capacity of entries array */ + int stack_size; /* current number of entries */ + + Instrumentation **entries; /* dynamic array of pointers */ + Instrumentation *current; /* top of stack, or &instr_top when empty */ +} InstrStackState; + extern PGDLLIMPORT WalUsage pgWalUsage; -extern Instrumentation *InstrAlloc(int instrument_options); +/* + * The top instrumentation represents a running total of the current backend + * WAL/buffer usage information. This will not be updated immediately, but + * rather when the current stack entry gets accumulated which typically happens + * at query end. + * + * Care must be taken when utilizing this in the parallel worker context: + * Parallel workers will report back their instrumentation to the caller, + * and this gets added to the caller's stack. If this were to be used in the + * shared memory stats infrastructure it would need to be skipped on parallel + * workers to avoid double counting. + */ +extern PGDLLIMPORT Instrumentation instr_top; + +/* + * The instrumentation stack state. The 'current' field points to the + * currently active stack entry that is getting updated as activity happens, + * and will be accumulated to parent stacks when it gets finalized by + * InstrStop (for non-executor use cases), ExecFinalizeNodeInstrumentation + * (executor finish) or ResOwnerReleaseInstrumentation on abort. + */ +extern PGDLLIMPORT InstrStackState instr_stack; + +extern void InstrStackGrow(void); + +/* + * Pushes the stack so that all WAL/buffer usage updates go to the passed in + * instrumentation entry. + * + * See note on InstrPopStack regarding safe use of these functions. + */ +static inline void +InstrPushStack(Instrumentation *instr) +{ + if (unlikely(instr_stack.stack_size == instr_stack.stack_space)) + InstrStackGrow(); + + instr_stack.entries[instr_stack.stack_size++] = instr; + instr_stack.current = instr; + instr->on_stack = true; +} + +/* + * Pops the stack entry back to the previous one that was effective at + * InstrPushStack. + * + * Callers must ensure that no intermediate stack entries are skipped, to + * handle aborts correctly. If you're thinking of calling this in a PG_FINALLY + * block, consider instead using InstrStart + InstrStopFinalize which can skip + * intermediate stack entries. + */ +static inline void +InstrPopStack(Instrumentation *instr) +{ + Assert(instr_stack.stack_size > 0); + Assert(instr_stack.entries[instr_stack.stack_size - 1] == instr); + instr_stack.stack_size--; + instr_stack.current = instr_stack.stack_size > 0 + ? instr_stack.entries[instr_stack.stack_size - 1] + : &instr_top; + instr->on_stack = false; +} + extern void InstrInitOptions(Instrumentation *instr, int instrument_options); extern void InstrStart(Instrumentation *instr); extern void InstrStop(Instrumentation *instr); +extern void InstrStopFinalize(Instrumentation *instr); +extern void InstrFinalizeChild(Instrumentation *instr, Instrumentation *parent); +extern void InstrAccumStack(Instrumentation *dst, Instrumentation *add); -extern NodeInstrumentation *InstrAllocNode(int instrument_options, +extern QueryInstrumentation *InstrQueryAlloc(int instrument_options); +extern void InstrQueryStart(QueryInstrumentation *instr); +extern void InstrQueryStop(QueryInstrumentation *instr); +extern void InstrQueryStopFinalize(QueryInstrumentation *instr); +extern void InstrQueryRememberChild(QueryInstrumentation *parent, Instrumentation *instr); + +pg_nodiscard extern QueryInstrumentation *InstrStartParallelQuery(void); +extern void InstrEndParallelQuery(QueryInstrumentation *qinstr, BufferUsage *bufusage, WalUsage *walusage); +extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); + +extern NodeInstrumentation *InstrAllocNode(QueryInstrumentation *qinstr, + int instrument_options, bool async_mode); extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options, bool async_mode); @@ -146,35 +304,36 @@ typedef struct TupleTableSlot TupleTableSlot; typedef struct PlanState PlanState; extern TupleTableSlot *ExecProcNodeInstr(PlanState *node); -extern TriggerInstrumentation *InstrAllocTrigger(int n, int instrument_options); +extern TriggerInstrumentation *InstrAllocTrigger(QueryInstrumentation *qinstr, + int instrument_options, int n); extern void InstrStartTrigger(TriggerInstrumentation *tginstr); extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int64 firings); -extern void InstrStartParallelQuery(void); -extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); -extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); -extern void BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, const BufferUsage *sub); +extern void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); +extern void WalUsageAdd(WalUsage *dst, const WalUsage *add); extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub); #define INSTR_BUFUSAGE_INCR(fld) do { \ - pgBufferUsage.fld++; \ + instr_stack.current->bufusage.fld++; \ } while(0) #define INSTR_BUFUSAGE_ADD(fld,val) do { \ - pgBufferUsage.fld += (val); \ + instr_stack.current->bufusage.fld += (val); \ } while(0) #define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ - INSTR_TIME_ADD(pgBufferUsage.fld, val); \ + INSTR_TIME_ADD(instr_stack.current->bufusage.fld, val); \ } while (0) #define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ + INSTR_TIME_ACCUM_DIFF(instr_stack.current->bufusage.fld, endval, startval); \ } while (0) + #define INSTR_WALUSAGE_INCR(fld) do { \ pgWalUsage.fld++; \ + instr_stack.current->walusage.fld++; \ } while(0) #define INSTR_WALUSAGE_ADD(fld,val) do { \ pgWalUsage.fld += (val); \ + instr_stack.current->walusage.fld += (val); \ } while(0) #endif /* INSTRUMENT_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 3ecae7552fc..491c4e272d8 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -54,6 +54,7 @@ typedef struct Instrumentation Instrumentation; typedef struct pairingheap pairingheap; typedef struct PlanState PlanState; typedef struct QueryEnvironment QueryEnvironment; +typedef struct QueryInstrumentation QueryInstrumentation; typedef struct RelationData *Relation; typedef Relation *RelationPtr; typedef struct ScanKeyData ScanKeyData; @@ -754,6 +755,7 @@ typedef struct EState int es_top_eflags; /* eflags passed to ExecutorStart */ int es_instrument; /* OR of InstrumentOption flags */ + QueryInstrumentation *es_query_instr; /* query-level instrumentation */ bool es_finished; /* true when ExecutorFinish is done */ List *es_exprcontexts; /* List of ExprContexts within EState */ diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h index eb6033b4fdb..5463bc921f0 100644 --- a/src/include/utils/resowner.h +++ b/src/include/utils/resowner.h @@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority; #define RELEASE_PRIO_SNAPSHOT_REFS 500 #define RELEASE_PRIO_FILES 600 #define RELEASE_PRIO_WAITEVENTSETS 700 +#define RELEASE_PRIO_INSTRUMENTATION 800 /* 0 is considered invalid */ #define RELEASE_PRIO_FIRST 1 diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index a998bb5e882..32b866611c9 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1361,6 +1361,7 @@ InjectionPointSharedState InjectionPointsCtl InlineCodeBlock InsertStmt +InstrStackState Instrumentation Int128AggState Int8TransTypeData @@ -2484,6 +2485,7 @@ QueryCompletion QueryDesc QueryEnvironment QueryInfo +QueryInstrumentation QueryItem QueryItemType QueryMode -- 2.47.1