From d904454f4a54a3793dcbd8812a0cbfc821fb979f Mon Sep 17 00:00:00 2001 From: Anthonin Bonnefoy Date: Mon, 15 May 2023 10:00:08 +0200 Subject: [PATCH] Add pg_tracing extension to contrib This extension adds the possibility to create server-side spans for distributed tracing. When a query is sampled, the extension will create spans tracking the process of this query: Parse, Planner, Executor... During ExecutorEnd, the planstate is also converted into spans, effectively generating an explain analyze in a trace format. The spans are exposed through a function call. Sending spans to a trace collector is outside this extension's scope. --- contrib/Makefile | 1 + contrib/meson.build | 1 + contrib/pg_tracing/Makefile | 27 + contrib/pg_tracing/expected/cleanup.out | 4 + contrib/pg_tracing/expected/create_table.out | 1 + contrib/pg_tracing/expected/nested.out | 145 ++ contrib/pg_tracing/expected/parallel.out | 82 + contrib/pg_tracing/expected/reset.out | 13 + contrib/pg_tracing/expected/sample.out | 156 ++ contrib/pg_tracing/expected/select.out | 229 ++ contrib/pg_tracing/expected/subxact.out | 75 + contrib/pg_tracing/expected/utility.out | 214 ++ contrib/pg_tracing/expected/wal.out | 96 + contrib/pg_tracing/meson.build | 52 + contrib/pg_tracing/pg_tracing--1.0.sql | 99 + contrib/pg_tracing/pg_tracing.c | 2313 ++++++++++++++++++ contrib/pg_tracing/pg_tracing.conf | 1 + contrib/pg_tracing/pg_tracing.control | 5 + contrib/pg_tracing/pg_tracing.h | 43 + contrib/pg_tracing/pg_tracing_explain.c | 527 ++++ contrib/pg_tracing/pg_tracing_explain.h | 20 + contrib/pg_tracing/query_process.c | 473 ++++ contrib/pg_tracing/query_process.h | 30 + contrib/pg_tracing/span.c | 295 +++ contrib/pg_tracing/span.h | 159 ++ contrib/pg_tracing/sql/cleanup.sql | 4 + contrib/pg_tracing/sql/nested.sql | 102 + contrib/pg_tracing/sql/parallel.sql | 16 + contrib/pg_tracing/sql/reset.sql | 3 + contrib/pg_tracing/sql/sample.sql | 41 + contrib/pg_tracing/sql/select.sql | 81 + 
contrib/pg_tracing/sql/subxact.sql | 18 + contrib/pg_tracing/sql/utility.sql | 128 + contrib/pg_tracing/sql/wal.sql | 32 + src/backend/executor/instrument.c | 1 + src/include/executor/instrument.h | 1 + 36 files changed, 5488 insertions(+) create mode 100644 contrib/pg_tracing/Makefile create mode 100644 contrib/pg_tracing/expected/cleanup.out create mode 100644 contrib/pg_tracing/expected/create_table.out create mode 100644 contrib/pg_tracing/expected/nested.out create mode 100644 contrib/pg_tracing/expected/parallel.out create mode 100644 contrib/pg_tracing/expected/reset.out create mode 100644 contrib/pg_tracing/expected/sample.out create mode 100644 contrib/pg_tracing/expected/select.out create mode 100644 contrib/pg_tracing/expected/subxact.out create mode 100644 contrib/pg_tracing/expected/utility.out create mode 100644 contrib/pg_tracing/expected/wal.out create mode 100644 contrib/pg_tracing/meson.build create mode 100644 contrib/pg_tracing/pg_tracing--1.0.sql create mode 100644 contrib/pg_tracing/pg_tracing.c create mode 100644 contrib/pg_tracing/pg_tracing.conf create mode 100644 contrib/pg_tracing/pg_tracing.control create mode 100644 contrib/pg_tracing/pg_tracing.h create mode 100644 contrib/pg_tracing/pg_tracing_explain.c create mode 100644 contrib/pg_tracing/pg_tracing_explain.h create mode 100644 contrib/pg_tracing/query_process.c create mode 100644 contrib/pg_tracing/query_process.h create mode 100644 contrib/pg_tracing/span.c create mode 100644 contrib/pg_tracing/span.h create mode 100644 contrib/pg_tracing/sql/cleanup.sql create mode 100644 contrib/pg_tracing/sql/nested.sql create mode 100644 contrib/pg_tracing/sql/parallel.sql create mode 100644 contrib/pg_tracing/sql/reset.sql create mode 100644 contrib/pg_tracing/sql/sample.sql create mode 100644 contrib/pg_tracing/sql/select.sql create mode 100644 contrib/pg_tracing/sql/subxact.sql create mode 100644 contrib/pg_tracing/sql/utility.sql create mode 100644 contrib/pg_tracing/sql/wal.sql diff --git 
a/contrib/Makefile b/contrib/Makefile index da4e2316a3..721a58fcde 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -36,6 +36,7 @@ SUBDIRS = \ pg_prewarm \ pg_stat_statements \ pg_surgery \ + pg_tracing \ pg_trgm \ pgrowlocks \ pgstattuple \ diff --git a/contrib/meson.build b/contrib/meson.build index 84d4e18561..f73caf95f5 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -47,6 +47,7 @@ subdir('pgrowlocks') subdir('pg_stat_statements') subdir('pgstattuple') subdir('pg_surgery') +subdir('pg_tracing') subdir('pg_trgm') subdir('pg_visibility') subdir('pg_walinspect') diff --git a/contrib/pg_tracing/Makefile b/contrib/pg_tracing/Makefile new file mode 100644 index 0000000000..7c169da228 --- /dev/null +++ b/contrib/pg_tracing/Makefile @@ -0,0 +1,27 @@ +# contrib/pg_tracing/Makefile + +MODULE_big = pg_tracing +OBJS = \ + $(WIN32RES) \ + pg_tracing.o \ + query_process.o \ + pg_tracing_explain.o \ + span.o + +EXTENSION = pg_tracing +DATA = pg_tracing--1.0.sql +PGFILEDESC = "pg_tracing - Distributed tracing for postgres" + +REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_tracing/pg_tracing.conf +REGRESS = utility select sample subxact reset nested wal parallel cleanup + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_tracing +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_tracing/expected/cleanup.out b/contrib/pg_tracing/expected/cleanup.out new file mode 100644 index 0000000000..beb88da53f --- /dev/null +++ b/contrib/pg_tracing/expected/cleanup.out @@ -0,0 +1,4 @@ +-- Drop test table and test functions +DROP TABLE pg_tracing_test; +DROP function test_function; +DROP EXTENSION pg_tracing; diff --git a/contrib/pg_tracing/expected/create_table.out b/contrib/pg_tracing/expected/create_table.out new file mode 100644 index 0000000000..87875a113f --- /dev/null +++ b/contrib/pg_tracing/expected/create_table.out @@ -0,0 +1 @@ +CREATE TABLE pg_tracing_test (a int, b char(20)); diff --git a/contrib/pg_tracing/expected/nested.out b/contrib/pg_tracing/expected/nested.out new file mode 100644 index 0000000000..ab0cd0c5c6 --- /dev/null +++ b/contrib/pg_tracing/expected/nested.out @@ -0,0 +1,145 @@ +-- Create test function to sample +CREATE OR REPLACE FUNCTION test_function(a int) RETURNS SETOF oid AS +$BODY$ +BEGIN + RETURN QUERY SELECT oid from pg_class where oid = a; +END; +$BODY$ +LANGUAGE plpgsql; +-- Trace a statement with a function call +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000051-0000000000000051-01'*/ select test_function(1); + test_function +--------------- +(0 rows) + +-- The test function call will generate the following spans (non exhaustive list): +-- +-----------------------------------------------------------------------------------------------+ +-- | A: Select test_function(1); | +-- +-+--------++----------++----------------++---------------------------------------------------+-+ +-- |B: Parse||C: Planner||D: ExecutorStart||E: ExecutorRun | +-- +--------++----------++----------------+++-------------------------------------------------++ +-- |F: ProjectSet | +-- ++---------+------+------------------------------++ +-- |G: Result| | H: Select a from b where... 
| +-- +---------+ +----+--------------+----------+ +-- |I: ExecutorRun| +-- +--------------+ +-- Gather span_id, span start and span end of function call statement +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=81 and name!='Parse' \gset +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_a_id and name='Executor' and resource='Run' \gset +SELECT span_id AS span_f_id, + get_span_start_ns(span_start, span_start_ns) as span_f_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_f_end + from pg_tracing_peek_spans where parent_id=:span_e_id and name='ProjectSet' \gset +SELECT span_id AS span_g_id, + get_span_start_ns(span_start, span_start_ns) as span_g_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_g_end + from pg_tracing_peek_spans where parent_id=:span_f_id and name='Result' \gset +SELECT span_id AS span_h_id, + get_span_start_ns(span_start, span_start_ns) as span_h_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_h_end + from pg_tracing_peek_spans where parent_id=:span_f_id and name='Select' \gset +SELECT span_id AS span_i_id, + get_span_start_ns(span_start, span_start_ns) as span_i_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_i_end + from pg_tracing_peek_spans where parent_id=:span_h_id and resource='Run' \gset +-- Check that spans' start and end are within expection +SELECT :span_a_start < :span_e_start AS top_query_before_run, + :span_a_end >= :span_e_end AS top_end_after_run_end, + :span_e_start <= :span_f_start AS top_run_start_before_project, + :span_e_end >= :span_f_end AS top_run_end_after_project_end, + :span_e_end >= :span_h_end AS 
top_run_end_before_select_end, + :span_e_end >= :span_i_end AS top_run_end_after_nested_run_end, + :span_i_end <= :span_h_end AS run_end_after_select_end; + top_query_before_run | top_end_after_run_end | top_run_start_before_project | top_run_end_after_project_end | top_run_end_before_select_end | top_run_end_after_nested_run_end | run_end_after_select_end +----------------------+-----------------------+------------------------------+-------------------------------+-------------------------------+----------------------------------+-------------------------- + t | t | t | t | t | t | t +(1 row) + +-- Check that the root span is the longest one +WITH max_duration AS (select max(duration) from pg_tracing_peek_spans) +SELECT duration = max_duration.max from pg_tracing_peek_spans, max_duration + where span_id = :span_a_id; + ?column? +---------- + t +(1 row) + +-- Check that ExecutorRun is attached to the nested top span +SELECT resource from pg_tracing_peek_spans where parent_id=:span_i_id order by resource; + resource +--------------------------------------------------------------------------------- + IndexOnlyScan using pg_class_oid_index on pg_class|Index Cond: (oid = '1'::oid) +(1 row) + +-- Check tracking with top tracking +set pg_tracing.track = 'top'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000052-0000000000000052-01'*/ select test_function(1); + test_function +--------------- +(0 rows) + +SELECT count(*) from pg_tracing_consume_spans where trace_id=82; + count +------- + 10 +(1 row) + +-- Check tracking with no tracking +set pg_tracing.track = 'none'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000053-0000000000000053-01'*/ select test_function(1); + test_function +--------------- +(0 rows) + +SELECT count(*) from pg_tracing_consume_spans where trace_id=83; + count +------- + 0 +(1 row) + +-- Reset tracking setting +set pg_tracing.track = 'all'; +-- Create test procedure +CREATE OR REPLACE PROCEDURE sum_one(i int) 
AS $$ +DECLARE + r int; +BEGIN + SELECT (i + i)::int INTO r; +END; $$ LANGUAGE plpgsql; +-- Test tracking of procedure with utility tracking enabled +set pg_tracing.track_utility=on; +/*traceparent='00-00000000000000000000000000000054-0000000000000054-01'*/ CALL sum_one(3); +SELECT resource from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + resource +--------------------- + CALL sum_one(3); + Parse + Post Parse + CALL sum_one(3); + Parse + SELECT (i + i)::int + Post Parse + Planner + Planner + Start + Run + Result + Finish + End +(14 rows) + +-- Test again with utility tracking disabled +set pg_tracing.track_utility=off; +/*traceparent='00-00000000000000000000000000000055-0000000000000055-01'*/ CALL sum_one(10); +SELECT resource from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + resource +---------- +(0 rows) + diff --git a/contrib/pg_tracing/expected/parallel.out b/contrib/pg_tracing/expected/parallel.out new file mode 100644 index 0000000000..9906a3e861 --- /dev/null +++ b/contrib/pg_tracing/expected/parallel.out @@ -0,0 +1,82 @@ +begin; +-- encourage use of parallel plans +set parallel_setup_cost=0; +set parallel_tuple_cost=0; +set min_parallel_table_scan_size=0; +set max_parallel_workers_per_gather=2; +-- Trace parallel queries +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ select 1 from pg_class limit 1; + ?column? +---------- + 1 +(1 row) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ select 2 from pg_class limit 2; + ?column? +---------- + 2 + 2 +(2 rows) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-00'*/ select 3 from pg_class limit 3; + ?column? 
+---------- + 3 + 3 + 3 +(3 rows) + +commit; +-- Check order and number of spans generated by parallel queries +-- We can't order by span start due to the parallelisation introducing randomness +SELECT name, resource from pg_tracing_consume_spans order by resource; + name | resource +------------+----------------------------------- + Executor | End + Executor | End + Executor | End + Executor | End + Executor | End + Executor | End + Executor | Finish + Executor | Finish + Executor | Finish + Executor | Finish + Executor | Finish + Executor | Finish + Gather | Gather + Gather | Gather + Limit | Limit + Limit | Limit + SeqScan | Parallel SeqScan on pg_class + SeqScan | Parallel SeqScan on pg_class + SeqScan | Parallel SeqScan on pg_class + SeqScan | Parallel SeqScan on pg_class + SeqScan | Parallel SeqScan on pg_class + SeqScan | Parallel SeqScan on pg_class + Parse | Parse + Parse | Parse + Planner | Planner + Planner | Planner + Post Parse | Post Parse + Post Parse | Post Parse + Executor | Run + Executor | Run + Executor | Run + Executor | Run + Executor | Run + Executor | Run + Executor | Start + Executor | Start + Executor | Start + Executor | Start + Executor | Start + Executor | Start + Select | Worker 0 + Select | Worker 0 + Select | Worker 1 + Select | Worker 1 + Select | select $1 from pg_class limit $2; + Select | select $1 from pg_class limit $2; +(46 rows) + diff --git a/contrib/pg_tracing/expected/reset.out b/contrib/pg_tracing/expected/reset.out new file mode 100644 index 0000000000..2e0c903121 --- /dev/null +++ b/contrib/pg_tracing/expected/reset.out @@ -0,0 +1,13 @@ +-- Check reset is working +SELECT pg_tracing_reset(); + pg_tracing_reset +------------------ + +(1 row) + +SELECT traces from pg_tracing_info; + traces +-------- + 0 +(1 row) + diff --git a/contrib/pg_tracing/expected/sample.out b/contrib/pg_tracing/expected/sample.out new file mode 100644 index 0000000000..c1161e0537 --- /dev/null +++ b/contrib/pg_tracing/expected/sample.out @@ -0,0 
+1,156 @@ +-- Trace nothing +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 0.0; +-- Query with sampling flag +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000021-0000000000000021-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +-- Query without trace context +SELECT 1; + ?column? +---------- + 1 +(1 row) + +-- No spans should have been generated +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 0 +(1 row) + +-- Enable full sampling +SET pg_tracing.sample_rate = 1.0; +-- Generate queries with sampling flag on, off and no trace context +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000022-0000000000000022-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000023-0000000000000023-00'*/ SELECT 2; + ?column? +---------- + 2 +(1 row) + +SELECT 3; + ?column? +---------- + 3 +(1 row) + +SELECT 4; + ?column? +---------- + 4 +(1 row) + +-- Check number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 4 +(1 row) + +-- Check span order +select resource, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; + resource | parameters +------------+------------ + SELECT $1; | $1 = 1 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | + SELECT $1; | $1 = 2 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | + SELECT $1; | $1 = 3 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | + Parse | + SELECT $1; | $1 = 4 + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | +(36 rows) + +-- Top spans should reuse generated ids and have trace_id = parent_id = span_id +select resource, parameters from pg_tracing_consume_spans where trace_id = parent_id and parent_id = span_id order by span_start, span_start_ns; + resource | parameters 
+------------+------------ + SELECT $1; | $1 = 3 + SELECT $1; | $1 = 4 +(2 rows) + +-- Only trace query with sampled flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; +-- Generate queries with sampling flag on, off and no trace context +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000024-0000000000000024-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000025-0000000000000025-00'*/ SELECT 2; + ?column? +---------- + 2 +(1 row) + +SELECT 1; + ?column? +---------- + 1 +(1 row) + +-- Check number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 2 +(1 row) + +-- Check span ordering +select resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns; + resource | parameters +-----------------------------------+------------ + SET pg_tracing.sample_rate = 0.0; | + Parse | + Post Parse | + SET pg_tracing.sample_rate = 0.0; | + SELECT $1; | $1 = 1 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | +(13 rows) + diff --git a/contrib/pg_tracing/expected/select.out b/contrib/pg_tracing/expected/select.out new file mode 100644 index 0000000000..9f52cf8d2a --- /dev/null +++ b/contrib/pg_tracing/expected/select.out @@ -0,0 +1,229 @@ +-- Only trace queries with sample flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; +-- Run a simple query +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ SELECT 1; + ?column? 
+---------- + 1 +(1 row) + +-- Get top span id +SELECT span_id AS top_span_id from pg_tracing_peek_spans where parent_id=1 and name!='Parse' \gset +-- Check parameters +SELECT parameters from pg_tracing_peek_spans where span_id=:top_span_id; + parameters +------------ + $1 = 1 +(1 row) + +-- Check the number of children +SELECT count(*) from pg_tracing_peek_spans where parent_id=:'top_span_id'; + count +------- + 7 +(1 row) + +-- Check resource and query id +SELECT name, resource, query_id from pg_tracing_peek_spans where trace_id=1 order by span_start, span_start_ns, resource; + name | resource | query_id +------------+------------+--------------------- + Parse | Parse | 6865378226349601843 + Select | SELECT $1; | 6865378226349601843 + Post Parse | Post Parse | 6865378226349601843 + Planner | Planner | 6865378226349601843 + Executor | Start | 6865378226349601843 + Executor | Run | 6865378226349601843 + Result | Result | 6865378226349601843 + Executor | Finish | 6865378226349601843 + Executor | End | 6865378226349601843 +(9 rows) + +-- Check reported number of trace +SELECT traces from pg_tracing_info; + traces +-------- + 15 +(1 row) + +-- Trace a statement with function call +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ SELECT count(*) from current_database(); + count +------- + 1 +(1 row) + +-- Check the generated span name, resource and order of function call +SELECT name, resource from pg_tracing_consume_spans where trace_id=3 order by resource; + name | resource +--------------+------------------------------------------ + Aggregate | Aggregate + Executor | End + Executor | Finish + FunctionScan | FunctionScan on current_database + Parse | Parse + Planner | Planner + Post Parse | Post Parse + Executor | Run + Select | SELECT count(*) from current_database(); + Executor | Start +(10 rows) + +-- Trace a more complex query with multiple function calls 
+/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000004-0000000000000004-01'*/ SELECT s.relation_size + s.index_size +FROM (SELECT + pg_relation_size(C.oid) as relation_size, + pg_indexes_size(C.oid) as index_size + FROM pg_class C) as s limit 1; + ?column? +---------- + 0 +(1 row) + +-- Check the generated span name, resource and order of query with multiple function calls +SELECT name, resource from pg_tracing_consume_spans where trace_id=4 order by resource; + name | resource +--------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------ + Executor | End + Executor | Finish + Limit | Limit + Parse | Parse + Planner | Planner + Post Parse | Post Parse + Executor | Run + Select | SELECT s.relation_size + s.index_sizeFROM (SELECT pg_relation_size(C.oid) as relation_size, pg_indexes_size(C.oid) as index_size FROM pg_class C) as s limit $1; + SeqScan | SeqScan on pg_class c + Executor | Start + SubqueryScan | SubqueryScan on s +(11 rows) + +-- Check that we're in a correct state after a timeout +set statement_timeout=200; +-- Trace query triggering a statement timeout +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000007-0000000000000007-01'*/ select * from pg_sleep(10); +ERROR: canceling statement due to statement timeout +-- Trace a working query after the timeout to check we're in a consistent state +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000008-0000000000000008-01'*/ select 1; + ?column? 
+---------- + 1 +(1 row) + +-- Check the spans order and error code +SELECT name, resource, sql_error_code from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + name | resource | sql_error_code +--------------+-----------------------------+---------------- + Parse | Parse | 00000 + Select | select * from pg_sleep($1); | 57014 + Post Parse | Post Parse | 00000 + Planner | Planner | 00000 + Executor | Start | 00000 + Executor | Run | 57014 + FunctionScan | FunctionScan on pg_sleep | 57014 + Parse | Parse | 00000 + Select | select $1; | 00000 + Post Parse | Post Parse | 00000 + Planner | Planner | 00000 + Executor | Start | 00000 + Executor | Run | 00000 + Result | Result | 00000 + Executor | Finish | 00000 + Executor | End | 00000 +(16 rows) + +-- Cleanup statement setting +set statement_timeout=0; +-- Create a prepare statement with comment passed as first parameter +PREPARE test_prepared (text, integer) AS /*$1*/ SELECT $2; +-- Execute prepare statement with trace context passed as a parameter +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000009-0000000000000009-01''', 1); + ?column? 
+---------- + 1 +(1 row) + +-- Check generated spans and order +SELECT trace_id, name, resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + trace_id | name | resource | parameters +----------+----------+-----------------------------------------------------+---------------------------------------------------------------------------------------------------------------- + 9 | Select | PREPARE test_prepared (text, integer) AS SELECT $2; | $1 = 'dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000009-0000000000000009-01''', $2 = '1' + 9 | Planner | Planner | + 9 | Executor | Start | + 9 | Executor | Run | + 9 | Result | Result | + 9 | Executor | Finish | + 9 | Executor | End | +(7 rows) + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +-- Execute prepare statement with trace context passed as a parameter and generic plan +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000010-0000000000000010-01''', 10); + ?column? +---------- + 10 +(1 row) + +-- Check spans are generated even through generic plan +SELECT trace_id, resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + trace_id | resource | parameters +----------+-----------------------------------------------------+------------ + 16 | PREPARE test_prepared (text, integer) AS SELECT $2; | + 16 | Start | + 16 | Run | + 16 | Result | + 16 | Finish | + 16 | End | +(6 rows) + +-- Run a statement with node not executed +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000a-000000000000000a-01'*/ select 1 limit 0; + ?column? 
+---------- +(0 rows) + +-- Not executed node should not generate any spans +SELECT trace_id, resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + trace_id | resource | parameters +----------+---------------------+---------------- + 10 | Parse | + 10 | select $1 limit $2; | $1 = 1, $2 = 0 + 10 | Post Parse | + 10 | Planner | + 10 | Start | + 10 | Run | + 10 | Limit | + 10 | Finish | + 10 | End | +(9 rows) + +-- Test multiple statements in a single query +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000011-0000000000000012-01'*/ select 1; select 2; + ?column? +---------- + 1 +(1 row) + + ?column? +---------- + 2 +(1 row) + +-- Not executed node should not generate any spans +select resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns; + resource | parameters +------------+------------ + select $1; | $1 = 1 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | +(9 rows) + +-- Cleanup +SET plan_cache_mode='auto'; +DEALLOCATE test_prepared; diff --git a/contrib/pg_tracing/expected/subxact.out b/contrib/pg_tracing/expected/subxact.out new file mode 100644 index 0000000000..698778e155 --- /dev/null +++ b/contrib/pg_tracing/expected/subxact.out @@ -0,0 +1,75 @@ +-- Create test table +CREATE TABLE IF NOT EXISTS pg_tracing_test (a int, b char(20)); +-- Enable full sampling +SET pg_tracing.sample_rate = 1.0; +-- Start a transaction with subxaction +BEGIN; +SAVEPOINT s1; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s2; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s3; +SELECT 1; + ?column? 
+---------- + 1 +(1 row) + +COMMIT; +-- Check that subxact_count is correctly reported +select resource, parameters, subxact_count from pg_tracing_consume_spans order by span_start, span_start_ns; + resource | parameters | subxact_count +------------------------------------------------------------------+----------------------------+--------------- + BEGIN; | | 0 + Parse | | 0 + Post Parse | | 0 + BEGIN; | | 0 + SAVEPOINT $1; | $1 = s1 | 0 + Parse | | 0 + Post Parse | | 0 + SAVEPOINT s1; | | 0 + INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | $1 = 1, $2 = 2, $3 = 'aaa' | 0 + Parse | | 0 + Post Parse | | 0 + Planner | | 0 + Start | | 0 + Run | | 0 + Insert on pg_tracing_test | | 1 + ProjectSet | | 1 + Result | | 1 + Finish | | 1 + End | | 1 + SAVEPOINT $1; | $1 = s2 | 1 + Parse | | 1 + Post Parse | | 1 + SAVEPOINT s2; | | 1 + INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | $1 = 1, $2 = 2, $3 = 'aaa' | 1 + Parse | | 1 + Post Parse | | 1 + Planner | | 1 + Start | | 1 + Run | | 1 + Insert on pg_tracing_test | | 2 + ProjectSet | | 2 + Result | | 2 + Finish | | 2 + End | | 2 + SAVEPOINT $1; | $1 = s3 | 2 + Parse | | 2 + Post Parse | | 2 + SAVEPOINT s3; | | 2 + SELECT $1; | $1 = 1 | 2 + Parse | | 2 + Post Parse | | 2 + Planner | | 2 + Start | | 2 + Run | | 2 + Result | | 2 + Finish | | 2 + End | | 2 + COMMIT; | | 2 + Parse | | 2 + Post Parse | | 2 + COMMIT; | | 2 +(51 rows) + diff --git a/contrib/pg_tracing/expected/utility.out b/contrib/pg_tracing/expected/utility.out new file mode 100644 index 0000000000..3503c16865 --- /dev/null +++ b/contrib/pg_tracing/expected/utility.out @@ -0,0 +1,214 @@ +-- Some utility functions +CREATE OR REPLACE FUNCTION get_span_start_ns(time_start timestamptz, start_ns smallint) RETURNS bigint AS +$BODY$ +BEGIN + RETURN extract(epoch from time_start) * 1000000000 + start_ns; +END; +$BODY$ +LANGUAGE plpgsql; +CREATE OR REPLACE FUNCTION get_span_end_ns(time_start timestamptz, start_ns smallint, duration 
bigint) RETURNS bigint AS +$BODY$ +BEGIN + RETURN get_span_start_ns(time_start, start_ns) + duration; +END; +$BODY$ +LANGUAGE plpgsql; +-- Create pg_tracing extension with sampling on +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; +-- This will create the following spans (non exhaustive list): +-- +-- +-------------------------------------------------------------------------------------+ +-- | A: Utility: Create Extension | +-- +-+-----------------------------------------------------------------------------------+ +-- +----------------------------------------------------------------------------------+ +-- |B: ProcessUtility: Create Extension | +-- +---+-----------------------------------+---+--------------------------------------+ +-- +-----------------------------------+ +-------------------------------------+ +-- |C: Utility: Create Function1 | |E: Utility: Create Function2 | +-- ++----------------------------------+ ++-----------------------------------++ +-- +----------------------------------+ +-----------------------------------+ +-- |D: ProcessUtility: Create Function| |F: ProcessUtility: Create Function2| +-- +----------------------------------+ +-----------------------------------+ +-- Extract span_ids, start and end of those spans +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=1 and name='Utility' \gset +SELECT span_id AS span_b_id, + get_span_start_ns(span_start, span_start_ns) as span_b_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_b_end + from pg_tracing_peek_spans where parent_id=:span_a_id and name='ProcessUtility' \gset +SELECT span_id AS span_c_id, + get_span_start_ns(span_start, span_start_ns) as span_c_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_c_end + from 
pg_tracing_peek_spans where parent_id=:span_b_id and name='Utility' limit 1 \gset +SELECT span_id AS span_d_id, + get_span_start_ns(span_start, span_start_ns) as span_d_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_d_end + from pg_tracing_peek_spans where parent_id=:span_c_id and name='ProcessUtility' \gset +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_b_id and name='Utility' limit 1 offset 1 \gset +-- Check that the start and end of those spans are within expectation +SELECT :span_a_start < :span_b_start AS span_a_start_first, + :span_a_end >= :span_b_end AS span_a_end_last, + :span_d_end <= :span_c_end AS nested_span_ends_before_parent, + :span_c_end <= :span_e_start AS next_utility_start_after; + span_a_start_first | span_a_end_last | nested_span_ends_before_parent | next_utility_start_after +--------------------+-----------------+--------------------------------+-------------------------- + t | t | t | t +(1 row) + +-- Clean current spans +select count(*) from pg_tracing_consume_spans; + count +------- + 49 +(1 row) + +-- +-- Test that no utility is captured with track_utility off +-- +-- Set utility off +SET pg_tracing.track_utility = off; +-- Prepare and execute a prepared statement +PREPARE test_prepared_one_param (integer) AS SELECT $1; +EXECUTE test_prepared_one_param(100); + ?column? +---------- + 100 +(1 row) + +-- Nothing should be generated +select count(*) from pg_tracing_consume_spans; + count +------- + 0 +(1 row) + +-- Force a query to start from ExecutorRun +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + ?column? 
+---------- + 200 +(1 row) + +-- Again, nothing should be generated +select count(*) from pg_tracing_consume_spans; + count +------- + 0 +(1 row) + +-- +-- Test that no utility is captured with track_utility off +-- +-- Enable utility tracking and track everything +SET pg_tracing.track_utility = on; +SET pg_tracing.sample_rate = 1.0; +-- Prepare and execute a prepared statement +PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; +EXECUTE test_prepared_one_param_2(100); + ?column? +---------- + 100 +(1 row) + +-- Check the number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 2 +(1 row) + +-- Check that the spans are in correct order +select resource, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; + resource | parameters +-----------------------------------------------------------+------------ + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + Parse | + Post Parse | + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + Parse | + Post Parse | + EXECUTE test_prepared_one_param_2(100); | + Parse | + Post Parse | + EXECUTE test_prepared_one_param_2(100); | + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + Planner | + Start | + Run | + Result | + Finish | + End | +(18 rows) + +-- Check the top span (standalone top span has trace_id=parent_id=span_id) +select resource, parameters from pg_tracing_consume_spans where trace_id = parent_id and parent_id = span_id order by span_start, span_start_ns; + resource | parameters +-----------------------------------------------------------+------------ + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + EXECUTE test_prepared_one_param_2(100); | +(2 rows) + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + ?column? 
+---------- + 200 +(1 row) + +-- Check the number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 2 +(1 row) + +-- Check that the spans are in correct order +select resource, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; + resource | parameters +---------------------------------------------------------+------------ + SET plan_cache_mode='force_generic_plan'; | + Parse | + Post Parse | + SET plan_cache_mode='force_generic_plan'; | + EXECUTE test_prepared_one_param(200); | + Parse | + Post Parse | + EXECUTE test_prepared_one_param(200); | + PREPARE test_prepared_one_param (integer) AS SELECT $1; | + Start | + Run | + Result | + Finish | + End | +(14 rows) + +-- Check the top span (standalone top span has trace_id=parent_id=span_id) +select resource, parameters from pg_tracing_consume_spans where trace_id = parent_id and parent_id = span_id order by span_start, span_start_ns; + resource | parameters +-------------------------------------------+------------ + SET plan_cache_mode='force_generic_plan'; | + EXECUTE test_prepared_one_param(200); | +(2 rows) + +-- Second create extension should generate an error that is captured by span +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; +ERROR: extension "pg_tracing" already exists +select resource, parameters, sql_error_code from pg_tracing_consume_spans order by span_start, span_start_ns; + resource | parameters | sql_error_code +------------------------------+------------+---------------- + CREATE EXTENSION pg_tracing; | | 42710 + Parse | | 00000 + Post Parse | | 00000 + CREATE EXTENSION pg_tracing; | | 42710 +(4 rows) + +-- Cleanup +SET plan_cache_mode='auto'; diff --git a/contrib/pg_tracing/expected/wal.out b/contrib/pg_tracing/expected/wal.out new file mode 100644 index 0000000000..9a22043ea3 --- /dev/null +++ b/contrib/pg_tracing/expected/wal.out @@ -0,0 +1,96 @@ +-- 
Create test table +CREATE TABLE IF NOT EXISTS pg_tracing_test (a int, b char(20)); +NOTICE: relation "pg_tracing_test" already exists, skipping +-- Enable wal instrumentation +set pg_tracing.instrument_wal = true; +-- Generate queries with wal write +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; +-- Check WAL is generated for the above statements +SELECT trace_id, name, resource, + wal_records > 0 as wal_records, + wal_bytes > 0 as wal_bytes +FROM pg_tracing_consume_spans order by span_start, span_start_ns, resource; + trace_id | name | resource | wal_records | wal_bytes +----------+------------+------------------------------------------------------------------+-------------+----------- + 1 | Insert | INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | t | t + 1 | Parse | Parse | | + 1 | Post Parse | Post Parse | | + 1 | Planner | Planner | f | f + 1 | Executor | Start | | + 1 | Executor | Run | | + 1 | Insert | Insert on pg_tracing_test | t | t + 1 | ProjectSet | ProjectSet | f | f + 1 | Result | Result | f | f + 1 | Executor | Finish | | + 1 | Executor | End | | + 2 | Parse | Parse | | + 2 | Update | UPDATE pg_tracing_test SET b = $1 WHERE a > $2; | t | t + 2 | Post Parse | Post Parse | | + 2 | Planner | Planner | f | f + 2 | Executor | Start | | + 2 | Executor | Run | | + 2 | Update | Update on pg_tracing_test | t | t + 2 | SeqScan | SeqScan on pg_tracing_test|Filter : (a > 7) | f | f + 2 | Executor | Finish | | + 2 | Executor | End | | + 3 | Delete | DELETE FROM pg_tracing_test WHERE a > $1; | t | t + 3 | Parse | Parse | | + 3 | Post Parse | Post Parse | | + 
3 | Planner | Planner | f | f + 3 | Executor | Start | | + 3 | Executor | Run | | + 3 | Delete | Delete on pg_tracing_test | t | t + 3 | SeqScan | SeqScan on pg_tracing_test|Filter : (a > 9) | f | f + 3 | Executor | Finish | | + 3 | Executor | End | | +(31 rows) + +-- Redo the same but without wal instrumentation +set pg_tracing.instrument_wal = false; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; +-- With wal instrumentation disabled, wal counters should be set to 0 +SELECT trace_id, name, resource, + wal_records = 0 as wal_records, + wal_bytes = 0 as wal_bytes +FROM pg_tracing_consume_spans order by span_start, span_start_ns, resource; + trace_id | name | resource | wal_records | wal_bytes +----------+------------+------------------------------------------------------------------+-------------+----------- + 1 | Insert | INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | t | t + 1 | Parse | Parse | | + 1 | Post Parse | Post Parse | | + 1 | Planner | Planner | t | t + 1 | Executor | Start | | + 1 | Executor | Run | | + 1 | Insert | Insert on pg_tracing_test | t | t + 1 | ProjectSet | ProjectSet | t | t + 1 | Result | Result | t | t + 1 | Executor | Finish | | + 1 | Executor | End | | + 2 | Parse | Parse | | + 2 | Update | UPDATE pg_tracing_test SET b = $1 WHERE a > $2; | t | t + 2 | Post Parse | Post Parse | | + 2 | Planner | Planner | t | t + 2 | Executor | Start | | + 2 | Executor | Run | | + 2 | Update | Update on pg_tracing_test | t | t + 2 | SeqScan | SeqScan on pg_tracing_test|Filter : (a > 7) | t | t + 2 | Executor | Finish | | + 2 | Executor | End | | 
+ 3 | Delete | DELETE FROM pg_tracing_test WHERE a > $1; | t | t + 3 | Parse | Parse | | + 3 | Post Parse | Post Parse | | + 3 | Planner | Planner | t | t + 3 | Executor | Start | | + 3 | Executor | Run | | + 3 | Delete | Delete on pg_tracing_test | t | t + 3 | SeqScan | SeqScan on pg_tracing_test|Filter : (a > 9) | t | t + 3 | Executor | Finish | | + 3 | Executor | End | | +(31 rows) + +-- Cleanup +set pg_tracing.instrument_wal = true; diff --git a/contrib/pg_tracing/meson.build b/contrib/pg_tracing/meson.build new file mode 100644 index 0000000000..1d1ae13a43 --- /dev/null +++ b/contrib/pg_tracing/meson.build @@ -0,0 +1,52 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +pg_tracing_sources = files( + 'pg_tracing.c', + 'query_process.c', + 'pg_tracing_explain.c', + 'span.c', +) + +if host_system == 'windows' + pg_tracing_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_tracing', + '--FILEDESC', 'pg_tracing - Add support for distributed tracing',]) +endif + +pg_tracing = shared_module('pg_tracing', + pg_tracing_sources, + kwargs: contrib_mod_args + { + 'dependencies': contrib_mod_args['dependencies'], + }, +) +contrib_targets += pg_tracing + +install_data( + 'pg_tracing.control', + 'pg_tracing--1.0.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'pg_tracing', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'utility', + 'select', + 'sample', + 'subxact', + 'reset', + 'nested', + 'wal', + 'parallel', + 'cleanup' + ], + 'regress_args': ['--temp-config', files('pg_tracing.conf')], + # Disabled because these tests require + # "shared_preload_libraries=pg_tracing", which typical + # runningcheck users do not have (e.g. buildfarm clients). 
+ 'runningcheck': false, + }, +} diff --git a/contrib/pg_tracing/pg_tracing--1.0.sql b/contrib/pg_tracing/pg_tracing--1.0.sql new file mode 100644 index 0000000000..544dcb71fe --- /dev/null +++ b/contrib/pg_tracing/pg_tracing--1.0.sql @@ -0,0 +1,99 @@ +/* contrib/pg_tracing/pg_tracing--1.0.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "CREATE EXTENSION pg_tracing" to load this file. \quit + +--- Define pg_tracing_info +CREATE FUNCTION pg_tracing_info( + OUT traces bigint, + OUT spans bigint, + OUT dropped_spans bigint, + OUT failed_truncates bigint, + OUT last_consume timestamp with time zone, + OUT stats_reset timestamp with time zone, + OUT sampling_rate double precision +) +RETURNS record +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT VOLATILE PARALLEL SAFE; + +CREATE FUNCTION pg_tracing_reset() +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C PARALLEL SAFE; + +CREATE FUNCTION pg_tracing_spans( + IN consume bool, + OUT trace_id bigint, + OUT parent_id bigint, + OUT span_id bigint, + OUT query_id bigint, + OUT name text, + OUT resource text, + OUT span_start timestamp with time zone, + OUT span_start_ns smallint, + OUT duration bigint, + OUT sql_error_code character(5), + OUT pid int4, + OUT nested_level smallint, + OUT subxact_count smallint, + OUT top_span bool, + +-- Plan counters + OUT startup_cost float8, + OUT total_cost float8, + OUT plan_rows float8, + OUT plan_width int, + +-- Node Counters + OUT rows int8, + OUT nloops int8, + + OUT shared_blks_hit int8, + OUT shared_blks_read int8, + OUT shared_blks_dirtied int8, + OUT shared_blks_written int8, + + OUT local_blks_hit int8, + OUT local_blks_read int8, + OUT local_blks_dirtied int8, + OUT local_blks_written int8, + + OUT blk_read_time float8, + OUT blk_write_time float8, + + OUT temp_blks_read int8, + OUT temp_blks_written int8, + OUT temp_blk_read_time float8, + OUT temp_blk_write_time float8, + + OUT wal_records int8, + OUT wal_fpi int8, + OUT wal_bytes 
numeric, + + OUT jit_functions int8, + OUT jit_generation_time float8, + OUT jit_inlining_time float8, + OUT jit_optimization_time float8, + OUT jit_emission_time float8, + +-- SpanNode specific data + OUT startup bigint, -- First tuple + OUT parameters text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C PARALLEL SAFE; + +CREATE VIEW pg_tracing_info AS + SELECT * FROM pg_tracing_info(); + +CREATE VIEW pg_tracing_peek_spans AS + SELECT * FROM pg_tracing_spans(false); + +CREATE VIEW pg_tracing_consume_spans AS + SELECT * FROM pg_tracing_spans(true); + +GRANT SELECT ON pg_tracing_info TO PUBLIC; +GRANT SELECT ON pg_tracing_peek_spans TO PUBLIC; +GRANT SELECT ON pg_tracing_consume_spans TO PUBLIC; diff --git a/contrib/pg_tracing/pg_tracing.c b/contrib/pg_tracing/pg_tracing.c new file mode 100644 index 0000000000..35e62abcbd --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.c @@ -0,0 +1,2313 @@ +/*------------------------------------------------------------------------- + * + * pg_tracing.c + * Generate spans for distributed tracing from SQL query + * + * Spans will only be generated for sampled queries. A query is sampled if: + * - It has a tracecontext propagated throught SQLCommenter and it passes the caller_sample_rate. + * - It has no SQLCommenter but the query randomly passes the global sample_rate + * + * A query with sqlcommenter will look like: /\*dddbs='postgres.db',traceparent='00-00000000000000000000000000000009-0000000000000005-01'*\/ select 1; + * The traceparent fields are detailed in https://www.w3.org/TR/trace-context/#traceparent-header-field-values + * 00000000000000000000000000000009: trace id + * 0000000000000005: parent id + * 01: trace flags (01 == sampled) + * + * If sampled, we will generate spans for the ongoing query. + * A span represents an operation with a start time, a duration and useful metadatas (block stats, wal stats...). + * We will track the following operations: + * - Top Span: The top span for a query. 
They are created after extracting the traceid from traceparent or to represent a nested query. + * - Planner: We track the time spent in the planner and report the planner counters + * - Node Span: Created from planstate. The name is extracted from the node type (IndexScan, SeqScan). + * - Executor: We trace the different steps of the Executor: Start, Run, Finish and End + * + * A typical traced query will generate the following spans: + * +------------------------------------------------------------------------------------------------------------------------------------------------------+ + * | Name: Select (top span) | + * | Operation: Select * pgbench_accounts WHERE aid=$1; | + * +---+------------------------+-+------------------+--+------------------------------------------------------+-+--------------------+-+----------------++ + * | Name: Planner | | Name: Executor | |Name: Executor | | Name: Executor | | Name: Executor | + * | Operation: Planner | | Operation: Start | |Operation: Run | | Operation: Finish | | Operation: End | + * +------------------------+ +------------------+ +--+--------------------------------------------------++ +--------------------+ +----------------+ + * | Name: IndexScan | + * | Operation: IndexScan using pgbench_accounts_pkey | + * | on pgbench_accounts | + * +--------------------------------------------------+ + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pg_tracing_explain.h" +#include "pg_tracing.h" +#include "query_process.h" +#include "span.h" + +#include "utils/ruleutils.h" +#include "access/xact.h" +#include "common/pg_prng.h" +#include "funcapi.h" +#include "nodes/extensible.h" +#include "optimizer/planner.h" +#include "parser/analyze.h" +#include "storage/ipc.h" +#include "tcop/utility.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/xid8.h" + +PG_MODULE_MAGIC; + 
+typedef enum +{ + PG_TRACING_TRACK_NONE, /* track no statements */ + PG_TRACING_TRACK_TOP, /* only top level statements */ + PG_TRACING_TRACK_ALL /* all statements, including nested ones */ +} pgTracingTrackLevel; + +/* + * Structure to store flexible array of spans + */ +typedef struct pgTracingSpans +{ + int end; /* Index of last element */ + int max; /* Maximum number of elements */ + Span spans[FLEXIBLE_ARRAY_MEMBER]; +} pgTracingSpans; + +/* + * A trace context for a specific parallel context + */ +typedef struct pgTracingParallelContext +{ + BackendId leader_backend_id; /* Backend id of the leader, set to + * InvalidBackendId if unused */ + uint64 trace_id; /* Trace id of the sampled query */ + uint64 parent_id; /* Id of the parent Gather span */ + uint64 query_id; /* Query id from the parent worker */ + pgTracingStartTime start_trace; + +} pgTracingParallelContext; + +/* + * Store context for parallel workers + */ +typedef struct pgTracingParallelWorkers +{ + slock_t mutex; + pgTracingParallelContext trace_contexts[FLEXIBLE_ARRAY_MEMBER]; +} pgTracingParallelWorkers; + +/* GUC variables */ +static int pg_tracing_max_span; /* Maximum number of spans to store */ +static int pg_tracing_max_parameter_str; /* Maximum size of span + * parameters */ +static int pg_tracing_max_traced_parallel_workers; /* Maximum number of + * parallel workers traced */ +static int pg_tracing_initial_allocated_spans = 25; /* Number of spans + * allocated at the + * start of a trace at + * the same time. 
*/ +static bool pg_tracing_instrument_buffers; /* Enable buffers instrumentation */ +static bool pg_tracing_instrument_wal; /* Enable WAL instrumentation */ +static double pg_tracing_sample_rate = 0; /* Sample rate applied to queries + * without SQLCommenter */ +static double pg_tracing_caller_sample_rate = 1; /* Sample rate applied to + * queries with + * SQLCommenter */ +static int pg_tracing_track = PG_TRACING_TRACK_ALL; /* tracking level */ +static bool pg_tracing_track_utility = true; /* whether to track utility + * commands */ +static bool pg_tracing_drop_full_buffer = true; /* whether to drop buffered + * spans when full */ + + +int64 nested_query_start_ns = 0; +int64 executor_start_start_ns = 0; + +static const struct config_enum_entry track_options[] = +{ + {"none", PG_TRACING_TRACK_NONE, false}, + {"top", PG_TRACING_TRACK_TOP, false}, + {"all", PG_TRACING_TRACK_ALL, false}, + {NULL, 0, false} +}; + +#define pg_tracing_enabled(level) \ + ((pg_tracing_track == PG_TRACING_TRACK_ALL || \ + (pg_tracing_track == PG_TRACING_TRACK_TOP && (level) == 0))) + +PG_FUNCTION_INFO_V1(pg_tracing_info); +PG_FUNCTION_INFO_V1(pg_tracing_spans); +PG_FUNCTION_INFO_V1(pg_tracing_reset); + +/* + * Global variables + */ + +/* Memory context for pg_tracing. */ +static MemoryContext pg_tracing_mem_ctx; + +/* Current trace state */ +static struct pgTracingTrace current_trace; + +/* Shared state with stats and file external state */ +static pgTracingSharedState * pg_tracing = NULL; + +/* + * Shared buffer storing spans. Query with sampled flag will add new spans to + * the shared state. Those spans will be consumed during calls to + * pg_tracing_spans + */ +static pgTracingSpans * shared_spans = NULL; + +/* + * Store all spans for the curent trace except top spans and parse spans. + * They will be added to the shared buffer at the end of query tracing. + * + * This buffer can be dynamically resized so care needs to be taken when using span pointers. 
+ */ +static pgTracingSpans * current_trace_spans; + +/* + * Shared buffer storing trace context for parallel workers. + */ +static pgTracingParallelWorkers * pg_tracing_parallel = NULL; + +/* Index of the parallel worker context shared buffer if any */ +static int parallel_context_index = -1; + +/* +* Text for spans are buffered in this +* stringinfo and written at the end of the tracing in a single write. +*/ +static StringInfo current_trace_text; + +/* + * Maximum nested level for a query to know how many top spans we need to + * copy in shared_spans + */ +static int max_nested_level = -1; + +/* Current nesting depth of planner calls */ +static int plan_nested_level = 0; + +/* Current nesting depth of ExecutorRun+ProcessUtility calls */ +static int exec_nested_level = 0; + +/* Number of allocated levels. */ +static int allocated_nested_level = 0; + +/* + * Per nested level buffers + */ +typedef struct pgTracingPerLevelBuffer +{ + int top_span_index; /* Index of the top span in the + * current_trace_spans buffer */ + uint64 executor_id; /* Span id of executor run spans by nested + * level Executor run is used as parent for + * spans generated from planstate */ + uint64_t query_id; /* Query id by nested level when available */ +} pgTracingPerLevelBuffer; + +static pgTracingPerLevelBuffer * per_level_buffers; + +/* + * Timestamp of the latest statement checked for sampling. 
+ */ +static TimestampTz last_statement_check_for_sampling = 0; + +static void pg_tracing_shmem_request(void); +static void pg_tracing_shmem_startup(void); + +/* Saved hook values in case of unload */ +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL; +static planner_hook_type prev_planner_hook = NULL; +static ExecutorStart_hook_type prev_ExecutorStart = NULL; +static ExecutorRun_hook_type prev_ExecutorRun = NULL; +static ExecutorFinish_hook_type prev_ExecutorFinish = NULL; +static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; +static ProcessUtility_hook_type prev_ProcessUtility = NULL; + +static void pg_tracing_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate); +static PlannedStmt *pg_tracing_planner_hook(Query *parse, + const char *query_string, + int cursorOptions, + ParamListInfo params); +static void pg_tracing_ExecutorStart(QueryDesc *queryDesc, int eflags); +static void pg_tracing_ExecutorRun(QueryDesc *queryDesc, + ScanDirection direction, + uint64 count, bool execute_once); +static void pg_tracing_ExecutorFinish(QueryDesc *queryDesc); +static void pg_tracing_ExecutorEnd(QueryDesc *queryDesc); +static void pg_tracing_ProcessUtility(PlannedStmt *pstmt, const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, QueryCompletion *qc); + +static void generate_member_nodes(PlanState **planstates, int nplans, planstateTraceContext * planstateTraceContext, uint64 parent_id); +static void generate_span_from_planstate(PlanState *planstate, planstateTraceContext * planstateTraceContext, uint64 parent_id); +static pgTracingStats get_empty_pg_tracing_stats(void); + +#define PG_TRACING_INFO_COLS 7 +#define PG_TRACING_TRACES_COLS 44 + +/* + * Module load callback + */ +void +_PG_init(void) +{ + if 
(!process_shared_preload_libraries_in_progress) + return; + + DefineCustomIntVariable("pg_tracing.max_span", + "Maximum number of spans stored in shared memory.", + NULL, + &pg_tracing_max_span, + 1000, + 0, + 500000, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_tracing.max_parameter_size", + "Maximum size of parameters. -1 to disable parameter in top span.", + NULL, + &pg_tracing_max_parameter_str, + 1024, + 0, + 10000, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.instrument_buffers", + "Instrument and add buffers usage in spans.", + NULL, + &pg_tracing_instrument_buffers, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.instrument_wal", + "Instrument and add WAL usage in spans.", + NULL, + &pg_tracing_instrument_wal, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_tracing.max_traced_parallel_workers", + "Maximum number of parallel workers traced. -1 to disable tracing of parallel workers.", + NULL, + &pg_tracing_max_traced_parallel_workers, + 100, + 0, + 10000, + PGC_POSTMASTER, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_tracing.initial_allocated_spans", + "The number of allocated spans at the start of a trace. 
A higher number can help avoid reallocation when extending the buffer.", + NULL, + &pg_tracing_initial_allocated_spans, + 25, + 0, + 500, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomEnumVariable("pg_tracing.track", + "Selects which statements are tracked by pg_tracing.", + NULL, + &pg_tracing_track, + PG_TRACING_TRACK_ALL, + track_options, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.track_utility", + "Selects whether utility commands are tracked by pg_tracing.", + NULL, + &pg_tracing_track_utility, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.drop_on_full_buffer", + "Selects whether buffered spans should be dropped when buffer is full.", + NULL, + &pg_tracing_drop_full_buffer, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomRealVariable("pg_tracing.sample_rate", + "Fraction of queries to process.", + NULL, + &pg_tracing_sample_rate, + 0.0, + 0.0, + 1.0, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomRealVariable("pg_tracing.caller_sample_rate", + "Fraction of queries with sampled flag to process.", + NULL, + &pg_tracing_caller_sample_rate, + 1.0, + 0.0, + 1.0, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + MarkGUCPrefixReserved("pg_tracing"); + + /* For jumble state */ + EnableQueryId(); + + /* Install hooks. 
*/ + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = pg_tracing_shmem_request; + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = pg_tracing_shmem_startup; + + prev_post_parse_analyze_hook = post_parse_analyze_hook; + post_parse_analyze_hook = pg_tracing_post_parse_analyze; + + prev_planner_hook = planner_hook; + planner_hook = pg_tracing_planner_hook; + + prev_ExecutorStart = ExecutorStart_hook; + ExecutorStart_hook = pg_tracing_ExecutorStart; + prev_ExecutorRun = ExecutorRun_hook; + ExecutorRun_hook = pg_tracing_ExecutorRun; + prev_ExecutorFinish = ExecutorFinish_hook; + ExecutorFinish_hook = pg_tracing_ExecutorFinish; + prev_ExecutorEnd = ExecutorEnd_hook; + ExecutorEnd_hook = pg_tracing_ExecutorEnd; + + prev_ProcessUtility = ProcessUtility_hook; + ProcessUtility_hook = pg_tracing_ProcessUtility; +} + +/* + * Estimate shared memory space needed. + */ +static Size +pg_tracing_memsize(void) +{ + Size size; + + /* pg_tracing shared state */ + size = sizeof(pgTracingSharedState); + /* span struct */ + size = add_size(size, sizeof(pgTracingSpans)); + /* the span variable array */ + size = add_size(size, mul_size(pg_tracing_max_span, sizeof(Span))); + /* and the parallel workers context */ + size = add_size(size, mul_size(pg_tracing_max_traced_parallel_workers, sizeof(pgTracingParallelContext))); + + return size; +} + +/* + * shmem_request hook: request additional shared resources. We'll allocate + * or attach to the shared resources in pgss_shmem_startup(). 
+ */ +static void +pg_tracing_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + RequestAddinShmemSpace(pg_tracing_memsize()); +} + +/* + * Push trace context to the shared parallel worker buffer + */ +static void +add_parallel_context(uint64 parent_id, uint64 query_id) +{ + volatile pgTracingParallelWorkers *p = (volatile pgTracingParallelWorkers *) pg_tracing_parallel; + + if (pg_tracing_max_traced_parallel_workers <= 0) + /* No tracing of parallel workers */ + return; + + Assert(parallel_context_index == -1); + SpinLockAcquire(&p->mutex); + for (int i = 0; i < pg_tracing_max_traced_parallel_workers; i++) + { + volatile pgTracingParallelContext *ctx = p->trace_contexts + i; + + Assert(ctx->leader_backend_id != MyBackendId); + if (ctx->leader_backend_id != InvalidBackendId) + continue; + /* Slot is available */ + parallel_context_index = i; + ctx->leader_backend_id = MyBackendId; + ctx->trace_id = current_trace.trace_id; + ctx->parent_id = parent_id; + ctx->query_id = query_id; + ctx->start_trace = current_trace.start_trace; + break; + } + SpinLockRelease(&p->mutex); +} + +/* + * Remove parallel context for the current leader from the shared memory. + */ +static void +remove_parallel_context(void) +{ + if (parallel_context_index < 0) + /* No tracing of parallel workers */ + return; + { + volatile pgTracingParallelWorkers *p = (volatile pgTracingParallelWorkers *) pg_tracing_parallel; + + SpinLockAcquire(&p->mutex); + p->trace_contexts[parallel_context_index].leader_backend_id = InvalidBackendId; + SpinLockRelease(&p->mutex); + } + parallel_context_index = -1; +} + +/* + * If we're inside a parallel worker, check if the trace context is stored in shared memory. + * If a trace context exists, it means that the query is sampled and worker tracing is enabled. 
+ */ +static void +fetch_parallel_context(pgTracingTrace * trace) +{ + volatile pgTracingParallelWorkers *p = (volatile pgTracingParallelWorkers *) pg_tracing_parallel; + + if (pg_tracing_max_traced_parallel_workers <= 0) + { + /* No tracing of parallel workers */ + return; + } + SpinLockAcquire(&p->mutex); + for (int i = 0; i < pg_tracing_max_traced_parallel_workers; i++) + { + if (p->trace_contexts[i].leader_backend_id != ParallelLeaderBackendId) + continue; + /* Found a matching trace context, fetch it */ + trace->trace_id = p->trace_contexts[i].trace_id; + trace->parent_id = p->trace_contexts[i].parent_id; + trace->start_trace = p->trace_contexts[i].start_trace; + trace->query_id = p->trace_contexts[i].query_id; + trace->sampled = true; + } + SpinLockRelease(&p->mutex); +} + +/* + * Return a slot from the current_trace_spans buffer. + * current_trace_spans may be extended if needed. + */ +static int +get_index_from_trace_spans(void) +{ + if (current_trace_spans->end + 1 >= current_trace_spans->max) + { + MemoryContext oldcxt; + + /* Need to extend. */ + /* TODO: Have a configurable limit? */ + current_trace_spans->max *= 2; + oldcxt = MemoryContextSwitchTo(pg_tracing_mem_ctx); + current_trace_spans = repalloc(current_trace_spans, current_trace_spans->max * sizeof(Span)); + MemoryContextSwitchTo(oldcxt); + } + return current_trace_spans->end++; +} + +/* + * Get current_span from provided index + */ +static Span * +get_span_from_index(int span_index) +{ + Assert(span_index < current_trace_spans->end); + return &current_trace_spans->spans[span_index]; +} + +/* + * Get top span for the provided nested level. + * If top_span doesn't exist, create it. 
+ */ +static Span * +get_top_span_for_nested_level(int nested_level) +{ + Span *top_span; + + Assert(nested_level <= max_nested_level); + if (per_level_buffers[nested_level].top_span_index == -1) + per_level_buffers[nested_level].top_span_index = get_index_from_trace_spans(); + top_span = get_span_from_index(per_level_buffers[nested_level].top_span_index); + return top_span; +} + +/* + * Set top span to the provided span index + */ +static void +set_top_span(int nested_level, int top_span_index) +{ + Assert(nested_level <= max_nested_level); + Assert(top_span_index < current_trace_spans->max); + per_level_buffers[nested_level].top_span_index = top_span_index; +} + +/* + * End the span at the provided index + */ +static void +end_span_index(int span_index, const int64 *end_span_input) +{ + Span *span; + + if (span_index < 0) + return; + span = get_span_from_index(span_index); + end_span(span, end_span_input); +} + +/* + * End top span for the provided nested level + */ +static void +end_top_span(int nested_level, int64 end_span_ns) +{ + Assert(nested_level <= max_nested_level); + Assert(per_level_buffers[nested_level].top_span_index < current_trace_spans->max); + Assert(per_level_buffers[nested_level].top_span_index != -1); + end_span_index(per_level_buffers[nested_level].top_span_index, &end_span_ns); +} + +/* + * shmem_startup hook: allocate or attach to shared memory, Also create and + * load the query-texts file, which is expected to exist (even if empty) + * while the module is enabled. 
+ */ +static void +pg_tracing_shmem_startup(void) +{ + bool found_pg_tracing; + bool found_shared_spans; + bool found_parallel; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + /* reset in case this is a restart within the postmaster */ + pg_tracing = NULL; + + /* Create or attach to the shared memory state */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + current_trace.sampled = 0; + pg_tracing = ShmemInitStruct("PgTracing Shared", sizeof(pgTracingSharedState), &found_pg_tracing); + shared_spans = ShmemInitStruct("PgTracing Spans", + sizeof(pgTracingSpans) + pg_tracing_max_span * sizeof(Span), + &found_shared_spans); + pg_tracing_parallel = ShmemInitStruct("PgTracing Parallel Workers Context", + sizeof(pgTracingParallelWorkers) + pg_tracing_max_traced_parallel_workers * sizeof(pgTracingParallelContext), + &found_parallel); + + /* Initialize pg_tracing memory context */ + pg_tracing_mem_ctx = AllocSetContextCreate(TopMemoryContext, + "pg_tracing memory context", + ALLOCSET_DEFAULT_SIZES); + + /* First time, let's init shared state */ + if (!found_pg_tracing) + { + pg_tracing->stats = get_empty_pg_tracing_stats(); + SpinLockInit(&pg_tracing->mutex); + SpinLockInit(&pg_tracing_parallel->mutex); + } + if (!found_shared_spans) + { + shared_spans->end = 0; + shared_spans->max = pg_tracing_max_span; + } + if (!found_parallel) + { + for (int i = 0; i < pg_tracing_max_traced_parallel_workers; i++) + pg_tracing_parallel->trace_contexts[i].leader_backend_id = InvalidBackendId; + } + LWLockRelease(AddinShmemInitLock); +} + +/* + * Add span to the shared memory. Mutex must be acquired beforehand. 
+ */ +static void +add_span_to_shared_buffer_locked(const Span * span) +{ + /* Spans must be ended before adding them to the shared buffer */ + Assert(span->ended); + if (shared_spans->end + 1 >= shared_spans->max) + pg_tracing->stats.dropped_spans++; + else + { + pg_tracing->stats.spans++; + shared_spans->spans[shared_spans->end++] = *span; + } +} + +/* + * Check if we still have available space in the shared spans. + * + * Between the moment we check and the moment we want to insert, the buffer + * may be full but we redo a check before appending the span. This is done + * early when starting a top span to bail out early if the buffer is + * already full since we don't immediately add the span in the shared buffer. + */ +static bool +check_full_shared_spans() +{ + if (shared_spans->end + 1 >= shared_spans->max) + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + bool full_buffer = false; + + SpinLockAcquire(&s->mutex); + if (pg_tracing_drop_full_buffer) + { + s->stats.dropped_spans += shared_spans->end; + shared_spans->end = 0; + s->extent = 0; + } + else + { + full_buffer = true; + s->stats.dropped_spans++; + } + SpinLockRelease(&s->mutex); + return full_buffer; + } + return false; +} + +/* + * Process a query descriptor: Gather all query instrumentation in the query + * span counters and generate span nodes from queryDesc planstate + */ +static void +process_query_desc(QueryDesc *queryDesc, int sql_error_code) +{ + NodeCounters *node_counters = &get_top_span_for_nested_level(exec_nested_level)->node_counters; + + /* Process total counters */ + if (queryDesc->totaltime) + { + InstrEndLoop(queryDesc->totaltime); + node_counters->buffer_usage = queryDesc->totaltime->bufusage; + node_counters->wal_usage = queryDesc->totaltime->walusage; + } + /* Process JIT counter */ + if (queryDesc->estate->es_jit) + node_counters->jit_usage = queryDesc->estate->es_jit->instr; + node_counters->rows = queryDesc->estate->es_total_processed; + + 
/* Process planstate */ + if (queryDesc->planstate) + { + Bitmapset *rels_used = NULL; + planstateTraceContext planstateTraceContext; + uint64 parent_id = per_level_buffers[exec_nested_level].executor_id; + + /* Prepare the planstate context for deparsing */ + planstateTraceContext.rtable_names = select_rtable_names_for_explain(queryDesc->plannedstmt->rtable, rels_used); + planstateTraceContext.trace = ¤t_trace; + planstateTraceContext.ancestors = NULL; + planstateTraceContext.sql_error_code = sql_error_code; + planstateTraceContext.deparse_ctx = deparse_context_for_plan_tree(queryDesc->plannedstmt, planstateTraceContext.rtable_names); + + generate_span_from_planstate(queryDesc->planstate, &planstateTraceContext, parent_id); + } +} + +/* + * If trace was started from the global sample rate without + * a parent trace, we need to generate a random trace id + */ +static void +set_traceparent_trace_id(struct pgTracingTrace *traceparent_parameter) +{ + if (traceparent_parameter->trace_id != 0) + return; + + /* + * We leave parent_id to 0 as a way to indicate that this is a standalone + * trace. The top span created with a parent_id == 0 will have trace_id == + * parent_id == span_id. 
+ */ + Assert(traceparent_parameter->parent_id == 0); + traceparent_parameter->trace_id = pg_prng_uint64(&pg_global_prng_state); +} + +/* + * Decide whether a query should be sampled depending on the traceparent sampled + * and the provided sample rate configurations + */ +static bool +apply_sample_rate(struct pgTracingTrace const *traceparent_parameter) +{ + double rand; + bool sampled; + + /* Everything is sampled */ + if (pg_tracing_sample_rate >= 1.0) + return true; + + /* No SQLCommenter sampled and no global sample rate */ + if (traceparent_parameter->sampled == 0 && pg_tracing_sample_rate == 0.0) + return false; + + /* SQLCommenter sampled caller sample */ + if (traceparent_parameter->sampled == 1 && pg_tracing_caller_sample_rate >= 1.0) + return true; + + /* + * We have a non zero global sample rate or a sampled flag. Either way, we + * need a rand value + */ + rand = pg_prng_double(&pg_global_prng_state); + if (traceparent_parameter->sampled == 1) + /* Sampled flag case */ + sampled = (rand < pg_tracing_caller_sample_rate); + else + /* Global rate case */ + sampled = (rand < pg_tracing_sample_rate); + return sampled; +} + +/* + * If the query was started as a prepared statement, we won't be able to + * extract traceparent during query parsing since parsing was skipped. + * + * We assume that SQLCommenter content can be passed as a text in the + * first parameter. 
+ */ +static void +check_traceparameter_in_parameter(ParamListInfo params) +{ + Oid typoutput; + bool typisvarlena; + char *pstring; + ParamExternData param; + + if (params == NULL || params->numParams == 0) + return; + + param = params->params[0]; + /* Check first parameter type */ + if (param.ptype != TEXTOID) + return; + + /* Get the text of the first string parameter */ + getTypeOutputInfo(param.ptype, &typoutput, &typisvarlena); + pstring = OidOutputFunctionCall(typoutput, param.value); + extract_traceparent(¤t_trace, pstring, true); +} + +/* + * Check whether the current query should be sampled or not + */ +static void +check_query_sampled(ParseState *pstate, const ParamListInfo params) +{ + TimestampTz statement_start_ts; + + /* Safety check... */ + if (!pg_tracing || !pg_tracing_enabled(exec_nested_level)) + return; + + /* sampling already started */ + if (current_trace.sampled > 0) + return; + + /* Don't start tracing if we're not at the top */ + if (exec_nested_level > 0) + return; + + /* + * In a parallel worker, check the parallel context shared buffer to see + * if the leader left a trace context + */ + if (IsParallelWorker()) + { + fetch_parallel_context(¤t_trace); + return; + } + + Assert(max_nested_level = -1); + + current_trace.trace_id = 0; + current_trace.parent_id = 0; + if (params != NULL) + check_traceparameter_in_parameter(params); + else if (pstate != NULL) + extract_traceparent(¤t_trace, pstate->p_sourcetext, false); + + /* + * A statement can go through this function 3 times: post parsing, planner + * and executor run. If no sampling flag was extracted from SQLCommenter + * and we've already seen this statement (using statement_start ts), don't + * try to apply sample rate and exit. 
+ */ + statement_start_ts = GetCurrentStatementStartTimestamp(); + if (current_trace.sampled == 0 && last_statement_check_for_sampling == statement_start_ts) + return; + /* First time we see this statement, save the time */ + last_statement_check_for_sampling = statement_start_ts; + + current_trace.sampled = apply_sample_rate(¤t_trace); + + if (current_trace.sampled > 0 && check_full_shared_spans()) + /* Buffer is full, abort sampling */ + current_trace.sampled = 0; + + if (current_trace.sampled > 0) + set_traceparent_trace_id(¤t_trace); +} + +/* + * Adjust span's offsets with the provided file offset + */ +static void +adjust_file_offset(Span * span, Size file_position) +{ + if (span->name_offset != -1) + span->name_offset += file_position; + if (span->operation_name_offset != -1) + span->operation_name_offset += file_position; + if (span->parameter_offset != -1) + span->parameter_offset += file_position; +} + +/* + * Get the start time for a new top span + */ +static int64 +get_top_span_start(int64 *start_time_ns) +{ + if (exec_nested_level == 0) + { + /* For the top span of parallel worker, we get the current ns */ + if (IsParallelWorker()) + { + if (start_time_ns != NULL) + return *start_time_ns; + return get_current_ns(); + } + /* For the root top span, use the trace start */ + Assert(current_trace.start_trace.ns > 0); + return current_trace.start_trace.ns; + } + + /* + * We're in a nested query, if nested_query_start is not set, we haven't + * gone through ProcessUtility. Set the start at the current time in this + * case. + */ + if (nested_query_start_ns == 0) + { + if (start_time_ns != NULL) + return *start_time_ns; + return get_current_ns(); + } + + /* + * nested_query_start is available, use it + */ + return nested_query_start_ns; +} + +/* + * Reset pg_tracing memory context and global state. 
+ */ +static void +cleanup_tracing(void) +{ + if (current_trace.sampled == 0) + /* No need for cleaning */ + return; + remove_parallel_context(); + MemoryContextReset(pg_tracing_mem_ctx); + current_trace.sampled = 0; + current_trace.trace_id = 0; + current_trace.parent_id = 0; + max_nested_level = -1; + current_trace_spans = NULL; + per_level_buffers = NULL; +} + +/* + * End the query tracing and dump all spans in the shared buffer. + * This may happen either when query is finished or on a caught error. + */ +static void +end_tracing(const int64 *end_span_ns) +{ + Size file_position = 0; + + /* We're still a nested query, tracing is not finished */ + if (exec_nested_level > 0) + return; + + /* Dump all buffered texts in file */ + text_store_file(pg_tracing, current_trace_text->data, current_trace_text->len, &file_position); + + for (int i = 0; i < current_trace_spans->end; i++) + adjust_file_offset(current_trace_spans->spans + i, file_position); + + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + /* We're at the end, add all spans to the shared memory */ + for (int i = 0; i < current_trace_spans->end; i++) + add_span_to_shared_buffer_locked(¤t_trace_spans->spans[i]); + SpinLockRelease(&s->mutex); + } + + /* We can reset the memory context here */ + cleanup_tracing(); +} + +/* + * When we catch an error (timeout, cancel query), we need to flag the ongoing + * span with an error, send current spans in the shared buffer and clean + * our memory context + */ +static void +handle_pg_error(int span_index, QueryDesc *queryDesc) +{ + Span *span; + Span *top_span; + int64 end_span_ns; + int sql_error_code; + + /* If we're not sampling the query, bail out */ + if (current_trace.sampled == 0 && !pg_tracing_enabled(exec_nested_level)) + return; + sql_error_code = geterrcode(); + + /* + * Order matters there. We want to process query desc before getting the + * end time. 
Otherwise, the span nodes will have a higher duration than + * their parents. + */ + if (queryDesc != NULL) + process_query_desc(queryDesc, sql_error_code); + + /* Assign the error code to the latest span */ + span = get_span_from_index(span_index); + span->sql_error_code = sql_error_code; + + /* Assign the error code to the latest top span */ + top_span = get_top_span_for_nested_level(exec_nested_level); + top_span->sql_error_code = sql_error_code; + + end_span_ns = get_current_ns(); + /* End current span, ongoing top spans and tracing */ + for (int i = 0; i <= exec_nested_level; i++) + end_top_span(i, end_span_ns); + end_span(span, &end_span_ns); + end_tracing(&end_span_ns); +} + +/* + * Get the parent id for the given nested level + * + * nested_level can be negative when creating the root top span. + * In this case, we use the parent id from the propagated trace context. + */ +static uint64 +get_parent_id(int nested_level) +{ + Span *top_span; + + if (nested_level < 0) + return current_trace.parent_id; + Assert(nested_level <= allocated_nested_level); + top_span = get_top_span_for_nested_level(nested_level); + Assert(top_span->span_id > 0); + return top_span->span_id; +} + +/* + * If we enter a new nested level, initialize all necessary buffers + * and start_trace timer. + * + * The start of a top span can vary: prepared statement will skip parsing, the use of cached + * plans will skip the planner hook. + * Thus, a top span can start in either post parse, planner hook or executor run. + * + * Since this is called after the we've detected the start of a trace, we check for available + * space in the buffer. If the buffer is full, we abort tracing by setting sampled to false. + * Callers need to check that tracing was aborted. 
+ */ +static bool +initialize_trace_level(void) +{ + /* Check if we've already created a top span for this nested level */ + if (exec_nested_level <= max_nested_level) + return false; + + /* First time */ + if (max_nested_level == -1) + { + MemoryContext oldcxt; + + Assert(pg_tracing_mem_ctx->isReset); + + /* + * We need to be able to pass information that depends on the nested + * level. + * + * executor span ids: Since an executor run becomes the parent span, + * we need subsequent created node spans to have the correct parent. + * + * query ids: used to propagate queryId to all spans in the same level + * + * top spans: We create one top span per nested level and those are + * only inserted in the shared buffer at the end + */ + oldcxt = MemoryContextSwitchTo(pg_tracing_mem_ctx); + /* initial allocation */ + allocated_nested_level = 1; + per_level_buffers = palloc0(allocated_nested_level * sizeof(pgTracingPerLevelBuffer)); + /* 0 is a valid index, initialise all indexes to -1 */ + for (int i = 0; i < allocated_nested_level; i++) + per_level_buffers[i].top_span_index = -1; + current_trace_spans = palloc0(pg_tracing_initial_allocated_spans * sizeof(Span)); + current_trace_spans->max = pg_tracing_initial_allocated_spans; + current_trace_text = makeStringInfo(); + + MemoryContextSwitchTo(oldcxt); + + /* + * Parallel workers will get the referential timers from the leader + * parallel context + */ + if (!IsParallelWorker()) + { + /* Start referential timers */ + current_trace.start_trace.ts = GetCurrentStatementStartTimestamp(); + + /* + * We can't get the monotonic clock at the start of the statement + * so let's estimate it here. 
+ */ + current_trace.start_trace.ns = get_current_ns() - (GetCurrentTimestamp() - current_trace.start_trace.ts) * NS_PER_US; + } + } + else if (exec_nested_level >= allocated_nested_level) + { + /* New nested level, allocate more memory */ + MemoryContext oldcxt; + int old_allocated_nested_level = allocated_nested_level; + + oldcxt = MemoryContextSwitchTo(pg_tracing_mem_ctx); + allocated_nested_level++; + per_level_buffers = repalloc0(per_level_buffers, old_allocated_nested_level * sizeof(pgTracingPerLevelBuffer), allocated_nested_level * sizeof(pgTracingPerLevelBuffer)); + for (int i = old_allocated_nested_level; i < allocated_nested_level; i++) + per_level_buffers[i].top_span_index = -1; + MemoryContextSwitchTo(oldcxt); + } + + max_nested_level = exec_nested_level; + if (exec_nested_level == 0) + { + /* Start of a new trace */ + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->stats.traces++; + SpinLockRelease(&s->mutex); + } + return true; +} + +/* + * Add a new string to the current_trace_text stringinfo + */ +static int +add_str_to_trace_buffer(const char *str, int str_len) +{ + int position = current_trace_text->cursor; + + appendBinaryStringInfo(current_trace_text, str, str_len); + appendStringInfoChar(current_trace_text, '\0'); + current_trace_text->cursor = current_trace_text->len; + return position; +} + +/* + * Add the worker name to the current_trace_text stringinfo + */ +static int +add_worker_name_to_trace_buffer() +{ + int position = current_trace_text->cursor; + + appendStringInfo(current_trace_text, "Worker %d", ParallelWorkerNumber); + appendStringInfoChar(current_trace_text, '\0'); + current_trace_text->cursor = current_trace_text->len; + return position; +} + +/* + * Start a new top span if we've entered a new nested level + */ +static void +begin_top_span(Span * top_span, CmdType commandType, + const Query *query, const JumbleState *jstate, + const char *query_text, int64 
start_time_ns) +{ + int query_len; + const char *normalised_query; + + /* in case of a cached plan, query might be unavailable */ + if (query != NULL) + per_level_buffers[exec_nested_level].query_id = query->queryId; + else if (current_trace.query_id > 0) + per_level_buffers[exec_nested_level].query_id = current_trace.query_id; + + begin_span(top_span, command_type_to_span_type(commandType), + ¤t_trace, get_parent_id(exec_nested_level - 1), per_level_buffers[exec_nested_level].query_id, &start_time_ns, exec_nested_level); + top_span->is_top_span = true; + + if (IsParallelWorker()) + { + /* + * In a parallel worker, we use the worker name as the span's + * operation + */ + top_span->operation_name_offset = add_worker_name_to_trace_buffer(); + return; + } + + if (jstate && jstate->clocations_count > 0 && query != NULL) + { + /* jstate is available, normalise query and extract parameters' values */ + char *paramStr; + + query_len = query->stmt_len; + normalised_query = normalise_query_parameters(jstate, query_text, + query->stmt_location, + &query_len, ¶mStr); + top_span->parameter_offset = add_str_to_trace_buffer(paramStr, strlen(paramStr)); + } + else + { + /* + * No jstate available, normalise query but we won't be able to + * extract parameters + */ + int stmt_location; + + if (query != NULL && query->stmt_len > 0) + { + query_len = query->stmt_len; + stmt_location = query->stmt_location; + } + else + { + query_len = strlen(query_text); + stmt_location = 0; + } + normalised_query = normalise_query(query_text, stmt_location, &query_len); + } + top_span->operation_name_offset = add_str_to_trace_buffer(normalised_query, query_len); +} + +/* + * Initialise buffer if we are in a new nested level and start top span. + * If the top span already exists for the current nested level, this has no effect. 
+ */ +static int64 +initialize_trace_level_and_top_span(CmdType commandType, Query *query, JumbleState *jstate, const char *query_text, int64 *start_time_ns) +{ + bool new_nested_level = initialize_trace_level(); + int64 top_span_start_ns = get_top_span_start(start_time_ns); + Span *current_top = get_top_span_for_nested_level(exec_nested_level); + + /* current_trace_spans buffer should have been allocated */ + Assert(current_trace_spans != NULL); + if (new_nested_level) + { + begin_top_span(current_top, commandType, query, jstate, query_text, top_span_start_ns); + } + else if (current_top->ended == true) + { + /* + * It's possible to have multiple top spans within a nested query. If + * the previous top span was closed, start a new one + */ + int span_index = get_index_from_trace_spans(); + Span *span = get_span_from_index(span_index); + + set_top_span(exec_nested_level, span_index); + begin_top_span(span, commandType, query, jstate, query_text, top_span_start_ns); + } + return top_span_start_ns; +} + +/* + * Post-parse-analyze hook + * + * Tracing can be started here if: + * - The query has a SQLCommenter with traceparent parameter with sampled flag on and passes the caller_sample_rate + * - The query passes the sample_rate + * + * If query is sampled, create a post parse span and start the top span. + */ +static void +pg_tracing_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate) +{ + Span *post_parse_span; + int post_parse_index = -1; + int64 start_post_parse_ns; + uint64 parent_id; + int64 top_span_start_ns; + + if (prev_post_parse_analyze_hook) + prev_post_parse_analyze_hook(pstate, query, jstate); + + /* + * In case there's an multi statement transaction using extended protocol, + * we will have the PREPARE of the next statement before we go through the + * ExecutorEnd of the previous statement. + * + * For now, we skip parse span and will only start the next statement + * tracing during planner hook. 
+ * + * TODO: Find a way to keep parse span for the next statement + */ + if (!pg_tracing_mem_ctx->isReset && exec_nested_level == 0) + return; + + /* If disabled, don't trace utility statement */ + if (query->utilityStmt && !pg_tracing_track_utility) + return; + + /* Evaluate if query is sampled or not */ + check_query_sampled(pstate, NULL); + + if (current_trace.sampled == 0) + /* Query is not sampled, nothing to do */ + return; + + /* + * We want to avoid calling get_ns at the start of post parse as it will + * impact all queries and we will only use it when the query is sampled. + */ + start_post_parse_ns = get_current_ns(); + + /* + * Either we're inside a nested sampled query or we've parsed a query with + * the sampled flag, start a new level with a top span + */ + top_span_start_ns = initialize_trace_level_and_top_span(query->commandType, query, jstate, pstate->p_sourcetext, &start_post_parse_ns); + + parent_id = get_parent_id(exec_nested_level); + Assert(parent_id > 0); + + /* + * We only create parse span when we have a good idea of when the parse + * start. For nested queries created from the queryDesc + * (nested_query_start_ns == 0), we can't get a reliable time for the + * parse start. We don't create Parse span in this case for now. 
+ */ + if (exec_nested_level == 0 || nested_query_start_ns > 0) + { + Span *parse_span = NULL; + + /* We don't have a precise time for parse end, estimate it */ + int64 end_parse_ns = start_post_parse_ns - 1000; + int parse_index = get_index_from_trace_spans(); + + parse_span = get_span_from_index(parse_index); + begin_span(parse_span, SPAN_PARSE, + ¤t_trace, parent_id, per_level_buffers[exec_nested_level].query_id, + &top_span_start_ns, exec_nested_level); + end_span(parse_span, &end_parse_ns); + } + + /* Post parse span */ + post_parse_index = get_index_from_trace_spans(); + post_parse_span = get_span_from_index(post_parse_index); + begin_span(post_parse_span, SPAN_POST_PARSE, + ¤t_trace, + parent_id, per_level_buffers[exec_nested_level].query_id, + &start_post_parse_ns, exec_nested_level); + end_span(post_parse_span, NULL); +} + +/* + * Planner hook + * Tracing can start here if the query skipped parsing (prepared query) and passes the random sample_rate. + * If query is sampled, create a plan span and start the top span if not already started. + */ +static PlannedStmt * +pg_tracing_planner_hook(Query *query, + const char *query_string, + int cursorOptions, + ParamListInfo params) +{ + PlannedStmt *result; + int span_planner_index = -1; + Span *span_planner; + + /* Evaluate if query is sampled or not */ + check_query_sampled(NULL, params); + + if (current_trace.sampled == 0 || !pg_tracing_enabled(plan_nested_level + exec_nested_level)) + { + /* No sampling */ + if (prev_planner_hook) + result = prev_planner_hook(query, query_string, cursorOptions, + params); + else + result = standard_planner(query, query_string, cursorOptions, + params); + return result; + } + + /* + * We may have skipped parsing if statement was prepared, create a new top + * span in this case. 
+ */ + if (current_trace.sampled == 1 && pg_tracing_enabled(plan_nested_level + exec_nested_level)) + initialize_trace_level_and_top_span(query->commandType, query, NULL, query_string, NULL); + + /* Start planner span */ + span_planner_index = get_index_from_trace_spans(); + span_planner = get_span_from_index(span_planner_index); + begin_span(span_planner, SPAN_PLANNER, ¤t_trace, + get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, NULL, exec_nested_level); + + plan_nested_level++; + PG_TRY(); + { + if (prev_planner_hook) + result = prev_planner_hook(query, query_string, cursorOptions, + params); + else + result = standard_planner(query, query_string, cursorOptions, + params); + } + PG_CATCH(); + { + plan_nested_level--; + handle_pg_error(span_planner_index, NULL); + PG_RE_THROW(); + } + PG_END_TRY(); + plan_nested_level--; + + /* End planner span */ + end_span_index(span_planner_index, NULL); + + /* If we have a prepared statement, add bound parameters to the top span */ + if (params != NULL) + { + char *paramStr = BuildParamLogString(params, + NULL, pg_tracing_max_parameter_str); + Span *current_top = get_top_span_for_nested_level(exec_nested_level); + + Assert(current_top->parameter_offset == -1); + if (paramStr != NULL) + current_top->parameter_offset = add_str_to_trace_buffer(paramStr, strlen(paramStr)); + } + return result; +} + +/* + * ExecutorStart hook: Activate query instrumentation if query is sampled + * Tracing can be started here if the query used a cached plan and passes the random sample_rate + * If query is sampled, start the top span if not already started. 
+ */ +static void +pg_tracing_ExecutorStart(QueryDesc *queryDesc, int eflags) +{ + int executor_start_span_index = -1; + Span *executor_span; + int instrument_options = INSTRUMENT_TIMER | INSTRUMENT_ROWS; + + /* Evaluate if query is sampled or not */ + check_query_sampled(NULL, queryDesc->params); + + if (pg_tracing_instrument_buffers) + instrument_options |= INSTRUMENT_BUFFERS; + if (pg_tracing_instrument_wal) + instrument_options |= INSTRUMENT_WAL; + + if (current_trace.sampled > 0 && pg_tracing_enabled(exec_nested_level)) + { + /* + * In case of a cached plan, we haven't gone through neither parsing + * nor planner hook. Parallel workers will also start tracing from + * there. Create the top case in this case. + */ + initialize_trace_level_and_top_span(queryDesc->operation, NULL, NULL, queryDesc->sourceText, NULL); + + /* Start executorStart Span */ + executor_start_span_index = get_index_from_trace_spans(); + executor_span = get_span_from_index(executor_start_span_index); + begin_span(executor_span, SPAN_EXECUTOR_START, ¤t_trace, get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, NULL, exec_nested_level); + + /* + * Activate query instrumentation. Timer and rows instrumentation are + * mandatory. + */ + queryDesc->instrument_options = instrument_options; + } + + if (prev_ExecutorStart) + prev_ExecutorStart(queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); + + if (executor_start_span_index != -1) + { + /* Allocate instrumentation in the per-query context */ + if (queryDesc->totaltime == NULL) + { + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); + queryDesc->totaltime = InstrAlloc(1, instrument_options, false); + queryDesc->planstate->instrument = InstrAlloc(1, instrument_options, false); + MemoryContextSwitchTo(oldcxt); + } + end_span_index(executor_start_span_index, NULL); + } +} + +/* + * ExecutorRun hook: track nesting depth and create executor run span. 
+ * If the plan needs to create parallel workers, push the trace context in the parallel shared buffer. + */ +static void +pg_tracing_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, + uint64 count, bool execute_once) +{ + int span_index = -1; + + if (current_trace.sampled > 0 && pg_tracing_enabled(exec_nested_level)) + { + Span *executor_run_span; + + /* Start executor run span */ + span_index = get_index_from_trace_spans(); + executor_run_span = get_span_from_index(span_index); + begin_span(executor_run_span, SPAN_EXECUTOR_RUN, ¤t_trace, get_parent_id(exec_nested_level), + per_level_buffers[exec_nested_level].query_id, NULL, exec_nested_level); + + /* + * Executor run is used as the parent's span for nodes, store its' id + * in the per level buffer. + */ + per_level_buffers[exec_nested_level].executor_id = executor_run_span->span_id; + + /* + * If this query starts parallel worker, push the trace context for + * the child processes + */ + if (queryDesc->plannedstmt->parallelModeNeeded) + add_parallel_context(executor_run_span->span_id, per_level_buffers[exec_nested_level].query_id); + } + + exec_nested_level++; + PG_TRY(); + { + if (prev_ExecutorRun) + prev_ExecutorRun(queryDesc, direction, count, execute_once); + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); + } + PG_CATCH(); + { + exec_nested_level--; + handle_pg_error(span_index, queryDesc); + PG_RE_THROW(); + } + PG_END_TRY(); + exec_nested_level--; + + /* + * When a query is executing the pg_tracing_spans function, sampling will + * be disabled while we have an ongoing trace and current_trace_spans will + * be NULL. Check that span_index != -1 isn't enough in this case, we need + * to check that current_trace_spans still exists. 
+ */ + if (current_trace_spans != NULL) + end_span_index(span_index, NULL); +} + +/* + * ExecutorFinish hook: create executor finish span and track nesting depth + */ +static void +pg_tracing_ExecutorFinish(QueryDesc *queryDesc) +{ + int span_index = -1; + Span *span = NULL; + + if (current_trace.sampled > 0 && pg_tracing_enabled(exec_nested_level)) + { + span_index = get_index_from_trace_spans(); + span = get_span_from_index(span_index); + begin_span(span, SPAN_EXECUTOR_FINISH, ¤t_trace, get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, NULL, exec_nested_level); + } + + exec_nested_level++; + PG_TRY(); + { + if (prev_ExecutorFinish) + prev_ExecutorFinish(queryDesc); + else + standard_ExecutorFinish(queryDesc); + } + PG_CATCH(); + { + exec_nested_level--; + handle_pg_error(span_index, queryDesc); + PG_RE_THROW(); + } + PG_END_TRY(); + exec_nested_level--; + + end_span_index(span_index, NULL); +} + +/* + * ExecutorEnd hook: + * - process queryDesc to generate span from planstate + * - end top span for the current nested level + * - end tracing if we're at the root nested level + */ +static void +pg_tracing_ExecutorEnd(QueryDesc *queryDesc) +{ + int executor_end_index = -1; + + if (current_trace.sampled > 0 && pg_tracing_enabled(exec_nested_level)) + { + Span *executor_end_span; + + /* Create executor end span */ + executor_end_index = get_index_from_trace_spans(); + executor_end_span = get_span_from_index(executor_end_index); + begin_span(executor_end_span, SPAN_EXECUTOR_END, ¤t_trace, get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, + NULL, exec_nested_level); + + /* Query finished normally, send 0 as error code */ + process_query_desc(queryDesc, 0); + } + + if (prev_ExecutorEnd) + prev_ExecutorEnd(queryDesc); + else + standard_ExecutorEnd(queryDesc); + + if (executor_end_index > -1) + { + int64 end_span_ns = get_current_ns(); + + /* End top span, executor_end span and tracing */ + 
end_top_span(exec_nested_level, end_span_ns); + end_span_index(executor_end_index, &end_span_ns); + end_tracing(&end_span_ns); + } +} + +/* + * ProcessUtility hook + * + * Trace utility query if utility tracking is enabled and sampling was enabled + * during parse step. + */ +static void +pg_tracing_ProcessUtility(PlannedStmt *pstmt, const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, QueryEnvironment *queryEnv, + DestReceiver *dest, QueryCompletion *qc) +{ + Span *process_utility_span; + Span *previous_top_span; + int process_utility_span_index = -1; + int previous_top_span_index = -1; + int64 end_span_ns; + int64 start_span_ns; + + if (!pg_tracing_track_utility || current_trace.sampled == 0 || !pg_tracing_enabled(exec_nested_level)) + { + if (prev_ProcessUtility) + prev_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + else + standard_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + + /* + * Tracing may have been started within the utility call without going + * through ExecutorEnd (ex: prepare statement). Check and end tracing + * here in this case. + */ + if (!pg_tracing_mem_ctx->isReset) + { + Assert(current_trace.sampled); + Assert(exec_nested_level == 0); + end_tracing(NULL); + } + return; + } + + start_span_ns = get_current_ns(); + + /* + * Set a possible start for nested query. ProcessUtility may start a + * nested query and we can't use the statement start like we usually do. + * We artificially start nested 1us later to avoid overlapping spans. + */ + nested_query_start_ns = start_span_ns + 1000; + + /* Build the process utility span. 
*/ + process_utility_span_index = get_index_from_trace_spans(); + process_utility_span = get_span_from_index(process_utility_span_index); + begin_span(process_utility_span, SPAN_PROCESS_UTILITY, ¤t_trace, + get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, &start_span_ns, exec_nested_level); + + /* Extract and normalise the query */ + if (queryString != NULL && pstmt->stmt_location != -1 && pstmt->stmt_len > 0) + { + int query_len = pstmt->stmt_len; + const char *normalised_query = normalise_query(queryString, pstmt->stmt_location, &query_len); + + process_utility_span->operation_name_offset = add_str_to_trace_buffer(normalised_query, query_len); + } + + /* + * Set ProcessUtility span as the new top span for the nested queries that + * may be created. We need to keep the old top span index to end it at the + * end of the process utility hook. + */ + previous_top_span_index = per_level_buffers[exec_nested_level].top_span_index; + set_top_span(exec_nested_level, process_utility_span_index); + + exec_nested_level++; + PG_TRY(); + { + if (prev_ProcessUtility) + prev_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + else + standard_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + } + PG_CATCH(); + { + exec_nested_level--; + handle_pg_error(previous_top_span_index, NULL); + PG_RE_THROW(); + } + PG_END_TRY(); + end_span_ns = get_current_ns(); + + /* ProcessUtility may have created a top span, end it. 
*/ + if (exec_nested_level <= max_nested_level && per_level_buffers[exec_nested_level].top_span_index != -1) + { + Span *top_span = get_top_span_for_nested_level(exec_nested_level); + + if (top_span->ended == false) + end_top_span(exec_nested_level, end_span_ns); + } + exec_nested_level--; + + /* + * buffer may have been repalloced, refresh the process_utility_span + * variable + */ + process_utility_span = get_span_from_index(process_utility_span_index); + if (qc != NULL) + process_utility_span->node_counters.rows = qc->nprocessed; + /* ProcessUtility is the current top span, no need to call end_top_span */ + end_span(process_utility_span, &end_span_ns); + + /* Also end the previous top span */ + previous_top_span = get_span_from_index(previous_top_span_index); + end_span(previous_top_span, &end_span_ns); + + Assert(previous_top_span->start < process_utility_span->start); + Assert(get_span_end_ns(previous_top_span) >= get_span_end_ns(process_utility_span)); + + end_tracing(&end_span_ns); + + /* + * We may iterate through a list of nested ProcessUtility calls, update + * the nested query start. + */ + nested_query_start_ns = end_span_ns + 1000; +} + +/* + * Iterate over a list of planstate to generate span node + */ +static void +generate_member_nodes(PlanState **planstates, int nplans, planstateTraceContext * planstateTraceContext, uint64 parent_id) +{ + int j; + + for (j = 0; j < nplans; j++) + generate_span_from_planstate(planstates[j], planstateTraceContext, parent_id); +} + +/* + * Iterate over custom scan planstates to generate span node + */ +static void +generate_span_from_custom_scan(CustomScanState *css, planstateTraceContext * planstateTraceContext, uint64 parent_id) +{ + ListCell *cell; + + foreach(cell, css->custom_ps) + generate_span_from_planstate((PlanState *) lfirst(cell), planstateTraceContext, parent_id); +} + +/* + * With nested queries, the post parse hook will be called before we know + * the top span parent. 
We will only know and the parent when we generate span nodes from planstate. + * + * When those nodes are created, we attach the top span. + */ +static void +attach_top_span_to_new_parent(Span const *new_parent, int64 parent_end_ns_time) +{ + Span *child_top_span; + int64 child_end_ns_time; + + if (exec_nested_level + 1 > max_nested_level) + /* No child to attach */ + return; + + child_top_span = get_top_span_for_nested_level(exec_nested_level + 1); + Assert(child_top_span->start_ns_time > 0); + /* If child start after parent's end, don't link it */ + if (parent_end_ns_time < child_top_span->start_ns_time) + return; + child_end_ns_time = child_top_span->start_ns_time + child_top_span->duration_ns; + /* If child ends before parent's start, don't link it */ + if (child_end_ns_time < new_parent->start_ns_time) + return; + + /* Child span should already have ended */ + Assert(child_top_span->ended == true); + /* Link this child to the new parent */ + child_top_span->parent_id = new_parent->span_id; +} + +/* + * Create span node for the provided planstate + */ +static uint64 +create_span_node(PlanState *planstate, planstateTraceContext * planstateTraceContext, uint64 parent_id, + SpanType span_type, char *subplan_name) +{ + Span *span; + int span_index = -1; + char const *span_name; + char const *operation_name; + Plan const *plan = planstate->plan; + int64 span_end_ns; + int64 start_span_ns; + + /* + * Make sure stats accumulation is done. Function is a no-op if if was + * already done. 
+ */ + InstrEndLoop(planstate->instrument); + + /* We only create span node on node that were executed */ + Assert(!INSTR_TIME_IS_ZERO(planstate->instrument->firsttime)); + + start_span_ns = INSTR_TIME_GET_NANOSEC(planstate->instrument->firsttime); + span_index = get_index_from_trace_spans(); + span = get_span_from_index(span_index); + + begin_span(span, span_type, planstateTraceContext->trace, parent_id, per_level_buffers[exec_nested_level].query_id, + &start_span_ns, exec_nested_level); + + /* first tuple time */ + span->startup = planstate->instrument->startup * NS_PER_S; + + if (span_type == SPAN_NODE) + { + /* Generate names and store them */ + span_name = plan_to_span_name(plan); + operation_name = plan_to_operation(planstateTraceContext, planstate, span_name); + + span->name_offset = add_str_to_trace_buffer(span_name, strlen(span_name)); + span->operation_name_offset = add_str_to_trace_buffer(operation_name, strlen(operation_name)); + } + else if (subplan_name != NULL) + { + span->operation_name_offset = add_str_to_trace_buffer(subplan_name, strlen(subplan_name)); + } + + span->node_counters.rows = (int64) planstate->instrument->ntuples / planstate->instrument->nloops; + span->node_counters.nloops = (int64) planstate->instrument->nloops; + span->node_counters.buffer_usage = planstate->instrument->bufusage; + span->node_counters.wal_usage = planstate->instrument->walusage; + + span->plan_counters.startup_cost = plan->startup_cost; + span->plan_counters.total_cost = plan->total_cost; + span->plan_counters.plan_rows = plan->plan_rows; + span->plan_counters.plan_width = plan->plan_width; + + if (!planstate->state->es_finished) + { + /* + * If the query is in an unfinished state, it means that we're in an + * error handler. Stop the node instrumentation to get the latest + * known state. 
+ */ + InstrStopNode(planstate->instrument, planstate->state->es_processed); + InstrEndLoop(planstate->instrument); + span->sql_error_code = planstateTraceContext->sql_error_code; + } + span_end_ns = INSTR_TIME_GET_NANOSEC(planstate->instrument->firsttime); + Assert(planstate->instrument->total > 0); + span_end_ns += planstate->instrument->total * NS_PER_S; + end_span(span, &span_end_ns); + + /* + * If we have a Result node, make it the span parent of the next query + * span if we have any + * + * TODO: Do we have other node that could be the base for nested queries? + */ + if (nodeTag(plan) == T_Result || nodeTag(plan) == T_ProjectSet) + attach_top_span_to_new_parent(span, span_end_ns); + return span->span_id; +} + +/* + * Walk through the planstate tree generating a node span for each node. + * We pass possible error code to tag unfinished node with it + */ +static void +generate_span_from_planstate(PlanState *planstate, planstateTraceContext * planstateTraceContext, uint64 parent_id) +{ + ListCell *l; + Plan const *plan = planstate->plan; + uint64 span_id; + + /* The node was never executed, skip it */ + if (INSTR_TIME_IS_ZERO(planstate->instrument->firsttime)) + return; + + span_id = create_span_node(planstate, planstateTraceContext, parent_id, SPAN_NODE, NULL); + planstateTraceContext->ancestors = lcons(planstate->plan, planstateTraceContext->ancestors); + + /* Walk the lefttree */ + if (outerPlanState(planstate)) + generate_span_from_planstate(outerPlanState(planstate), planstateTraceContext, span_id); + /* Walk the righttree */ + if (innerPlanState(planstate)) + generate_span_from_planstate(innerPlanState(planstate), planstateTraceContext, span_id); + + /* Handle init plans */ + foreach(l, planstate->initPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(l); + PlanState *splan = sstate->planstate; + uint64_t init_span_id; + + if (INSTR_TIME_IS_ZERO(splan->instrument->firsttime)) + continue; + init_span_id = create_span_node(splan, 
planstateTraceContext, parent_id, SPAN_NODE_INIT_PLAN, sstate->subplan->plan_name); + generate_span_from_planstate(splan, planstateTraceContext, init_span_id); + } + + /* Handle sub plans */ + foreach(l, planstate->subPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(l); + PlanState *splan = sstate->planstate; + uint64_t subplan_span_id; + + if (INSTR_TIME_IS_ZERO(splan->instrument->firsttime)) + continue; + subplan_span_id = create_span_node(splan, planstateTraceContext, parent_id, SPAN_NODE_SUBPLAN, sstate->subplan->plan_name); + generate_span_from_planstate(splan, planstateTraceContext, subplan_span_id); + } + + /* Handle special nodes with children nodes */ + switch (nodeTag(plan)) + { + case T_Append: + generate_member_nodes(((AppendState *) planstate)->appendplans, + ((AppendState *) planstate)->as_nplans, planstateTraceContext, span_id); + break; + case T_MergeAppend: + generate_member_nodes(((MergeAppendState *) planstate)->mergeplans, + ((MergeAppendState *) planstate)->ms_nplans, planstateTraceContext, span_id); + break; + case T_BitmapAnd: + generate_member_nodes(((BitmapAndState *) planstate)->bitmapplans, + ((BitmapAndState *) planstate)->nplans, planstateTraceContext, span_id); + break; + case T_BitmapOr: + generate_member_nodes(((BitmapOrState *) planstate)->bitmapplans, + ((BitmapOrState *) planstate)->nplans, planstateTraceContext, span_id); + break; + case T_SubqueryScan: + generate_span_from_planstate(((SubqueryScanState *) planstate)->subplan, planstateTraceContext, span_id); + break; + case T_CustomScan: + generate_span_from_custom_scan((CustomScanState *) planstate, planstateTraceContext, span_id); + break; + default: + break; + } + planstateTraceContext->ancestors = list_delete_first(planstateTraceContext->ancestors); +} + +/* + * Add plan counters to the Datum output + */ +static int +add_plan_counters(const PlanCounters * plan_counters, int i, Datum *values) +{ + values[i++] = Float8GetDatumFast(plan_counters->startup_cost); + 
values[i++] = Float8GetDatumFast(plan_counters->total_cost); + values[i++] = Float8GetDatumFast(plan_counters->plan_rows); + values[i++] = Int32GetDatum(plan_counters->plan_width); + return i; +} + +/* + * Add node counters to the Datum output + */ +static int +add_node_counters(const NodeCounters * node_counters, int i, Datum *values) +{ + Datum wal_bytes; + char buf[256]; + double blk_read_time, + blk_write_time, + temp_blk_read_time, + temp_blk_write_time; + double generation_counter, + inlining_counter, + optimization_counter, + emission_counter; + int64 jit_created_functions; + + values[i++] = Int64GetDatumFast(node_counters->rows); + values[i++] = Int64GetDatumFast(node_counters->nloops); + + /* Buffer usage */ + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_hit); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_read); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_dirtied); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_written); + + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_hit); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_read); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_dirtied); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_written); + + blk_read_time = INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.blk_read_time); + blk_write_time = INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.blk_write_time); + temp_blk_read_time = INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.temp_blk_read_time); + temp_blk_write_time = INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.temp_blk_write_time); + + values[i++] = Float8GetDatumFast(blk_read_time); + values[i++] = Float8GetDatumFast(blk_write_time); + values[i++] = Float8GetDatumFast(temp_blk_read_time); + values[i++] = Float8GetDatumFast(temp_blk_write_time); + + values[i++] = 
Int64GetDatumFast(node_counters->buffer_usage.temp_blks_read); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.temp_blks_written); + + /* WAL usage */ + values[i++] = Int64GetDatumFast(node_counters->wal_usage.wal_records); + values[i++] = Int64GetDatumFast(node_counters->wal_usage.wal_fpi); + snprintf(buf, sizeof buf, UINT64_FORMAT, node_counters->wal_usage.wal_bytes); + + /* Convert to numeric. */ + wal_bytes = DirectFunctionCall3(numeric_in, + CStringGetDatum(buf), + ObjectIdGetDatum(0), + Int32GetDatum(-1)); + values[i++] = wal_bytes; + + /* JIT usage */ + generation_counter = INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.generation_counter); + inlining_counter = INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.inlining_counter); + optimization_counter = INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.optimization_counter); + emission_counter = INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.emission_counter); + jit_created_functions = node_counters->jit_usage.created_functions; + + values[i++] = Int64GetDatumFast(jit_created_functions); + values[i++] = Float8GetDatumFast(generation_counter); + values[i++] = Float8GetDatumFast(inlining_counter); + values[i++] = Float8GetDatumFast(optimization_counter); + values[i++] = Float8GetDatumFast(emission_counter); + + return i; +} + +/* + * Build the tuple for a Span and add it to the output + */ +static void +add_result_span(ReturnSetInfo *rsinfo, Span * span, + const char *qbuffer, Size qbuffer_size) +{ + Datum values[PG_TRACING_TRACES_COLS] = {0}; + bool nulls[PG_TRACING_TRACES_COLS] = {0}; + const char *span_name; + const char *operation_name; + const char *sql_error_code; + int i = 0; + + span_name = get_span_name(span, qbuffer, qbuffer_size); + operation_name = get_operation_name(span, qbuffer, qbuffer_size); + sql_error_code = unpack_sql_state(span->sql_error_code); + + Assert(span_name != NULL); + Assert(operation_name != NULL); + Assert(sql_error_code != NULL); + + values[i++] = 
UInt64GetDatum(span->trace_id); + values[i++] = UInt64GetDatum(span->parent_id); + values[i++] = UInt64GetDatum(span->span_id); + values[i++] = UInt64GetDatum(span->query_id); + values[i++] = CStringGetTextDatum(span_name); + values[i++] = CStringGetTextDatum(operation_name); + values[i++] = Int64GetDatumFast(span->start); + values[i++] = Int16GetDatum(span->start_ns); + + values[i++] = Int64GetDatumFast(span->duration_ns); + values[i++] = CStringGetTextDatum(sql_error_code); + values[i++] = UInt32GetDatum(span->be_pid); + values[i++] = UInt8GetDatum(span->nested_level); + values[i++] = UInt8GetDatum(span->subxact_count); + values[i++] = BoolGetDatum(span->is_top_span); + + if ((span->type >= SPAN_NODE && span->type <= SPAN_NODE_UNKNOWN) + || span->type == SPAN_PLANNER) + { + i = add_plan_counters(&span->plan_counters, i, values); + i = add_node_counters(&span->node_counters, i, values); + + values[i++] = Int64GetDatumFast(span->startup); + if (span->parameter_offset != -1 && qbuffer_size > 0 && qbuffer_size > span->parameter_offset) + values[i++] = CStringGetTextDatum(qbuffer + span->parameter_offset); + } + + for (int j = i; j < PG_TRACING_TRACES_COLS; j++) + nulls[j] = 1; + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); +} + +/* + * Return spans as a result set. + * + * Accept a consume parameter. When consume is set, + * we empty the shared buffer and truncate query text. 
+ */ +Datum +pg_tracing_spans(PG_FUNCTION_ARGS) +{ + bool consume; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Span *span; + const char *qbuffer; + Size qbuffer_size = 0; + + consume = PG_GETARG_BOOL(0); + if (!pg_tracing) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("pg_tracing must be loaded via shared_preload_libraries"))); + InitMaterializedSRF(fcinfo, 0); + + /* + * If this query was sampled and we're consuming tracing_spans buffer, the + * spans will target a query string that doesn't exist anymore in the + * query file. Better abort the sampling, clean ongoing traces and remove + * any possible parallel context pushed Since this will be called within + * an ExecutorRun, we will need to check for current_trace_spans at the + * end of the ExecutorRun hook. + */ + cleanup_tracing(); + + qbuffer = qtext_load_file(&qbuffer_size); + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + for (int i = 0; i < shared_spans->end; i++) + { + span = shared_spans->spans + i; + add_result_span(rsinfo, span, qbuffer, qbuffer_size); + } + + /* Consume is set, remove spans from the shared buffer */ + if (consume) + { + shared_spans->end = 0; + /* We only truncate query file if there's no active writers */ + if (s->n_writers == 0) + { + s->extent = 0; + pg_truncate(PG_TRACING_TEXT_FILE, 0); + } + else + s->stats.failed_truncates++; + } + s->stats.last_consume = GetCurrentTimestamp(); + SpinLockRelease(&s->mutex); + } + return (Datum) 0; +} + +/* + * Return statistics of pg_tracing. 
+ */ +Datum +pg_tracing_info(PG_FUNCTION_ARGS) +{ + pgTracingStats stats; + TupleDesc tupdesc; + Datum values[PG_TRACING_INFO_COLS] = {0}; + bool nulls[PG_TRACING_INFO_COLS] = {0}; + int i = 0; + + if (!pg_tracing) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("pg_tracing must be loaded via shared_preload_libraries"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + /* Get a copy of the pg_tracing stats */ + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + stats = s->stats; + SpinLockRelease(&s->mutex); + } + + values[i++] = Int64GetDatum(stats.traces); + values[i++] = Int64GetDatum(stats.spans); + values[i++] = Int64GetDatum(stats.dropped_spans); + values[i++] = Int64GetDatum(stats.failed_truncates); + values[i++] = TimestampTzGetDatum(stats.last_consume); + values[i++] = TimestampTzGetDatum(stats.stats_reset); + values[i++] = Float8GetDatum(pg_tracing_sample_rate); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + +/* + * Get an empty pgTracingStats + */ +static pgTracingStats +get_empty_pg_tracing_stats(void) +{ + pgTracingStats stats; + + stats.traces = 0; + stats.spans = 0; + stats.dropped_spans = 0; + stats.failed_truncates = 0; + stats.last_consume = 0; + stats.stats_reset = GetCurrentTimestamp(); + return stats; +} + +/* + * Reset pg_tracing statistics. + */ +Datum +pg_tracing_reset(PG_FUNCTION_ARGS) +{ + /* + * Reset statistics for pg_tracing since all entries are removed. 
+ */ + pgTracingStats empty_stats = get_empty_pg_tracing_stats(); + + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->stats = empty_stats; + SpinLockRelease(&s->mutex); + } + PG_RETURN_VOID(); +} diff --git a/contrib/pg_tracing/pg_tracing.conf b/contrib/pg_tracing/pg_tracing.conf new file mode 100644 index 0000000000..ad6b5e7c1d --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.conf @@ -0,0 +1 @@ +shared_preload_libraries = 'pg_tracing' diff --git a/contrib/pg_tracing/pg_tracing.control b/contrib/pg_tracing/pg_tracing.control new file mode 100644 index 0000000000..efa78a6610 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.control @@ -0,0 +1,5 @@ +# pg_tracing extension +comment = 'Distributed tracing for postgres' +default_version = '1.0' +module_pathname = '$libdir/pg_tracing' +relocatable = true diff --git a/contrib/pg_tracing/pg_tracing.h b/contrib/pg_tracing/pg_tracing.h new file mode 100644 index 0000000000..63675c4b58 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.h @@ -0,0 +1,43 @@ +/*------------------------------------------------------------------------- + * pg_tracing.h + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing.h + *------------------------------------------------------------------------- + */ +#ifndef _PG_TRACING_H_ +#define _PG_TRACING_H_ + +#include "postgres.h" +#include "span.h" + +#include "storage/s_lock.h" + +/* + * Global statistics for pg_tracing + */ +typedef struct pgTracingStats +{ + int64 traces; /* number of traces processed */ + int64 spans; /* number of spans processed */ + int64 dropped_spans; /* number of dropped spans due to full buffer */ + int64 failed_truncates; /* number of failed query file truncates + * due to active writer */ + TimestampTz last_consume; /* Last time the shared spans buffer was + * consumed */ + TimestampTz stats_reset; /* Last time stats were reset */ +} pgTracingStats; + +/* + * Global shared state + */ +typedef 
struct pgTracingSharedState +{ + slock_t mutex; /* protects shared stats fields and + * shared_spans buffer */ + Size extent; /* current extent of query file */ + int n_writers; /* number of active writers to query file */ + pgTracingStats stats; /* global statistics for pg_tracing */ +} pgTracingSharedState; + +#endif diff --git a/contrib/pg_tracing/pg_tracing_explain.c b/contrib/pg_tracing/pg_tracing_explain.c new file mode 100644 index 0000000000..7f7155d2d3 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing_explain.c @@ -0,0 +1,527 @@ +/*------------------------------------------------------------------------- + * + * pg_tracing_explain.c + * + * + * Copyright (c) 2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing_explain.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pg_tracing_explain.h" + +#include "nodes/makefuncs.h" +#include "utils/builtins.h" +#include "parser/parsetree.h" +#include "nodes/extensible.h" +#include "utils/lsyscache.h" +#include "utils/ruleutils.h" + + +static void +ExplainTargetRel(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, Index rti, StringInfo str) +{ + char *objectname = NULL; + char *namespace = NULL; + RangeTblEntry *rte; + char *refname; + + List *rtable; + + rtable = planstate->state->es_range_table; + rte = rt_fetch(rti, rtable); + + refname = (char *) list_nth(planstateTraceContext->rtable_names, rti - 1); + if (refname == NULL) + refname = rte->eref->aliasname; + + switch (nodeTag(planstate->plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_IndexScan: + case T_IndexOnlyScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_ForeignScan: + case T_CustomScan: + case T_ModifyTable: + /* Assert it's on a real relation */ + Assert(rte->rtekind == RTE_RELATION); + objectname = get_rel_name(rte->relid); + break; + case T_FunctionScan: + { + 
FunctionScan *fscan = (FunctionScan *) planstate->plan; + + /* Assert it's on a RangeFunction */ + Assert(rte->rtekind == RTE_FUNCTION); + + /* + * If the expression is still a function call of a single + * function, we can get the real name of the function. + * Otherwise, punt. (Even if it was a single function call + * originally, the optimizer could have simplified it away.) + */ + if (list_length(fscan->functions) == 1) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) linitial(fscan->functions); + + if (IsA(rtfunc->funcexpr, FuncExpr)) + { + FuncExpr *funcexpr = (FuncExpr *) rtfunc->funcexpr; + Oid funcid = funcexpr->funcid; + + objectname = get_func_name(funcid); + } + } + } + break; + case T_TableFuncScan: + Assert(rte->rtekind == RTE_TABLEFUNC); + objectname = "xmltable"; + break; + case T_ValuesScan: + Assert(rte->rtekind == RTE_VALUES); + break; + case T_CteScan: + /* Assert it's on a non-self-reference CTE */ + Assert(rte->rtekind == RTE_CTE); + Assert(!rte->self_reference); + objectname = rte->ctename; + break; + case T_NamedTuplestoreScan: + Assert(rte->rtekind == RTE_NAMEDTUPLESTORE); + objectname = rte->enrname; + break; + case T_WorkTableScan: + /* Assert it's on a self-reference CTE */ + Assert(rte->rtekind == RTE_CTE); + Assert(rte->self_reference); + objectname = rte->ctename; + break; + default: + break; + } + + appendStringInfoString(str, " on"); + if (namespace != NULL) + appendStringInfo(str, " %s.%s", quote_identifier(namespace), + quote_identifier(objectname)); + else if (objectname != NULL) + appendStringInfo(str, " %s", quote_identifier(objectname)); + if (objectname == NULL || strcmp(refname, objectname) != 0) + appendStringInfo(str, " %s", quote_identifier(refname)); +} + + +static void +ExplainScanTarget(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const Scan *plan, StringInfo str) +{ + ExplainTargetRel(planstateTraceContext, planstate, plan->scanrelid, str); +} + +static void 
+ExplainModifyTarget(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const ModifyTable *plan, StringInfo str) +{ + ExplainTargetRel(planstateTraceContext, planstate, plan->nominalRelation, str); +} + +static void +ExplainIndexScanDetails(Oid indexId, ScanDirection indexorderdir, + StringInfo str) +{ + const char *indexname = get_rel_name(indexId); + + if (ScanDirectionIsBackward(indexorderdir)) + appendStringInfoString(str, " Backward"); + appendStringInfo(str, " using %s", quote_identifier(indexname)); +} + +static void +plan_to_rel_name(const PlanState *planstate, const planstateTraceContext * planstateTraceContext, StringInfo str) +{ + const Plan *plan = planstate->plan; + + switch (nodeTag(plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_SubqueryScan: + case T_FunctionScan: + case T_TableFuncScan: + case T_ValuesScan: + case T_CteScan: + case T_WorkTableScan: + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) plan, str); + break; + case T_ForeignScan: + case T_CustomScan: + if (((Scan *) plan)->scanrelid > 0) + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) plan, str); + break; + case T_IndexScan: + { + IndexScan *indexscan = (IndexScan *) plan; + + ExplainIndexScanDetails(indexscan->indexid, + indexscan->indexorderdir, + str); + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) indexscan, str); + } + break; + case T_IndexOnlyScan: + { + IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; + + ExplainIndexScanDetails(indexonlyscan->indexid, + indexonlyscan->indexorderdir, + str); + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) indexonlyscan, str); + } + break; + case T_BitmapIndexScan: + { + BitmapIndexScan *bitmapindexscan = (BitmapIndexScan *) plan; + const char *indexname = get_rel_name(bitmapindexscan->indexid); + + appendStringInfo(str, " on %s", + quote_identifier(indexname)); + } + break; + 
case T_ModifyTable: + ExplainModifyTarget(planstateTraceContext, planstate, (ModifyTable *) plan, str); + break; + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + { + const char *jointype; + + switch (((Join *) plan)->jointype) + { + case JOIN_INNER: + jointype = "Inner"; + break; + case JOIN_LEFT: + jointype = "Left"; + break; + case JOIN_FULL: + jointype = "Full"; + break; + case JOIN_RIGHT: + jointype = "Right"; + break; + case JOIN_SEMI: + jointype = "Semi"; + break; + case JOIN_ANTI: + jointype = "Anti"; + break; + case JOIN_RIGHT_ANTI: + jointype = "Right Anti"; + break; + default: + jointype = "???"; + break; + } + if (((Join *) plan)->jointype != JOIN_INNER) + appendStringInfo(str, " %s Join", jointype); + else if (!IsA(plan, NestLoop)) + appendStringInfoString(str, " Join"); + } + break; + case T_SetOp: + { + const char *setopcmd; + + switch (((SetOp *) plan)->cmd) + { + case SETOPCMD_INTERSECT: + setopcmd = "Intersect"; + break; + case SETOPCMD_INTERSECT_ALL: + setopcmd = "Intersect All"; + break; + case SETOPCMD_EXCEPT: + setopcmd = "Except"; + break; + case SETOPCMD_EXCEPT_ALL: + setopcmd = "Except All"; + break; + default: + setopcmd = "???"; + break; + } + appendStringInfo(str, " %s", setopcmd); + } + break; + default: + break; + } +} + +static void +add_scan_qual(StringInfo str, const PlanState *planstate, List *qual, + const char *qlabel, List *ancestors, List *deparse_ctx, + bool useprefix) +{ + List *context; + char *exprstr; + Node *node; + + if (qual == NIL) + return; + + node = (Node *) make_ands_explicit(qual); + + /* Set up deparsing context */ + context = set_deparse_context_plan(deparse_ctx, + planstate->plan, + ancestors); + + /* Deparse the expression */ + appendStringInfoString(str, "|"); + exprstr = deparse_expression(node, context, useprefix, false); + appendStringInfoString(str, qlabel); + appendStringInfoString(str, exprstr); +} + +char const * +plan_to_operation(const planstateTraceContext * planstateTraceContext, const 
PlanState *planstate, const char *spanName) +{ + StringInfo operation_name = makeStringInfo(); + Plan const *plan = planstate->plan; + List *ancestors = planstateTraceContext->ancestors; + List *deparse_ctx = planstateTraceContext->deparse_ctx; + + if (plan->parallel_aware) + appendStringInfoString(operation_name, "Parallel "); + if (plan->async_capable) + appendStringInfoString(operation_name, "Async "); + appendStringInfoString(operation_name, spanName); + + plan_to_rel_name(planstate, planstateTraceContext, operation_name); + + switch (nodeTag(plan)) + { + case T_IndexScan: + add_scan_qual(operation_name, planstate, ((IndexScan *) plan)->indexqualorig, "Index Cond: ", + ancestors, deparse_ctx, false); + break; + case T_IndexOnlyScan: + add_scan_qual(operation_name, planstate, ((IndexOnlyScan *) plan)->indexqual, "Index Cond: ", + ancestors, deparse_ctx, false); + break; + case T_BitmapIndexScan: + add_scan_qual(operation_name, planstate, ((BitmapIndexScan *) plan)->indexqualorig, "Index Cond: ", + ancestors, deparse_ctx, false); + break; + case T_BitmapHeapScan: + add_scan_qual(operation_name, planstate, ((BitmapHeapScan *) plan)->bitmapqualorig, "Recheck Cond: ", + ancestors, deparse_ctx, false); + break; + case T_SeqScan: + case T_ValuesScan: + case T_CteScan: + case T_NamedTuplestoreScan: + case T_WorkTableScan: + case T_SubqueryScan: + case T_Gather: + case T_GatherMerge: + case T_FunctionScan: + case T_TableFuncScan: + case T_TidRangeScan: + case T_ForeignScan: + case T_CustomScan: + case T_Agg: + case T_WindowAgg: + case T_Group: + case T_Result: + add_scan_qual(operation_name, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + case T_NestLoop: + add_scan_qual(operation_name, planstate, ((NestLoop *) plan)->join.joinqual, "Join Filter : ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + case T_MergeJoin: + 
add_scan_qual(operation_name, planstate, ((MergeJoin *) plan)->mergeclauses, "Merge Cond: ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, ((MergeJoin *) plan)->join.joinqual, "Join Filter: ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + case T_HashJoin: + add_scan_qual(operation_name, planstate, ((HashJoin *) plan)->hashclauses, "Hash Cond: ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, ((HashJoin *) plan)->join.joinqual, "Join Filter: ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + default: + break; + } + + return operation_name->data; +} + +const char * +plan_to_span_name(const Plan *plan) +{ + const char *custom_name; + + switch (nodeTag(plan)) + { + case T_Result: + return "Result"; + case T_ProjectSet: + return "ProjectSet"; + case T_ModifyTable: + switch (((ModifyTable *) plan)->operation) + { + case CMD_INSERT: + return "Insert"; + case CMD_UPDATE: + return "Update"; + case CMD_DELETE: + return "Delete"; + case CMD_MERGE: + return "Merge"; + default: + return "???"; + } + case T_Append: + return "Append"; + case T_MergeAppend: + return "MergeAppend"; + case T_RecursiveUnion: + return "RecursiveUnion"; + case T_BitmapAnd: + return "BitmapAnd"; + case T_BitmapOr: + return "BitmapOr"; + case T_NestLoop: + return "NestedLoop"; + case T_MergeJoin: + return "Merge"; /* "Join" gets added by jointype switch */ + case T_HashJoin: + return "Hash"; /* "Join" gets added by jointype switch */ + case T_SeqScan: + return "SeqScan"; + case T_SampleScan: + return "SampleScan"; + case T_Gather: + return "Gather"; + case T_GatherMerge: + return "GatherMerge"; + case T_IndexScan: + return "IndexScan"; + case T_IndexOnlyScan: + return "IndexOnlyScan"; + case T_BitmapIndexScan: + return "BitmapIndexScan"; + case
T_BitmapHeapScan: + return "BitmapHeapScan"; + case T_TidScan: + return "TidScan"; + case T_TidRangeScan: + return "TidRangeScan"; + case T_SubqueryScan: + return "SubqueryScan"; + case T_FunctionScan: + return "FunctionScan"; + case T_TableFuncScan: + return "TableFunctionScan"; + case T_ValuesScan: + return "ValuesScan"; + case T_CteScan: + return "CTEScan"; + case T_NamedTuplestoreScan: + return "NamedTuplestoreScan"; + case T_WorkTableScan: + return "WorkTableScan"; + case T_ForeignScan: + switch (((ForeignScan *) plan)->operation) + { + case CMD_SELECT: + return "ForeignScan"; + case CMD_INSERT: + return "ForeignInsert"; + case CMD_UPDATE: + return "ForeignUpdate"; + case CMD_DELETE: + return "ForeignDelete"; + default: + return "???"; + } + case T_CustomScan: + custom_name = ((CustomScan *) plan)->methods->CustomName; + if (custom_name) + return psprintf("CustomScan (%s)", custom_name); + else + return "CustomScan"; + case T_Material: + return "Materialize"; + case T_Memoize: + return "Memoize"; + case T_Sort: + return "Sort"; + case T_IncrementalSort: + return "IncrementalSort"; + case T_Group: + return "Group"; + case T_Agg: + { + Agg *agg = (Agg *) plan; + + switch (agg->aggstrategy) + { + case AGG_PLAIN: + return "Aggregate"; + case AGG_SORTED: + return "GroupAggregate"; + case AGG_HASHED: + return "HashAggregate"; + case AGG_MIXED: + return "MixedAggregate"; + default: + return "Aggregate ???"; + } + } + case T_WindowAgg: + return "WindowAgg"; + case T_Unique: + return "Unique"; + case T_SetOp: + switch (((SetOp *) plan)->strategy) + { + case SETOP_SORTED: + return "SetOp"; + case SETOP_HASHED: + return "HashSetOp"; + default: + return "SetOp ???"; + } + case T_LockRows: + return "LockRows"; + case T_Limit: + return "Limit"; + case T_Hash: + return "Hash"; + default: + return "???"; + } +} diff --git a/contrib/pg_tracing/pg_tracing_explain.h b/contrib/pg_tracing/pg_tracing_explain.h new file mode 100644 index 0000000000..6551e15d61 --- /dev/null +++ 
b/contrib/pg_tracing/pg_tracing_explain.h @@ -0,0 +1,20 @@ +#ifndef _PG_TRACING_EXPLAIN_H_ +#define _PG_TRACING_EXPLAIN_H_ + +#include "span.h" +#include "executor/execdesc.h" + +/* Context needed when generating spans from planstate */ +typedef struct planstateTraceContext +{ + pgTracingTrace *trace; + int sql_error_code; + List *ancestors; + List *deparse_ctx; + List *rtable_names; +} planstateTraceContext; + +extern const char *plan_to_span_name(const Plan *plan); +extern const char *plan_to_operation(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const char *spanName); + +#endif diff --git a/contrib/pg_tracing/query_process.c b/contrib/pg_tracing/query_process.c new file mode 100644 index 0000000000..0d6e93bbe5 --- /dev/null +++ b/contrib/pg_tracing/query_process.c @@ -0,0 +1,473 @@ +#include "query_process.h" +#include "pg_tracing.h" + +#include +#include +#include "parser/scanner.h" +#include "nodes/extensible.h" + +/* + * Initialize traceparent parameter fields + */ +void +initialize_traceparent(pgTracingTrace * traceparent_parameter) +{ + traceparent_parameter->sampled = 0; + traceparent_parameter->trace_id = 0; + traceparent_parameter->parent_id = 0; +} + +/* + * Extract traceparent parameters from SQLCommenter + * + * We're expecting the query to start with a SQLComment containing the + * traceparent parameter + * "/\*traceparent='00-00000000000000000000000000000009-0000000000000005-01'*\/ + * SELECT 1;" + * Traceparent has the following format: version-traceid-parentid-sampled + * We also accept SQLCommenter as a parameter. In this case, we don't have + * the start and end of comments. + * + * if is_parameter is true, the SQLComment is passed as a query parameter and + * won't have the comment start and end. 
+ */ +void +extract_traceparent(pgTracingTrace * pgTracingTrace, const char *sqlcomment_str, bool is_parameter) +{ + const char *expected_start = "/*"; + const char *traceparent; + const char *end_sqlcomment; + char *endptr; + + if (!is_parameter) + { + /* + * Look for the start of a comment We're expecting the comment to be + * at the very start of the query + */ + for (size_t i = 0; i < strlen(expected_start); i++) + { + if (sqlcomment_str[i] != expected_start[i]) + return; + } + + /* + * Check that we have an end + */ + end_sqlcomment = strstr(sqlcomment_str, "*/"); + if (end_sqlcomment == NULL) + return; + } + else + end_sqlcomment = sqlcomment_str + strlen(sqlcomment_str); + + /* + * Locate traceparent parameter and make sure it has the expected size + * "traceparent" + "=" + '' -> 13 characters + * 00-00000000000000000000000000000009-0000000000000005-01 -> 55 + * characters + */ + traceparent = strstr(sqlcomment_str, "traceparent='"); + if (traceparent == NULL || traceparent > end_sqlcomment || end_sqlcomment - traceparent < 55 + 13) + return; + + /* + * Move to the start of the traceparent values + */ + traceparent = traceparent + 13; + + /* + * Check that '-' are at the expected places + */ + if (traceparent[2] != '-' || traceparent[35] != '-' || traceparent[52] != '-') + return; + + /* + * Parse traceparent parameters + */ + errno = 0; + pgTracingTrace->trace_id = strtol(&traceparent[3], &endptr, 16); + if (endptr != traceparent + 35 || errno) + return; + pgTracingTrace->parent_id = strtol(&traceparent[36], &endptr, 16); + if (endptr != traceparent + 52 || errno) + return; + pgTracingTrace->sampled = strtol(&traceparent[53], &endptr, 16); + if (endptr != traceparent + 55 || errno) + + /* + * Just to be sure, reset sampled on error + */ + pgTracingTrace->sampled = 0; +} + + +/* + * comp_location: comparator for qsorting LocationLen structs by location + */ +static int +comp_location(const void *a, const void *b) +{ + int l = ((const LocationLen *) 
a)->location; + int r = ((const LocationLen *) b)->location; + + if (l < r) + return -1; + else if (l > r) + return +1; + else + return 0; +} + + +/* + * Normalise query and fill paramStr. + * Normalised query will separate tokens with a single space and + * parameters are replaced by $1, $2... + * Parameters are put in the paramStr which will contain all parameters values + * using the format: "$1 = 0, $2 = 'v'" + */ +const char * +normalise_query_parameters(const JumbleState *jstate, const char *query, + int query_loc, int *query_len_p, char **paramStr) +{ + char *norm_query; + int query_len = *query_len_p; + int norm_query_buflen, /* Space allowed for norm_query */ + n_quer_loc = 0; + LocationLen *locs; + core_yyscan_t yyscanner; + core_yy_extra_type yyextra; + core_YYSTYPE yylval; + YYLTYPE yylloc; + int current_loc = 0; + StringInfoData buf; + + initStringInfo(&buf); + + norm_query_buflen = query_len + jstate->clocations_count * 10; + Assert(norm_query_buflen > 0); + locs = jstate->clocations; + + /* Allocate result buffer */ + norm_query = palloc(norm_query_buflen + 1); + + if (jstate->clocations_count > 1) + qsort(jstate->clocations, jstate->clocations_count, + sizeof(LocationLen), comp_location); + + /* initialize the flex scanner --- should match raw_parser() */ + yyscanner = scanner_init(query + query_loc, + &yyextra, + &ScanKeywords, + ScanKeywordTokens); + + for (;;) + { + /* + * Don't read past the end of locs: once every parameter has been + * consumed, current_loc == clocations_count while tokens may remain + * (e.g. a trailing ORDER BY clause). + */ + int loc = (current_loc < jstate->clocations_count) ? locs[current_loc].location : -1; + int tok; + + loc -= query_loc; + + tok = core_yylex(&yylval, &yylloc, yyscanner); + + /* + * We should not hit end-of-string, but if we do, behave sanely + */ + if (tok == 0) + break; /* out of inner for-loop */ + if (yylloc > query_len) + break; + + /* + * We should find the token position exactly, but if we somehow run + * past it, work with that. + */ + if (current_loc < jstate->clocations_count && yylloc >= loc) + { + appendStringInfo(&buf, + "%s$%d = ", + current_loc > 0 ? 
", " : "", + current_loc + 1); + if (query[loc] == '-') + { + /* + * It's a negative value - this is the one and only case where + * we replace more than a single token. + * + * Do not compensate for the core system's special-case + * adjustment of location to that of the leading '-' operator + * in the event of a negative constant. It is also useful for + * our purposes to start from the minus symbol. In this way, + * queries like "select * from foo where bar = 1" and "select * + * from foo where bar = -2" will have identical normalized + * query strings. + */ + appendStringInfoChar(&buf, '-'); + tok = core_yylex(&yylval, &yylloc, yyscanner); + if (tok == 0) + break; /* out of inner for-loop */ + } + if (yylloc > 0 && yyextra.scanbuf[yylloc - 1] == ' ' && n_quer_loc > 0) + { + norm_query[n_quer_loc++] = ' '; + } + + /* + * Append the current parameter $x in the normalised query + */ + n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d", + current_loc + 1 + jstate->highest_extern_param_id); + + appendStringInfoString(&buf, yyextra.scanbuf + yylloc); + + current_loc++; + } + else + { + int to_copy; + + if (yylloc > 0 && yyextra.scanbuf[yylloc - 1] == ' ' && n_quer_loc > 0) + { + norm_query[n_quer_loc++] = ' '; + } + to_copy = strlen(yyextra.scanbuf + yylloc); + Assert(n_quer_loc + to_copy < norm_query_buflen + 1); + memcpy(norm_query + n_quer_loc, yyextra.scanbuf + yylloc, to_copy); + n_quer_loc += to_copy; + } + } + scanner_finish(yyscanner); + + *query_len_p = n_quer_loc; + norm_query[n_quer_loc] = '\0'; + *paramStr = buf.data; + return norm_query; +} + +/* + * Normalise query: tokens will be separated by a single space + */ +const char * +normalise_query(const char *query, int query_loc, int *query_len_p) +{ + char *norm_query; + int query_len = *query_len_p; + int norm_query_buflen = query_len; + int n_quer_loc = 0; + core_yyscan_t yyscanner; + core_yy_extra_type yyextra; + core_YYSTYPE yylval; + YYLTYPE yylloc; + + /* Allocate result buffer */ + norm_query = 
palloc(norm_query_buflen + 2); + + /* initialize the flex scanner --- should match raw_parser() */ + yyscanner = scanner_init(query + query_loc, &yyextra, &ScanKeywords, ScanKeywordTokens); + for (;;) + { + int tok; + int to_copy; + + tok = core_yylex(&yylval, &yylloc, yyscanner); + + if (tok == 0) + break; /* out of inner for-loop */ + if (yylloc > query_len) + break; + + if (yylloc > 0 && yyextra.scanbuf[yylloc - 1] == ' ' && n_quer_loc > 0) + { + norm_query[n_quer_loc++] = ' '; + } + to_copy = strlen(yyextra.scanbuf + yylloc); + Assert(n_quer_loc + to_copy < norm_query_buflen + 2); + memcpy(norm_query + n_quer_loc, yyextra.scanbuf + yylloc, to_copy); + n_quer_loc += to_copy; + } + scanner_finish(yyscanner); + + *query_len_p = n_quer_loc; + norm_query[n_quer_loc] = '\0'; + return norm_query; +} + +/* + * Store text in the pg_tracing stat file + */ +bool +text_store_file(pgTracingSharedState * pg_tracing, const char *text, int text_len, + Size *query_offset) +{ + Size off; + int fd; + + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + off = s->extent; + s->extent += text_len + 1; + s->n_writers++; + SpinLockRelease(&s->mutex); + } + + /* + * Don't allow the file to grow larger than what qtext_load_file can + * (theoretically) handle. This has been seen to be reachable on 32-bit + * platforms. 
+ */ + if (unlikely(text_len >= MaxAllocHugeSize - off)) + { + errno = EFBIG; /* not quite right, but it'll do */ + fd = -1; + goto error; + } + + /* Now write the data into the successfully-reserved part of the file */ + fd = OpenTransientFile(PG_TRACING_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) + goto error; + + if (pg_pwrite(fd, text, text_len, off) != text_len) + goto error; + if (pg_pwrite(fd, "\0", 1, off + text_len) != 1) + goto error; + + CloseTransientFile(fd); + + /* Mark our write complete */ + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->n_writers--; + SpinLockRelease(&s->mutex); + } + + /* + * Set offset once write was succesful + */ + *query_offset = off; + + return true; + +error: + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + + if (fd >= 0) + CloseTransientFile(fd); + + /* Mark our write complete */ + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->n_writers--; + SpinLockRelease(&s->mutex); + } + + return false; +} + +/* + * Read the external query text file into a malloc'd buffer. + * + * Returns NULL (without throwing an error) if unable to read, eg file not + * there or insufficient memory. + * + * On success, the buffer size is also returned into *buffer_size. + * + * This can be called without any lock on pgss->lock, but in that case the + * caller is responsible for verifying that the result is sane. 
+ */ +const char * +qtext_load_file(Size *buffer_size) +{ + char *buf; + int fd; + struct stat stat; + Size nread; + + fd = OpenTransientFile(PG_TRACING_TEXT_FILE, O_RDONLY | PG_BINARY); + if (fd < 0) + { + if (errno != ENOENT) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + return NULL; + } + + /* Get file length */ + if (fstat(fd, &stat)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + CloseTransientFile(fd); + return NULL; + } + + /* Allocate buffer; beware that off_t might be wider than size_t */ + if (stat.st_size <= MaxAllocHugeSize) + buf = (char *) malloc(stat.st_size); + else + buf = NULL; + if (buf == NULL) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Could not allocate enough memory to read file \"%s\".", + PG_TRACING_TEXT_FILE))); + CloseTransientFile(fd); + return NULL; + } + + /* + * OK, slurp in the file. Windows fails if we try to read more than + * INT_MAX bytes at once, and other platforms might not like that either, + * so read a very large file in 1GB segments. + */ + nread = 0; + while (nread < stat.st_size) + { + int toread = Min(1024 * 1024 * 1024, stat.st_size - nread); + + /* + * If we get a short read and errno doesn't get set, the reason is + * probably that garbage collection truncated the file since we did + * the fstat(), so we don't log a complaint --- but we don't return + * the data, either, since it's most likely corrupt due to concurrent + * writes from garbage collection. 
+ */ + errno = 0; + if (read(fd, buf + nread, toread) != toread) + { + if (errno) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + free(buf); + CloseTransientFile(fd); + return NULL; + } + nread += toread; + } + + if (CloseTransientFile(fd) != 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", PG_TRACING_TEXT_FILE))); + + *buffer_size = nread; + return buf; +} diff --git a/contrib/pg_tracing/query_process.h b/contrib/pg_tracing/query_process.h new file mode 100644 index 0000000000..04328b8cf2 --- /dev/null +++ b/contrib/pg_tracing/query_process.h @@ -0,0 +1,29 @@ +#ifndef _QUERY_PROCESS_H_ +#define _QUERY_PROCESS_H_ + +#include "pg_tracing.h" +#include "nodes/queryjumble.h" +#include "parser/parse_node.h" + +/* + * Normalise query: - Comments are removed - Constants are replaced by $x - + * All tokens are separated by a single space + */ +extern const char *normalise_query_parameters(const JumbleState *jstate, const char *query, + int query_loc, int *query_len_p, char **paramStr); + +extern void extract_traceparent(pgTracingTrace * pgTracingTrace, const char *query_str, bool is_parameter); + +/* + * Normalise simple query + */ +extern const char *normalise_query(const char *query, int query_loc, int *query_len_p); +extern bool text_store_file(pgTracingSharedState * pg_tracing, const char *query, + int query_len, Size *query_offset); +extern const char *qtext_load_file(Size *buffer_size); +extern void initialize_traceparent(pgTracingTrace * traceparent_parameter); + +/* Location of external text file */ +#define PG_TRACING_TEXT_FILE PG_STAT_TMP_DIR "/pg_tracing.stat" + +#endif diff --git a/contrib/pg_tracing/span.c b/contrib/pg_tracing/span.c new file mode 100644 index 0000000000..b44b38ce8c --- /dev/null +++ b/contrib/pg_tracing/span.c @@ -0,0 +1,295 @@ +#include "span.h" + +#include "nodes/extensible.h" 
+#include "common/pg_prng.h" + + +/* + * Convert a node CmdType to the matching SpanType + */ +SpanType +command_type_to_span_type(CmdType cmd_type) +{ + switch (cmd_type) + { + case CMD_SELECT: + return SPAN_NODE_SELECT; + case CMD_INSERT: + return SPAN_NODE_INSERT; + case CMD_UPDATE: + return SPAN_NODE_UPDATE; + case CMD_DELETE: + return SPAN_NODE_DELETE; + case CMD_MERGE: + return SPAN_NODE_MERGE; + case CMD_UTILITY: + return SPAN_NODE_UTILITY; + case CMD_NOTHING: + return SPAN_NODE_NOTHING; + case CMD_UNKNOWN: + return SPAN_NODE_UNKNOWN; + } + return SPAN_NODE_UNKNOWN; +} + +/* + * Initialize span fields + */ +static void +initialize_span_fields(Span * span, SpanType type, const pgTracingTrace * trace, uint64 parent_id, uint64 query_id, int nested_level) +{ + span->trace_id = trace->trace_id; + span->type = type; + + /* + * If parent id is unset, it means that there's no propagated trace + * informations from the caller. In this case, this is the top span, + * span_id == parent_id and we reuse the generated trace id for the span + * id. + */ + if (parent_id == 0) + { + span->parent_id = trace->trace_id; + span->span_id = trace->trace_id; + } + else + { + span->parent_id = parent_id; + span->span_id = pg_prng_uint64(&pg_global_prng_state); + } + span->name_offset = -1; + span->operation_name_offset = -1; + span->parameter_offset = -1; + span->sql_error_code = 0; + span->startup = 0; + span->be_pid = MyProcPid; + span->subxact_count = MyProc->subxidStatus.count; + span->start_ns_time = 0; + span->duration_ns = 0; + span->ended = false; + span->is_top_span = false; + span->query_id = query_id; + span->nested_level = nested_level; + memset(&span->node_counters, 0, sizeof(NodeCounters)); + memset(&span->plan_counters, 0, sizeof(PlanCounters)); + + /* + * Store the starting buffer for planner and process utility spans + * + * TODO: Do we need wal buffer for planner? 
+ */ + if (type == SPAN_PLANNER || span->type == SPAN_PROCESS_UTILITY) + { + /* TODO Do we need to track wal for planner? */ + span->node_counters.buffer_usage = pgBufferUsage; + span->node_counters.wal_usage = pgWalUsage; + } +} + +/* + * Initialize span and set span starting time. + * + * A span needs a start timestamp and a duration. + * Given that some spans can have a very short duration (less than 1us), we + * need to rely on monotonic clock as much as possible to have the best precision. + * + * For that, after we've established that a query was sampled, we get both the wall clock + * and the monotonic clock at the start of a trace. They are both stored in pgTracingTrace. + * All subsequent times will use the monotonic clock. + * When we need to get the start of a span, we use the starting wall clock as a starting point + * and add the time between the two monotonic clocks. + * + * start_span is an optional nanosecond time to set for. If nothing is provided, we use the current monotonic clock. + * This parameter is mostly used when generating spans from planstate as we need to rely on the query instrumentation to find the node start. 
+ */ +void +begin_span(Span * span, SpanType type, const pgTracingTrace * trace, uint64 parent_id, uint64 query_id, const int64 *start_span, int nested_level) +{ + int64 start_ns_time; + int64 ns_since_trace_start; + + initialize_span_fields(span, type, trace, parent_id, query_id, nested_level); + + /* If no start span is provided, get the current one */ + if (start_span == NULL) + start_ns_time = get_current_ns(); + else + start_ns_time = *start_span; + + /* We use duration to temporarily store ns at span start ns */ + span->start_ns_time = start_ns_time; + + /* Get the ns between trace start and span start */ + ns_since_trace_start = (start_ns_time - trace->start_trace.ns); + + /* Fill span starts */ + span->start = trace->start_trace.ts + ns_since_trace_start / NS_PER_US; + span->start_ns = ns_since_trace_start % NS_PER_US; +} + +/* + * Set span duration and accumulated buffers + * end_span_input is optional, if NULL is passed, we use + * the current time + */ +void +end_span(Span * span, const int64 *end_span_input) +{ + BufferUsage buffer_usage; + WalUsage wal_usage; + int64 end_span_ns; + + Assert(span->start_ns_time > 0); + Assert(span->trace_id > 0); + + /* Span should be ended only once */ + Assert(span->ended == false); + span->ended = true; + + /* Set span duration with the end time before substrating the start */ + if (end_span_input == NULL) + end_span_ns = get_current_ns(); + else + end_span_ns = *end_span_input; + span->duration_ns = end_span_ns - span->start_ns_time; + + if (span->type == SPAN_PLANNER || span->type == SPAN_PROCESS_UTILITY) + { + /* calc differences of buffer counters. */ + memset(&buffer_usage, 0, sizeof(BufferUsage)); + BufferUsageAccumDiff(&buffer_usage, &pgBufferUsage, + &span->node_counters.buffer_usage); + span->node_counters.buffer_usage = buffer_usage; + + /* calc differences of WAL counters. 
*/ + memset(&wal_usage, 0, sizeof(wal_usage)); + WalUsageAccumDiff(&wal_usage, &pgWalUsage, + &span->node_counters.wal_usage); + span->node_counters.wal_usage = wal_usage; + } +} + +/* + * Get the name of a span. + * If it is a node span, the name may be pulled from the stat file. + */ +const char * +get_span_name(const Span * span, const char *qbuffer, Size qbuffer_size) +{ + /* The offset must lie strictly inside the buffer: offset == qbuffer_size would read one byte past the end */ + if (span->name_offset != -1 && qbuffer_size > 0 && span->name_offset < qbuffer_size) + return qbuffer + span->name_offset; + + switch (span->type) + { + case SPAN_POST_PARSE: + return "Post Parse"; + case SPAN_PARSE: + return "Parse"; + case SPAN_PLANNER: + return "Planner"; + case SPAN_FUNCTION: + return "Function"; + case SPAN_PROCESS_UTILITY: + return "ProcessUtility"; + case SPAN_EXECUTOR_START: + return "Executor"; + case SPAN_EXECUTOR_RUN: + return "Executor"; + case SPAN_EXECUTOR_END: + return "Executor"; + case SPAN_EXECUTOR_FINISH: + return "Executor"; + + case SPAN_NODE_INIT_PLAN: + return "InitPlan"; + case SPAN_NODE_SUBPLAN: + return "SubPlan"; + + case SPAN_NODE_SELECT: + return "Select"; + case SPAN_NODE_INSERT: + return "Insert"; + case SPAN_NODE_UPDATE: + return "Update"; + case SPAN_NODE_DELETE: + return "Delete"; + case SPAN_NODE_MERGE: + return "Merge"; + case SPAN_NODE_UTILITY: + return "Utility"; + case SPAN_NODE_NOTHING: + return "Nothing"; + case SPAN_NODE_UNKNOWN: + return "Unknown"; + case SPAN_NODE: + return "Node"; + } + return "???"; +} + +/* + * Get the operation of a span. + * For node span, the name may be pulled from the stat file. 
+ */ +const char * +get_operation_name(const Span * span, const char *qbuffer, Size qbuffer_size) +{ + if (span->operation_name_offset != -1 && qbuffer_size > 0 && span->operation_name_offset <= qbuffer_size) + return qbuffer + span->operation_name_offset; + + switch (span->type) + { + case SPAN_POST_PARSE: + return "Post Parse"; + case SPAN_PARSE: + return "Parse"; + case SPAN_PLANNER: + return "Planner"; + case SPAN_FUNCTION: + return "Function"; + case SPAN_PROCESS_UTILITY: + return "ProcessUtility"; + case SPAN_EXECUTOR_START: + return "Start"; + case SPAN_EXECUTOR_RUN: + return "Run"; + case SPAN_EXECUTOR_END: + return "End"; + case SPAN_EXECUTOR_FINISH: + return "Finish"; + case SPAN_NODE_SELECT: + case SPAN_NODE_INSERT: + case SPAN_NODE_UPDATE: + case SPAN_NODE_DELETE: + case SPAN_NODE_MERGE: + case SPAN_NODE_UTILITY: + case SPAN_NODE_NOTHING: + case SPAN_NODE_INIT_PLAN: + case SPAN_NODE_SUBPLAN: + case SPAN_NODE_UNKNOWN: + case SPAN_NODE: + return "Node"; + } + return "Unknown type"; +} + +/* +* Get the current nanoseconds value +*/ +int64 +get_current_ns(void) +{ + instr_time t; + + INSTR_TIME_SET_CURRENT(t); + return INSTR_TIME_GET_NANOSEC(t); +} + +/* +* Get the end of a span in nanoseconds +*/ +int64 +get_span_end_ns(const Span * span) +{ + return span->start_ns_time + span->duration_ns; +} diff --git a/contrib/pg_tracing/span.h b/contrib/pg_tracing/span.h new file mode 100644 index 0000000000..e9df64862f --- /dev/null +++ b/contrib/pg_tracing/span.h @@ -0,0 +1,159 @@ +/*------------------------------------------------------------------------- + * contrib/pg_tracing/span.h + * + * Header for span. 
+ * + * IDENTIFICATION + * contrib/pg_tracing/span.h + * + *------------------------------------------------------------------------- + */ +#ifndef _SPAN_H_ +#define _SPAN_H_ + +#include "postgres.h" + +#include "jit/jit.h" +#include "pgstat.h" +#include "access/transam.h" + +/* + * SpanType: Type of the span + */ +typedef enum +{ + SPAN_PARSE, /* Wraps query parsing */ + SPAN_POST_PARSE, /* Wraps post parsing */ + SPAN_PLANNER, /* Wraps planner execution in planner hook */ + SPAN_FUNCTION, /* Wraps function in fmgr hook */ + SPAN_PROCESS_UTILITY, /* Wraps ProcessUtility execution */ + + SPAN_EXECUTOR_START, /* Executor Spans wrapping the matching hooks */ + SPAN_EXECUTOR_RUN, + SPAN_EXECUTOR_END, + SPAN_EXECUTOR_FINISH, + + SPAN_NODE_INIT_PLAN, + SPAN_NODE_SUBPLAN, + + SPAN_NODE, /* Represents a node execution, generated from + * planstate */ + + SPAN_NODE_SELECT, /* Query Span types. They are created from the + * query cmdType */ + SPAN_NODE_INSERT, + SPAN_NODE_UPDATE, + SPAN_NODE_DELETE, + SPAN_NODE_MERGE, + SPAN_NODE_UTILITY, + SPAN_NODE_NOTHING, + SPAN_NODE_UNKNOWN, +} SpanType; + + +/* + * Counters extracted from query instrumentation + */ +typedef struct NodeCounters +{ + int64 rows; /* # of tuples processed */ + int64 nloops; /* # of cycles for this node */ + + BufferUsage buffer_usage; /* total buffer usage for this node */ + WalUsage wal_usage; /* total WAL usage for this node */ + JitInstrumentation jit_usage; /* total JIT usage for this node */ +} NodeCounters; + +/* + * Counters extracted from query's plan + */ +typedef struct PlanCounters +{ + /* + * estimated execution costs for plan (see costsize.c for more info) + */ + Cost startup_cost; /* cost expended before fetching any tuples */ + Cost total_cost; /* total cost (assuming all tuples fetched) */ + + /* + * planner's estimate of result size of this plan step + */ + Cardinality plan_rows; /* number of rows plan is expected to emit */ + int plan_width; /* average row width in bytes */ +} 
PlanCounters; + +/* + * Start time of a trace. We store both wall clock and monotonic clock. + */ +typedef struct +{ + TimestampTz ts; /* Wall clock at the start of the trace */ + int64 ns; /* Monotonic clock at the start of the trace */ +} pgTracingStartTime; + +typedef struct pgTracingTrace +{ + uint64 trace_id; /* Id of the trace */ + uint64 parent_id; /* Span id of the parent */ + int sampled; /* Is current statement sampled? */ + uint64 query_id; /* Query id of the current statement */ + pgTracingStartTime start_trace; +} pgTracingTrace; + +/* + * The Span data structure represents an operation with a start, a duration + * and metadatas. + */ +typedef struct Span +{ + uint64 trace_id; /* Trace id extracted from the SQLCommenter's + * traceparent */ + uint64 span_id; /* Span Identifier generated from a random + * uint64 */ + uint64 parent_id; /* Span's parent id. For the top span, it's + * extracted from SQLCommenter's traceparent. + * For other spans, we pass the parent's span. */ + uint64 query_id; /* QueryId of the trace query if available */ + + TimestampTz start; /* Start of the span. */ + uint16 start_ns; /* Leftover nanoseconds of span start */ + int64 start_ns_time; /* Nanoseconds at start of the span. */ + + int64 duration_ns; /* Duration of the span in nanoseconds. */ + SpanType type; /* Type of the span. Used to generate the + * span's name for all spans except SPAN_NODE. */ + int be_pid; /* Pid of the backend process */ + bool is_top_span; + bool ended; /* Track if the span was already ended */ + uint8 nested_level; /* Only used for debugging */ + uint8 subxact_count; /* Active count of backend's subtransaction */ + + /* + * We store variable size metadata in an external file. Those represent + * the position of NULL terminated strings in the file. Set to -1 if + * unused. 
+ */ + Size name_offset; /* span name offset in external file */ + Size operation_name_offset; /* operation name offset in external + * file */ + Size parameter_offset; /* parameters offset in external file */ + + PlanCounters plan_counters; /* Counters with plan costs */ + NodeCounters node_counters; /* Counters with node costs (jit, wal, + * buffers) */ + int64 startup; /* Time to the first tuple */ + int sql_error_code; /* query error code extracted from ErrorData, + * 0 if query was successful */ +} Span; + +extern void begin_span(Span * span, SpanType type, const pgTracingTrace * trace, uint64 parent_id, uint64 query_id, const int64 *start_span, int nested_level); +extern void end_span(Span * span, const int64 *end_time); + +extern SpanType command_type_to_span_type(CmdType cmd_type); +extern const char *get_span_name(const Span * span, const char *qbuffer, Size qbuffer_size); +extern const char *get_operation_name(const Span * span, const char *qbuffer, Size qbuffer_size); + +extern int64 get_current_ns(void); +extern int64 get_span_end_ns(const Span * span); + +#endif diff --git a/contrib/pg_tracing/sql/cleanup.sql b/contrib/pg_tracing/sql/cleanup.sql new file mode 100644 index 0000000000..beb88da53f --- /dev/null +++ b/contrib/pg_tracing/sql/cleanup.sql @@ -0,0 +1,4 @@ +-- Drop test table and test functions +DROP TABLE pg_tracing_test; +DROP function test_function; +DROP EXTENSION pg_tracing; diff --git a/contrib/pg_tracing/sql/nested.sql b/contrib/pg_tracing/sql/nested.sql new file mode 100644 index 0000000000..bd922b44c4 --- /dev/null +++ b/contrib/pg_tracing/sql/nested.sql @@ -0,0 +1,102 @@ +-- Create test function to sample +CREATE OR REPLACE FUNCTION test_function(a int) RETURNS SETOF oid AS +$BODY$ +BEGIN + RETURN QUERY SELECT oid from pg_class where oid = a; +END; +$BODY$ +LANGUAGE plpgsql; + +-- Trace a statement with a function call +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000051-0000000000000051-01'*/ select 
test_function(1); + +-- The test function call will generate the following spans (non exhaustive list): +-- +-----------------------------------------------------------------------------------------------+ +-- | A: Select test_function(1); | +-- +-+--------++----------++----------------++---------------------------------------------------+-+ +-- |B: Parse||C: Planner||D: ExecutorStart||E: ExecutorRun | +-- +--------++----------++----------------+++-------------------------------------------------++ +-- |F: ProjectSet | +-- ++---------+------+------------------------------++ +-- |G: Result| | H: Select a from b where... | +-- +---------+ +----+--------------+----------+ +-- |I: ExecutorRun| +-- +--------------+ + +-- Gather span_id, span start and span end of function call statement +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=81 and name!='Parse' \gset +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_a_id and name='Executor' and resource='Run' \gset +SELECT span_id AS span_f_id, + get_span_start_ns(span_start, span_start_ns) as span_f_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_f_end + from pg_tracing_peek_spans where parent_id=:span_e_id and name='ProjectSet' \gset +SELECT span_id AS span_g_id, + get_span_start_ns(span_start, span_start_ns) as span_g_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_g_end + from pg_tracing_peek_spans where parent_id=:span_f_id and name='Result' \gset +SELECT span_id AS span_h_id, + get_span_start_ns(span_start, span_start_ns) as span_h_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_h_end + from pg_tracing_peek_spans where 
parent_id=:span_f_id and name='Select' \gset +SELECT span_id AS span_i_id, + get_span_start_ns(span_start, span_start_ns) as span_i_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_i_end + from pg_tracing_peek_spans where parent_id=:span_h_id and resource='Run' \gset + +-- Check that spans' start and end are within expection +SELECT :span_a_start < :span_e_start AS top_query_before_run, + :span_a_end >= :span_e_end AS top_end_after_run_end, + + :span_e_start <= :span_f_start AS top_run_start_before_project, + + :span_e_end >= :span_f_end AS top_run_end_after_project_end, + :span_e_end >= :span_h_end AS top_run_end_before_select_end, + :span_e_end >= :span_i_end AS top_run_end_after_nested_run_end, + + :span_i_end <= :span_h_end AS run_end_after_select_end; + +-- Check that the root span is the longest one +WITH max_duration AS (select max(duration) from pg_tracing_peek_spans) +SELECT duration = max_duration.max from pg_tracing_peek_spans, max_duration + where span_id = :span_a_id; + +-- Check that ExecutorRun is attached to the nested top span +SELECT resource from pg_tracing_peek_spans where parent_id=:span_i_id order by resource; + +-- Check tracking with top tracking +set pg_tracing.track = 'top'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000052-0000000000000052-01'*/ select test_function(1); +SELECT count(*) from pg_tracing_consume_spans where trace_id=82; + +-- Check tracking with no tracking +set pg_tracing.track = 'none'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000053-0000000000000053-01'*/ select test_function(1); +SELECT count(*) from pg_tracing_consume_spans where trace_id=83; + +-- Reset tracking setting +set pg_tracing.track = 'all'; + +-- Create test procedure +CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ +DECLARE + r int; +BEGIN + SELECT (i + i)::int INTO r; +END; $$ LANGUAGE plpgsql; + +-- Test tracking of procedure with utility tracking enabled +set pg_tracing.track_utility=on; 
+/*traceparent='00-00000000000000000000000000000054-0000000000000054-01'*/ CALL sum_one(3); +SELECT resource from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + + +-- Test again with utility tracking disabled +set pg_tracing.track_utility=off; +/*traceparent='00-00000000000000000000000000000055-0000000000000055-01'*/ CALL sum_one(10); +SELECT resource from pg_tracing_consume_spans order by span_start, span_start_ns, resource; diff --git a/contrib/pg_tracing/sql/parallel.sql b/contrib/pg_tracing/sql/parallel.sql new file mode 100644 index 0000000000..0390a20b33 --- /dev/null +++ b/contrib/pg_tracing/sql/parallel.sql @@ -0,0 +1,16 @@ +begin; +-- encourage use of parallel plans +set parallel_setup_cost=0; +set parallel_tuple_cost=0; +set min_parallel_table_scan_size=0; +set max_parallel_workers_per_gather=2; + +-- Trace parallel queries +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ select 1 from pg_class limit 1; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ select 2 from pg_class limit 2; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-00'*/ select 3 from pg_class limit 3; +commit; + +-- Check order and number of spans generated by parallel queries +-- We can't order by span start due to the parallelisation introducing randomness +SELECT name, resource from pg_tracing_consume_spans order by resource; diff --git a/contrib/pg_tracing/sql/reset.sql b/contrib/pg_tracing/sql/reset.sql new file mode 100644 index 0000000000..531db2172d --- /dev/null +++ b/contrib/pg_tracing/sql/reset.sql @@ -0,0 +1,3 @@ +-- Check reset is working +SELECT pg_tracing_reset(); +SELECT traces from pg_tracing_info; diff --git a/contrib/pg_tracing/sql/sample.sql b/contrib/pg_tracing/sql/sample.sql new file mode 100644 index 0000000000..bdfb59ad70 --- /dev/null +++ b/contrib/pg_tracing/sql/sample.sql @@ -0,0 +1,41 @@ +-- Trace nothing 
+SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 0.0; + +-- Query with sampling flag +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000021-0000000000000021-01'*/ SELECT 1; +-- Query without trace context +SELECT 1; + +-- No spans should have been generated +select count(distinct(trace_id)) from pg_tracing_peek_spans; + +-- Enable full sampling +SET pg_tracing.sample_rate = 1.0; + +-- Generate queries with sampling flag on, off and no trace context +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000022-0000000000000022-01'*/ SELECT 1; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000023-0000000000000023-00'*/ SELECT 2; +SELECT 3; +SELECT 4; + +-- Check number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; +-- Check span order +select resource, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; +-- Top spans should reuse generated ids and have trace_id = parent_id = span_id +select resource, parameters from pg_tracing_consume_spans where trace_id = parent_id and parent_id = span_id order by span_start, span_start_ns; + +-- Only trace query with sampled flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; + +-- Generate queries with sampling flag on, off and no trace context +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000024-0000000000000024-01'*/ SELECT 1; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000025-0000000000000025-00'*/ SELECT 2; +SELECT 1; + +-- Check number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; +-- Check span ordering +select resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns; diff --git a/contrib/pg_tracing/sql/select.sql b/contrib/pg_tracing/sql/select.sql new file mode 100644 index 0000000000..9b8d6b5fb0 --- /dev/null +++ b/contrib/pg_tracing/sql/select.sql @@ -0,0 +1,81 @@ 
+-- Only trace queries with sample flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; + +-- Run a simple query +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ SELECT 1; + +-- Get top span id +SELECT span_id AS top_span_id from pg_tracing_peek_spans where parent_id=1 and name!='Parse' \gset + +-- Check parameters +SELECT parameters from pg_tracing_peek_spans where span_id=:top_span_id; + +-- Check the number of children +SELECT count(*) from pg_tracing_peek_spans where parent_id=:'top_span_id'; + +-- Check resource and query id +SELECT name, resource, query_id from pg_tracing_peek_spans where trace_id=1 order by span_start, span_start_ns, resource; + +-- Check reported number of trace +SELECT traces from pg_tracing_info; + +-- Trace a statement with function call +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ SELECT count(*) from current_database(); + +-- Check the generated span name, resource and order of function call +SELECT name, resource from pg_tracing_consume_spans where trace_id=3 order by resource; + +-- Trace a more complex query with multiple function calls +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000004-0000000000000004-01'*/ SELECT s.relation_size + s.index_size +FROM (SELECT + pg_relation_size(C.oid) as relation_size, + pg_indexes_size(C.oid) as index_size + FROM pg_class C) as s limit 1; + +-- Check the generated span name, resource and order of query with multiple function calls +SELECT name, resource from pg_tracing_consume_spans where trace_id=4 order by resource; + +-- Check that we're in a correct state after a timeout +set statement_timeout=200; + +-- Trace query triggering a statement timeout +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000007-0000000000000007-01'*/ select * from pg_sleep(10); +-- Trace a working query after the timeout to check we're in a consistent state 
+/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000008-0000000000000008-01'*/ select 1; + +-- Check the spans order and error code +SELECT name, resource, sql_error_code from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + +-- Cleanup statement setting +set statement_timeout=0; + +-- Create a prepare statement with comment passed as first parameter +PREPARE test_prepared (text, integer) AS /*$1*/ SELECT $2; +-- Execute prepare statement with trace context passed as a parameter +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000009-0000000000000009-01''', 1); + +-- Check generated spans and order +SELECT trace_id, name, resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +-- Execute prepare statement with trace context passed as a parameter and generic plan +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000010-0000000000000010-01''', 10); + +-- Check spans are generated even through generic plan +SELECT trace_id, resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + +-- Run a statement with node not executed +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000a-000000000000000a-01'*/ select 1 limit 0; +-- Not executed node should not generate any spans +SELECT trace_id, resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + +-- Test multiple statements in a single query +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000011-0000000000000012-01'*/ select 1; select 2; +-- Check spans generated for each of the multiple statements +select resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns; + +-- Cleanup +SET plan_cache_mode='auto'; +DEALLOCATE test_prepared; diff
--git a/contrib/pg_tracing/sql/subxact.sql b/contrib/pg_tracing/sql/subxact.sql new file mode 100644 index 0000000000..89082f523b --- /dev/null +++ b/contrib/pg_tracing/sql/subxact.sql @@ -0,0 +1,18 @@ +-- Create test table +CREATE TABLE IF NOT EXISTS pg_tracing_test (a int, b char(20)); + +-- Enable full sampling +SET pg_tracing.sample_rate = 1.0; + +-- Start a transaction with subxaction +BEGIN; +SAVEPOINT s1; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s2; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s3; +SELECT 1; +COMMIT; + +-- Check that subxact_count is correctly reported +select resource, parameters, subxact_count from pg_tracing_consume_spans order by span_start, span_start_ns; diff --git a/contrib/pg_tracing/sql/utility.sql b/contrib/pg_tracing/sql/utility.sql new file mode 100644 index 0000000000..7415232e52 --- /dev/null +++ b/contrib/pg_tracing/sql/utility.sql @@ -0,0 +1,128 @@ +-- Some utility functions +CREATE OR REPLACE FUNCTION get_span_start_ns(time_start timestamptz, start_ns smallint) RETURNS bigint AS +$BODY$ +BEGIN + RETURN extract(epoch from time_start) * 1000000000 + start_ns; +END; +$BODY$ +LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION get_span_end_ns(time_start timestamptz, start_ns smallint, duration bigint) RETURNS bigint AS +$BODY$ +BEGIN + RETURN get_span_start_ns(time_start, start_ns) + duration; +END; +$BODY$ +LANGUAGE plpgsql; + +-- Create pg_tracing extension with sampling on +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; + +-- This will create the following spans (non exhaustive list): +-- +-- +-------------------------------------------------------------------------------------+ +-- | A: Utility: Create Extension | +-- +-+-----------------------------------------------------------------------------------+ +-- 
+----------------------------------------------------------------------------------+ +-- |B: ProcessUtility: Create Extension | +-- +---+-----------------------------------+---+--------------------------------------+ +-- +-----------------------------------+ +-------------------------------------+ +-- |C: Utility: Create Function1 | |E: Utility: Create Function2 | +-- ++----------------------------------+ ++-----------------------------------++ +-- +----------------------------------+ +-----------------------------------+ +-- |D: ProcessUtility: Create Function| |F: ProcessUtility: Create Function2| +-- +----------------------------------+ +-----------------------------------+ + +-- Extract span_ids, start and end of those spans +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=1 and name='Utility' \gset + +SELECT span_id AS span_b_id, + get_span_start_ns(span_start, span_start_ns) as span_b_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_b_end + from pg_tracing_peek_spans where parent_id=:span_a_id and name='ProcessUtility' \gset + +SELECT span_id AS span_c_id, + get_span_start_ns(span_start, span_start_ns) as span_c_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_c_end + from pg_tracing_peek_spans where parent_id=:span_b_id and name='Utility' limit 1 \gset + +SELECT span_id AS span_d_id, + get_span_start_ns(span_start, span_start_ns) as span_d_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_d_end + from pg_tracing_peek_spans where parent_id=:span_c_id and name='ProcessUtility' \gset + +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_b_id and name='Utility' limit 1 offset 1 \gset + 
+-- Check that the start and end of those spans are within expectation +SELECT :span_a_start < :span_b_start AS span_a_start_first, + :span_a_end >= :span_b_end AS span_a_end_last, + + :span_d_end <= :span_c_end AS nested_span_ends_before_parent, + :span_c_end <= :span_e_start AS next_utility_start_after; + +-- Clean current spans +select count(*) from pg_tracing_consume_spans; + +-- +-- Test that no utility is captured with track_utility off +-- + +-- Set utility off +SET pg_tracing.track_utility = off; + +-- Prepare and execute a prepared statement +PREPARE test_prepared_one_param (integer) AS SELECT $1; +EXECUTE test_prepared_one_param(100); + +-- Nothing should be generated +select count(*) from pg_tracing_consume_spans; + +-- Force a query to start from ExecutorRun +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + +-- Again, nothing should be generated +select count(*) from pg_tracing_consume_spans; + +-- +-- Test that utility is captured with track_utility on +-- + +-- Enable utility tracking and track everything +SET pg_tracing.track_utility = on; +SET pg_tracing.sample_rate = 1.0; + +-- Prepare and execute a prepared statement +PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; +EXECUTE test_prepared_one_param_2(100); + +-- Check the number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; +-- Check that the spans are in correct order +select resource, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; +-- Check the top span (standalone top span has trace_id=parent_id=span_id) +select resource, parameters from pg_tracing_consume_spans where trace_id = parent_id and parent_id = span_id order by span_start, span_start_ns; + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + +-- Check the number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; +-- Check
that the spans are in correct order +select resource, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; +-- Check the top span (standalone top span has trace_id=parent_id=span_id) +select resource, parameters from pg_tracing_consume_spans where trace_id = parent_id and parent_id = span_id order by span_start, span_start_ns; + +-- Second create extension should generate an error that is captured by span +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; +select resource, parameters, sql_error_code from pg_tracing_consume_spans order by span_start, span_start_ns; + +-- Cleanup +SET plan_cache_mode='auto'; diff --git a/contrib/pg_tracing/sql/wal.sql b/contrib/pg_tracing/sql/wal.sql new file mode 100644 index 0000000000..c83b287ee2 --- /dev/null +++ b/contrib/pg_tracing/sql/wal.sql @@ -0,0 +1,32 @@ +-- Create test table +CREATE TABLE IF NOT EXISTS pg_tracing_test (a int, b char(20)); + +-- Enable wal instrumentation +set pg_tracing.instrument_wal = true; + +-- Generate queries with wal write +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; + +-- Check WAL is generated for the above statements +SELECT trace_id, name, resource, + wal_records > 0 as wal_records, + wal_bytes > 0 as wal_bytes +FROM pg_tracing_consume_spans order by span_start, span_start_ns, resource; + +-- Redo the same but without wal instrumentation +set pg_tracing.instrument_wal = false; + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test 
VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; + +-- With wal instrumentation disabled, wal counters should be set to 0 +SELECT trace_id, name, resource, + wal_records = 0 as wal_records, + wal_bytes = 0 as wal_bytes +FROM pg_tracing_consume_spans order by span_start, span_start_ns, resource; + +-- Cleanup +set pg_tracing.instrument_wal = true; diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index ee78a5749d..1174e2450f 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -70,6 +70,7 @@ InstrStartNode(Instrumentation *instr) if (instr->need_timer && !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); + INSTR_TIME_SET_CURRENT_LAZY(instr->firsttime); /* save buffer usage totals at node entry, if needed */ if (instr->need_bufusage) diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 87e5e2183b..13840be1d7 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -72,6 +72,7 @@ typedef struct Instrumentation bool async_mode; /* true if node is in async mode */ /* Info about current plan cycle: */ bool running; /* true if we've completed first tuple */ + instr_time firsttime; /* start time of this node */ instr_time starttime; /* start time of current iteration of node */ instr_time counter; /* accumulated runtime for this node */ double firsttuple; /* time for first tuple of this cycle */ -- 2.39.3 (Apple Git-145)