From d9aaecd64d37c8d3684d58d52431146010328713 Mon Sep 17 00:00:00 2001 From: Anthonin Bonnefoy Date: Mon, 15 May 2023 10:00:08 +0200 Subject: [PATCH v10] Add pg_tracing extension to contrib This extension add the possibility to create server-side spans for distributed tracing. When a query is sampled, the extension will create spans tracking the process of this query: Parse, Planner, Executor... During ExecutorEnd, the planstate is also converted in spans, effectively generating an explain analyze in a trace format. The spans are exposed through the pg_tracing_peek_spans and pg_tracing_consume_spans views. Sending spans to a trace collector is outside this extension's scope. --- contrib/Makefile | 1 + contrib/meson.build | 1 + contrib/pg_tracing/Makefile | 28 + contrib/pg_tracing/expected/cleanup.out | 5 + contrib/pg_tracing/expected/extended.out | 193 ++ contrib/pg_tracing/expected/insert.out | 64 + contrib/pg_tracing/expected/nested.out | 317 ++ contrib/pg_tracing/expected/parallel.out | 55 + contrib/pg_tracing/expected/reset.out | 13 + contrib/pg_tracing/expected/sample.out | 162 ++ contrib/pg_tracing/expected/select.out | 350 +++ contrib/pg_tracing/expected/subxact.out | 73 + contrib/pg_tracing/expected/trigger.out | 271 ++ contrib/pg_tracing/expected/utility.out | 243 ++ contrib/pg_tracing/expected/wal.out | 97 + contrib/pg_tracing/meson.build | 56 + contrib/pg_tracing/pg_tracing--1.0.sql | 102 + contrib/pg_tracing/pg_tracing.c | 2560 +++++++++++++++++ contrib/pg_tracing/pg_tracing.conf | 1 + contrib/pg_tracing/pg_tracing.control | 5 + contrib/pg_tracing/pg_tracing.h | 43 + contrib/pg_tracing/pg_tracing_explain.c | 552 ++++ contrib/pg_tracing/pg_tracing_explain.h | 21 + contrib/pg_tracing/pg_tracing_parallel.c | 103 + contrib/pg_tracing/pg_tracing_parallel.h | 44 + contrib/pg_tracing/pg_tracing_query_process.c | 478 +++ contrib/pg_tracing/pg_tracing_query_process.h | 30 + contrib/pg_tracing/pg_tracing_span.c | 307 ++ contrib/pg_tracing/pg_tracing_span.h | 167 ++ contrib/pg_tracing/sql/cleanup.sql | 5 + contrib/pg_tracing/sql/extended.sql | 44 + contrib/pg_tracing/sql/insert.sql | 26 + contrib/pg_tracing/sql/nested.sql | 204 ++ contrib/pg_tracing/sql/parallel.sql | 26 + contrib/pg_tracing/sql/reset.sql | 3 + contrib/pg_tracing/sql/sample.sql | 41 + contrib/pg_tracing/sql/select.sql | 113 + contrib/pg_tracing/sql/subxact.sql | 15 + contrib/pg_tracing/sql/trigger.sql | 204 ++ contrib/pg_tracing/sql/utility.sql | 144 + contrib/pg_tracing/sql/wal.sql | 29 + doc/src/sgml/contrib.sgml | 1 + doc/src/sgml/filelist.sgml | 1 + doc/src/sgml/pgtracing.sgml | 1035 +++++++ src/backend/executor/instrument.c | 8 +- src/include/executor/instrument.h | 1 + 46 files changed, 8239 insertions(+), 3 deletions(-) create mode 100644 contrib/pg_tracing/Makefile create mode 100644 contrib/pg_tracing/expected/cleanup.out create mode 100644 contrib/pg_tracing/expected/extended.out create mode 100644 contrib/pg_tracing/expected/insert.out create mode 100644 contrib/pg_tracing/expected/nested.out create mode 100644 contrib/pg_tracing/expected/parallel.out create mode 100644 contrib/pg_tracing/expected/reset.out create mode 100644 contrib/pg_tracing/expected/sample.out create mode 100644 contrib/pg_tracing/expected/select.out create mode 100644 contrib/pg_tracing/expected/subxact.out create mode 100644 contrib/pg_tracing/expected/trigger.out create mode 100644 contrib/pg_tracing/expected/utility.out create mode 100644 contrib/pg_tracing/expected/wal.out create mode 100644 contrib/pg_tracing/meson.build create mode 100644 contrib/pg_tracing/pg_tracing--1.0.sql create mode 100644 contrib/pg_tracing/pg_tracing.c create mode 100644 contrib/pg_tracing/pg_tracing.conf create mode 100644 contrib/pg_tracing/pg_tracing.control create mode 100644 contrib/pg_tracing/pg_tracing.h create mode 100644 contrib/pg_tracing/pg_tracing_explain.c create mode 100644 contrib/pg_tracing/pg_tracing_explain.h create mode 100644 contrib/pg_tracing/pg_tracing_parallel.c create mode 100644 contrib/pg_tracing/pg_tracing_parallel.h create mode 100644 contrib/pg_tracing/pg_tracing_query_process.c create mode 100644 contrib/pg_tracing/pg_tracing_query_process.h create mode 100644 contrib/pg_tracing/pg_tracing_span.c create mode 100644 contrib/pg_tracing/pg_tracing_span.h create mode 100644 contrib/pg_tracing/sql/cleanup.sql create mode 100644 contrib/pg_tracing/sql/extended.sql create mode 100644 contrib/pg_tracing/sql/insert.sql create mode 100644 contrib/pg_tracing/sql/nested.sql create mode 100644 contrib/pg_tracing/sql/parallel.sql create mode 100644 contrib/pg_tracing/sql/reset.sql create mode 100644 contrib/pg_tracing/sql/sample.sql create mode 100644 contrib/pg_tracing/sql/select.sql create mode 100644 contrib/pg_tracing/sql/subxact.sql create mode 100644 contrib/pg_tracing/sql/trigger.sql create mode 100644 contrib/pg_tracing/sql/utility.sql create mode 100644 contrib/pg_tracing/sql/wal.sql create mode 100644 doc/src/sgml/pgtracing.sgml diff --git a/contrib/Makefile b/contrib/Makefile index da4e2316a3..721a58fcde 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -36,6 +36,7 @@ SUBDIRS = \ pg_prewarm \ pg_stat_statements \ pg_surgery \ + pg_tracing \ pg_trgm \ pgrowlocks \ pgstattuple \ diff --git a/contrib/meson.build b/contrib/meson.build index c0b267c632..05d1a3c34e 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -52,6 +52,7 @@ subdir('pgrowlocks') subdir('pg_stat_statements') subdir('pgstattuple') subdir('pg_surgery') +subdir('pg_tracing') subdir('pg_trgm') subdir('pg_visibility') subdir('pg_walinspect') diff --git a/contrib/pg_tracing/Makefile b/contrib/pg_tracing/Makefile new file mode 100644 index 0000000000..5b34e6f207 --- /dev/null +++ b/contrib/pg_tracing/Makefile @@ -0,0 +1,28 @@ +# contrib/pg_tracing/Makefile + +MODULE_big = pg_tracing +OBJS = \ + $(WIN32RES) \ + pg_tracing.o \ + pg_tracing_query_process.o \ + pg_tracing_explain.o \ + pg_tracing_parallel.o \ + pg_tracing_span.o + +EXTENSION = pg_tracing +DATA = pg_tracing--1.0.sql +PGFILEDESC = "pg_tracing - Distributed tracing for postgres" + +REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_tracing/pg_tracing.conf +REGRESS = utility select extended insert trigger sample subxact reset nested wal parallel cleanup + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_tracing +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_tracing/expected/cleanup.out b/contrib/pg_tracing/expected/cleanup.out new file mode 100644 index 0000000000..7ac2eb43ef --- /dev/null +++ b/contrib/pg_tracing/expected/cleanup.out @@ -0,0 +1,5 @@ +-- Drop test table and test functions +DROP TABLE pg_tracing_test; +DROP function test_function_project_set; +DROP function test_function_result; +DROP EXTENSION pg_tracing; diff --git a/contrib/pg_tracing/expected/extended.out b/contrib/pg_tracing/expected/extended.out new file mode 100644 index 0000000000..8902d8e95c --- /dev/null +++ b/contrib/pg_tracing/expected/extended.out @@ -0,0 +1,193 @@ +-- Trace everything +SET pg_tracing.sample_rate = 1.0; +-- Simple query with extended protocol +SELECT $1, $2 \bind 1 2 \g + ?column? | ?column? +----------+---------- + 1 | 2 +(1 row) + +-- Check generated spans +SELECT span_type, span_operation, parameters FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + span_type | span_operation | parameters +----------------+-----------------------------------+-------------------- + Parse | Parse | + Post Parse | Post Parse | + Utility | SET pg_tracing.sample_rate = 0.0; | + ProcessUtility | ProcessUtility | + Parse | Parse | + Post Parse | Post Parse | + Select | SELECT $1, $2 | $1 = '1', $2 = '2' + Planner | Planner | + Executor | ExecutorStart | + Executor | ExecutorRun | + Result | Result | + Executor | ExecutorFinish | + Executor | ExecutorEnd | +(13 rows) + +-- Trigger an error due to mismatching number of parameters +BEGIN; select $1 \bind \g +ERROR: bind message supplies 0 parameters, but prepared statement "" requires 1 +ROLLBACK; +-- Check that no spans were generated +SELECT span_type, span_operation, parameters FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + span_type | span_operation | parameters +----------------+----------------+------------ + Parse | Parse | + Utility | BEGIN; | + Post Parse | Post Parse | + ProcessUtility | ProcessUtility | + Parse | Parse | + Post Parse | Post Parse | + Utility | ROLLBACK; | + ProcessUtility | ProcessUtility | +(8 rows) + +-- Execute queries with extended protocol within an explicit transaction +BEGIN; +SELECT $1 \bind 1 \g + ?column? +---------- + 1 +(1 row) + +SELECT $1, $2 \bind 2 3 \g + ?column? | ?column? +----------+---------- + 2 | 3 +(1 row) + +COMMIT; +-- Spans within the same transaction should have been generated with the same trace_id +SELECT count(distinct(trace_id)) = 1 FROM pg_tracing_peek_spans; + ?column? +---------- + t +(1 row) + +-- Check generated spans order +SELECT span_type, span_operation, parameters FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + span_type | span_operation | parameters +----------------+----------------+-------------------- + Parse | Parse | + Utility | BEGIN; | + Post Parse | Post Parse | + ProcessUtility | ProcessUtility | + Parse | Parse | + Post Parse | Post Parse | + Select | SELECT $1 | $1 = '1' + Planner | Planner | + Executor | ExecutorStart | + Executor | ExecutorRun | + Result | Result | + Parse | Parse | + Post Parse | Post Parse | + Select | SELECT $1, $2 | $1 = '2', $2 = '3' + Executor | ExecutorFinish | + Executor | ExecutorEnd | + Planner | Planner | + Executor | ExecutorStart | + Executor | ExecutorRun | + Result | Result | + Parse | Parse | + Utility | COMMIT; | + Post Parse | Post Parse | + Executor | ExecutorFinish | + Executor | ExecutorEnd | + ProcessUtility | ProcessUtility | +(26 rows) + +-- Mix extended protocol and simple protocol +BEGIN; +SELECT $1 \bind 1 \g + ?column? +---------- + 1 +(1 row) + +SELECT 5, 6, 7; + ?column? | ?column? | ?column? +----------+----------+---------- + 5 | 6 | 7 +(1 row) + +SELECT $1, $2 \bind 2 3 \g + ?column? | ?column? +----------+---------- + 2 | 3 +(1 row) + +COMMIT; +-- Spans within the same transaction should have been generated with the same trace_id +SELECT count(distinct(trace_id)) = 1 FROM pg_tracing_peek_spans; + ?column? +---------- + t +(1 row) + +-- Check generated spans order +SELECT span_type, span_operation, parameters FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + span_type | span_operation | parameters +----------------+--------------------+------------------------ + Parse | Parse | + Utility | BEGIN; | + Post Parse | Post Parse | + ProcessUtility | ProcessUtility | + Parse | Parse | + Post Parse | Post Parse | + Select | SELECT $1 | $1 = '1' + Planner | Planner | + Executor | ExecutorStart | + Executor | ExecutorRun | + Result | Result | + Parse | Parse | + Post Parse | Post Parse | + Select | SELECT $1, $2, $3; | $1 = 5, $2 = 6, $3 = 7 + Planner | Planner | + Executor | ExecutorFinish | + Executor | ExecutorEnd | + Executor | ExecutorStart | + Executor | ExecutorRun | + Result | Result | + Executor | ExecutorFinish | + Executor | ExecutorEnd | + Parse | Parse | + Post Parse | Post Parse | + Select | SELECT $1, $2 | $1 = '2', $2 = '3' + Planner | Planner | + Executor | ExecutorStart | + Executor | ExecutorRun | + Result | Result | + Parse | Parse | + Utility | COMMIT; | + Post Parse | Post Parse | + Executor | ExecutorFinish | + Executor | ExecutorEnd | + ProcessUtility | ProcessUtility | +(35 rows) + +-- gdesc calls a single parse command then execute a query. Make sure we handle this case +\gdesc + Column | Type +----------------+------ + span_type | text + span_operation | text + parameters | text +(3 rows) + +-- Checking gdesc generated spans +SELECT span_type, span_operation, parameters FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + span_type | span_operation | parameters +------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------ + Parse | Parse | + Post Parse | Post Parse | + Select | SELECT name AS "Column", pg_catalog.format_type(tp, tpm) AS "Type"FROM (VALUES ($1, $2::pg_catalog.oid,$3),($4, $5::pg_catalog.oid,$6),($7, $8::pg_catalog.oid,$9)) s(name, tp, tpm) | $1 = 'span_type', $2 = '25', $3 = -1, $4 = 'span_operation', $5 = '25', $6 = -1, $7 = 'parameters', $8 = '25', $9 = -1 + Planner | Planner | + Executor | ExecutorStart | + Executor | ExecutorRun | + ValuesScan | ValuesScan on "*VALUES*" | + Executor | ExecutorFinish | + Executor | ExecutorEnd | +(9 rows) + diff --git a/contrib/pg_tracing/expected/insert.out b/contrib/pg_tracing/expected/insert.out new file mode 100644 index 0000000000..38b0685918 --- /dev/null +++ b/contrib/pg_tracing/expected/insert.out @@ -0,0 +1,64 @@ +-- Only trace queries with sample flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE TABLE IF NOT EXISTS pg_tracing_test_table_with_constraint (a int, b char(20), CONSTRAINT PK_tracing_test PRIMARY KEY (a)); +-- Check create statement spans +SELECT span_type, span_operation from pg_tracing_consume_spans where trace_id=1 order by span_start, span_start_ns, span_operation; + span_type | span_operation +----------------+----------------------------------------------------------------------------------------------------------------------------------- + Parse | Parse + Utility | CREATE TABLE IF NOT EXISTS pg_tracing_test_table_with_constraint (a int, b char(20), CONSTRAINT PK_tracing_test PRIMARY KEY (a)); + Post Parse | Post Parse + ProcessUtility | ProcessUtility + Utility | CREATE TABLE IF NOT EXISTS pg_tracing_test_table_with_constraint (a int, b char(20), CONSTRAINT PK_tracing_test PRIMARY KEY (a)); + ProcessUtility | ProcessUtility + Utility | CREATE TABLE IF NOT EXISTS pg_tracing_test_table_with_constraint (a int, b char(20), CONSTRAINT PK_tracing_test PRIMARY KEY (a)); + ProcessUtility | ProcessUtility +(8 rows) + +-- Simple insertion +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test_table_with_constraint VALUES(1, 'aaa'); +-- Check insert spans +SELECT span_type, span_operation from pg_tracing_consume_spans where trace_id=1 order by span_start, span_start_ns, span_operation; + span_type | span_operation +------------+------------------------------------------------------------------- + Parse | Parse + Insert | INSERT INTO pg_tracing_test_table_with_constraint VALUES($1, $2); + Post Parse | Post Parse + Planner | Planner + Executor | ExecutorStart + Executor | ExecutorRun + Insert | Insert on pg_tracing_test_table_with_constraint + Result | Result + Executor | ExecutorFinish + Executor | ExecutorEnd +(10 rows) + +-- Trigger constraint violation +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test_table_with_constraint VALUES(1, 'aaa'); +ERROR: duplicate key value violates unique constraint "pk_tracing_test" +DETAIL: Key (a)=(1) already exists. +-- Check violation spans +SELECT span_type, span_operation, sql_error_code from pg_tracing_consume_spans where trace_id=1 order by span_start, span_start_ns, span_operation; + span_type | span_operation | sql_error_code +------------+-------------------------------------------------------------------+---------------- + Parse | Parse | 00000 + Insert | INSERT INTO pg_tracing_test_table_with_constraint VALUES($1, $2); | 23505 + Post Parse | Post Parse | 00000 + Planner | Planner | 00000 + Executor | ExecutorStart | 00000 + Executor | ExecutorRun | 23505 + Insert | Insert on pg_tracing_test_table_with_constraint | 23505 + Result | Result | 23505 +(8 rows) + +-- Trigger an error while calling pg_tracing_peek_spans which resets tracing +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test_table_with_constraint VALUES(length((select sql_error_code from public.pg_tracing_peek_spans)), 'aaa'); +ERROR: null value in column "a" of relation "pg_tracing_test_table_with_constraint" violates not-null constraint +DETAIL: Failing row contains (null, aaa ). +-- Nothing should be generated +SELECT span_type, span_operation, sql_error_code from pg_tracing_consume_spans where trace_id=1 order by span_start, span_start_ns, span_operation; + span_type | span_operation | sql_error_code +-----------+----------------+---------------- +(0 rows) + diff --git a/contrib/pg_tracing/expected/nested.out b/contrib/pg_tracing/expected/nested.out new file mode 100644 index 0000000000..77e6bd32a0 --- /dev/null +++ b/contrib/pg_tracing/expected/nested.out @@ -0,0 +1,317 @@ +-- Create test function to sample +CREATE OR REPLACE FUNCTION test_function_project_set(a int) RETURNS SETOF oid AS +$BODY$ +BEGIN + RETURN QUERY SELECT oid from pg_class where oid = a; +END; +$BODY$ +LANGUAGE plpgsql; +CREATE OR REPLACE FUNCTION test_function_result(a int, b text) RETURNS void AS +$BODY$ +BEGIN + INSERT INTO pg_tracing_test(a, b) VALUES (a, b); +END; +$BODY$ +LANGUAGE plpgsql; +-- Trace a statement with a function call +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000051-0000000000000051-01'*/ select test_function_project_set(1); + test_function_project_set +--------------------------- +(0 rows) + +-- The test function call will generate the following spans (non exhaustive list): +-- +---------------------------------------------------------------------------------------------------+ +-- | A: Select test_function_project_set(1); | +-- +-+--------++----------++----------------++--------------------------------------------------------++ +-- |B: Parse||C: Planner||D: ExecutorStart||E: ExecutorRun | +-- +--------++----------++----------------+++-----------------------------------------------------+-+ +-- |F: ProjectSet | +-- ++---------+---------+------------------------------+-+ +-- |G: Result|H: Parse | I: Select a from b where... | +-- +---------+---------+----+--------------+----------+ +-- |J: ExecutorRun| +-- +--------------+ +-- Gather span_id, span start and span end of function call statement +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=81 and span_type!='Parse' \gset +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_a_id and span_type='Executor' and span_operation='ExecutorRun' \gset +SELECT span_id AS span_f_id, + get_span_start_ns(span_start, span_start_ns) as span_f_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_f_end + from pg_tracing_peek_spans where parent_id=:span_e_id and span_type='ProjectSet' \gset +SELECT span_id AS span_g_id, + get_span_start_ns(span_start, span_start_ns) as span_g_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_g_end + from pg_tracing_peek_spans where parent_id=:span_f_id and span_type='Result' \gset +SELECT span_id AS span_h_id, + get_span_start_ns(span_start, span_start_ns) as span_h_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_h_end + from pg_tracing_peek_spans where parent_id=:span_f_id and span_type='Parse' \gset +SELECT span_id AS span_i_id, + get_span_start_ns(span_start, span_start_ns) as span_i_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_i_end + from pg_tracing_peek_spans where parent_id=:span_f_id and span_type='Select' \gset +SELECT span_id AS span_j_id, + get_span_start_ns(span_start, span_start_ns) as span_j_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_j_end + from pg_tracing_peek_spans where parent_id=:span_i_id and span_operation='ExecutorRun' \gset +-- Check that spans' start and end are within expection +SELECT :span_a_start < :span_e_start AS top_query_before_run, + :span_a_end >= :span_e_end AS top_ends_after_run_end, + :span_e_start <= :span_f_start AS top_run_starts_before_project, + :span_e_end >= :span_f_end AS top_run_ends_after_project_end, + :span_e_end >= :span_h_end AS top_run_ends_before_select_end, + :span_e_end >= :span_i_end AS top_run_ends_after_nested_run_end; + top_query_before_run | top_ends_after_run_end | top_run_starts_before_project | top_run_ends_after_project_end | top_run_ends_before_select_end | top_run_ends_after_nested_run_end +----------------------+------------------------+-------------------------------+--------------------------------+--------------------------------+----------------------------------- + t | t | t | t | t | t +(1 row) + +SELECT + :span_g_end >= :span_h_start AS nested_result_ends_before_parse, + :span_h_end <= :span_i_start AS nested_parse_ends_before_select, + :span_j_start >= :span_i_start AS run_starts_after_parent_select, + :span_j_end <= :span_i_end AS run_ends_after_select_end; + nested_result_ends_before_parse | nested_parse_ends_before_select | run_starts_after_parent_select | run_ends_after_select_end +---------------------------------+---------------------------------+--------------------------------+--------------------------- + t | t | t | t +(1 row) + +-- Check that the root span is the longest one +WITH max_duration AS (select max(duration) from pg_tracing_peek_spans) +SELECT duration = max_duration.max from pg_tracing_peek_spans, max_duration + where span_id = :span_a_id; + ?column? +---------- + t +(1 row) + +-- Check that ExecutorRun is attached to the nested top span +SELECT span_operation, deparse_info from pg_tracing_peek_spans where parent_id=:span_j_id order by span_operation; + span_operation | deparse_info +----------------------------------------------------+------------------------------ + IndexOnlyScan using pg_class_oid_index on pg_class | Index Cond: (oid = '1'::oid) +(1 row) + +-- Check tracking with top tracking +set pg_tracing.track = 'top'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000052-0000000000000052-01'*/ select test_function_project_set(1); + test_function_project_set +--------------------------- +(0 rows) + +SELECT count(*) from pg_tracing_consume_spans where trace_id=82; + count +------- + 10 +(1 row) + +-- Check tracking with no tracking +set pg_tracing.track = 'none'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000053-0000000000000053-01'*/ select test_function_project_set(1); + test_function_project_set +--------------------------- +(0 rows) + +SELECT count(*) from pg_tracing_consume_spans where trace_id=83; + count +------- + 0 +(1 row) + +-- Reset tracking setting +set pg_tracing.track = 'all'; +-- Create test procedure +CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ +DECLARE + r int; +BEGIN + SELECT (i + i)::int INTO r; +END; $$ LANGUAGE plpgsql; +-- Test tracking of procedure with utility tracking enabled +set pg_tracing.track_utility=on; +/*traceparent='00-00000000000000000000000000000054-0000000000000054-01'*/ CALL sum_one(3); +SELECT span_operation from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + span_operation +--------------------- + Parse + CALL sum_one($1); + Post Parse + Parse + ProcessUtility + Post Parse + SELECT (i + i)::int + Planner + Planner + ExecutorStart + ExecutorRun + Result + ExecutorFinish + ExecutorEnd +(14 rows) + +-- Test again with utility tracking disabled +set pg_tracing.track_utility=off; +/*traceparent='00-00000000000000000000000000000055-0000000000000055-01'*/ CALL sum_one(10); +SELECT span_operation from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + span_operation +---------------- +(0 rows) + +-- Create immutable function +CREATE OR REPLACE FUNCTION test_immutable_function(a int) RETURNS oid +AS 'SELECT oid from pg_class where oid = a;' +LANGUAGE sql IMMUTABLE; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000056-0000000000000056-01'*/ select test_immutable_function(1); + test_immutable_function +------------------------- + +(1 row) + +SELECT span_operation from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + span_operation +------------------------------------- + Parse + Post Parse + select test_immutable_function($1); + Planner + Planner + ExecutorStart + ExecutorRun + Result + ExecutorFinish + ExecutorEnd +(10 rows) + +-- Create function with generate series +CREATE OR REPLACE FUNCTION test_generate_series(IN anyarray, OUT x anyelement) + RETURNS SETOF anyelement + LANGUAGE sql + AS 'select * from pg_catalog.generate_series(array_lower($1, 1), array_upper($1, 1), 1)'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000057-0000000000000057-01'*/ select test_generate_series('{1,2,3,4}'::int[]) FROM (VALUES (1,2)); + test_generate_series +---------------------- + 1 + 2 + 3 + 4 +(4 rows) + +SELECT span_id AS span_project_set_id, + get_span_start_ns(span_start, span_start_ns) as span_project_set_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_project_set_end + from pg_tracing_peek_spans where span_type='ProjectSet' \gset +SELECT span_id AS span_result_id, + get_span_start_ns(span_start, span_start_ns) as span_result_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_result_end + from pg_tracing_peek_spans where parent_id=:span_project_set_id and span_type='Result' \gset +SELECT span_id AS span_parse, + get_span_start_ns(span_start, span_start_ns) as span_parse_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_parse_end + from pg_tracing_peek_spans where parent_id=:span_project_set_id and span_type='Parse' \gset +-- Check that spans' start and end are within expection +SELECT :span_project_set_start < :span_parse_start AS project_set_starts_before_parse, + :span_result_start < :span_parse_start AS result_starts_before_parse, + :span_result_end <= :span_parse_start AS result_ends_before_parse, + :span_project_set_end > :span_parse_end AS project_set_ends_after_parse; + project_set_starts_before_parse | result_starts_before_parse | result_ends_before_parse | project_set_ends_after_parse +---------------------------------+----------------------------+--------------------------+------------------------------ + t | t | t | t +(1 row) + +SELECT span_operation from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + span_operation +---------------------------------------------------------------------------------------- + Parse + Post Parse + select test_generate_series($1::int[]) FROM (VALUES ($2,$3)); + Planner + ExecutorStart + ExecutorRun + ProjectSet + Result + Parse + Post Parse + select * from pg_catalog.generate_series(array_lower($1, $2), array_upper($1, $3), $4) + Planner + ExecutorFinish + ExecutorEnd +(14 rows) + +-- +---------------------------------------------------------------------------------------------------+ +-- | A: Select test_function(1); | +-- +-+--------++----------++----------------++--------------------------------------------------------++ +-- |B: Parse||C: Planner||D: ExecutorStart||E: ExecutorRun | +-- +--------++----------++----------------+++-----------------------------------------------------+-+ +-- |F: Result | +-- ++---------------+----------------------------------+-+ +-- |G: Parse | H: Insert INTO... | +-- +---------------+--------+--------------+----------+ +-- |I: ExecutorRun| +-- +--------------+ +-- Check function with result node +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000058-0000000000000058-01'*/ select test_function_result(1, 'test'); + test_function_result +---------------------- + +(1 row) + +-- Gather span_id, span start and span end of function call statement +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=88 and span_type!='Parse' \gset +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_a_id and span_type='Executor' and span_operation='ExecutorRun' \gset +SELECT span_id AS span_f_id, + get_span_start_ns(span_start, span_start_ns) as span_f_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_f_end + from pg_tracing_peek_spans where parent_id=:span_e_id and span_type='Result' \gset +SELECT span_id AS span_g_id, + get_span_start_ns(span_start, span_start_ns) as span_g_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_g_end + from pg_tracing_peek_spans where parent_id=:span_f_id and span_type='Parse' \gset +SELECT span_id AS span_h_id, + get_span_start_ns(span_start, span_start_ns) as span_h_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_h_end + from pg_tracing_peek_spans where parent_id=:span_f_id and span_type='Insert' \gset +-- Check that parse span is correctly positioned +SELECT :span_g_start >= :span_f_start AS parse_start_after_result, + :span_g_end < :span_f_end AS parse_ends_before_result, + :span_g_end <= :span_h_start AS parse_ends_before_insert_node; + parse_start_after_result | parse_ends_before_result | parse_ends_before_insert_node +--------------------------+--------------------------+------------------------------- + t | t | t +(1 row) + +SELECT span_operation from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + span_operation +------------------------------------------------- + Parse + Post Parse + select test_function_result($1, $2); + Planner + ExecutorStart + ExecutorRun + Parse + Result + INSERT INTO pg_tracing_test(a, b) VALUES (a, b) + Post Parse + Planner + ExecutorStart + ExecutorRun + Insert on pg_tracing_test + Result + ExecutorFinish + ExecutorEnd + ExecutorFinish + ExecutorEnd +(19 rows) + diff --git a/contrib/pg_tracing/expected/parallel.out b/contrib/pg_tracing/expected/parallel.out new file mode 100644 index 0000000000..d5cc1b7bd8 --- /dev/null +++ b/contrib/pg_tracing/expected/parallel.out @@ -0,0 +1,55 @@ +begin; +-- encourage use of parallel plans +set local parallel_setup_cost=0; +set local parallel_tuple_cost=0; +set local min_parallel_table_scan_size=0; +set local max_parallel_workers_per_gather=2; +-- Trace parallel queries +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ select 1 from pg_class limit 1; + ?column? +---------- + 1 +(1 row) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ select 2 from pg_class limit 2; + ?column? +---------- + 2 + 2 +(2 rows) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-00'*/ select 3 from pg_class limit 3; + ?column? +---------- + 3 + 3 + 3 +(3 rows) + +commit; +-- Worker can take some additional time to end and report their spans +SELECT pg_sleep(0.2); + pg_sleep +---------- + +(1 row) + +-- Get root top span id +SELECT span_id as root_span_id from pg_tracing_peek_spans where span_type='Select' and trace_id=1 and parent_id=1 \gset +-- Get executor top span id +SELECT span_id as executor_span_id from pg_tracing_peek_spans where span_operation='ExecutorRun' and trace_id=1 and parent_id=:root_span_id \gset +-- Check the select spans that are attached to the root top span +SELECT trace_id, span_type, span_operation from pg_tracing_peek_spans where span_type='Select' and parent_id=:executor_span_id order by span_operation; + trace_id | span_type | span_operation +----------+-----------+---------------- + 1 | Select | Worker 0 + 1 | Select | Worker 1 +(2 rows) + +-- Check number of executor spans +SELECT count(*) from pg_tracing_consume_spans where span_type='Executor'; + count +------- + 24 +(1 row) + diff --git a/contrib/pg_tracing/expected/reset.out b/contrib/pg_tracing/expected/reset.out new file mode 100644 index 0000000000..2e0c903121 --- /dev/null +++ b/contrib/pg_tracing/expected/reset.out @@ -0,0 +1,13 @@ +-- Check reset is working +SELECT pg_tracing_reset(); + pg_tracing_reset +------------------ + +(1 row) + +SELECT traces from pg_tracing_info; + traces +-------- + 0 +(1 row) + diff --git a/contrib/pg_tracing/expected/sample.out b/contrib/pg_tracing/expected/sample.out new file mode 100644 index 0000000000..1e4dc33694 --- /dev/null +++ b/contrib/pg_tracing/expected/sample.out @@ -0,0 +1,162 @@ +-- Trace nothing +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 0.0; +-- Query with sampling flag +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000021-0000000000000021-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +-- Query without trace context +SELECT 1; + ?column? +---------- + 1 +(1 row) + +-- No spans should have been generated +select count(distinct(trace_id)) from pg_tracing_consume_spans; + count +------- + 0 +(1 row) + +-- Enable full sampling +SET pg_tracing.sample_rate = 1.0; +-- Generate queries with sampling flag on, off and no trace context +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000022-0000000000000022-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000023-0000000000000023-00'*/ SELECT 2; + ?column? +---------- + 2 +(1 row) + +SELECT 3; + ?column? +---------- + 3 +(1 row) + +SELECT 4; + ?column? +---------- + 4 +(1 row) + +-- Check number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 4 +(1 row) + +-- Check span order +select span_operation, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; + span_operation | parameters +----------------+------------ + Parse | + SELECT $1; | $1 = 1 + Post Parse | + Planner | + ExecutorStart | + ExecutorRun | + Result | + ExecutorFinish | + ExecutorEnd | + Parse | + SELECT $1; | $1 = 2 + Post Parse | + Planner | + ExecutorStart | + ExecutorRun | + Result | + ExecutorFinish | + ExecutorEnd | + Parse | + SELECT $1; | $1 = 3 + Post Parse | + Planner | + ExecutorStart | + ExecutorRun | + Result | + ExecutorFinish | + ExecutorEnd | + Parse | + SELECT $1; | $1 = 4 + Post Parse | + Planner | + ExecutorStart | + ExecutorRun | + Result | + ExecutorFinish | + ExecutorEnd | +(36 rows) + +-- Top spans should reuse generated ids and have trace_id = parent_id +select span_operation, parameters from pg_tracing_consume_spans where trace_id = parent_id order by span_start, span_start_ns; + span_operation | parameters +----------------+------------ + Parse | + SELECT $1; | $1 = 1 + Parse | + SELECT $1; | $1 = 2 + Parse | + SELECT $1; | $1 = 3 + Parse | + SELECT $1; | $1 = 4 +(8 rows) + +-- Only trace query with sampled flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; +-- Generate queries with sampling flag on, off and no trace context +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000024-0000000000000024-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000025-0000000000000025-00'*/ SELECT 2; + ?column? +---------- + 2 +(1 row) + +SELECT 1; + ?column? +---------- + 1 +(1 row) + +-- Check number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 2 +(1 row) + +-- Check span ordering +select span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns; + span_operation | parameters +-----------------------------------+------------ + Parse | + SET pg_tracing.sample_rate = 0.0; | + Post Parse | + ProcessUtility | + Parse | + Post Parse | + SELECT $1; | $1 = 1 + Planner | + ExecutorStart | + ExecutorRun | + Result | + ExecutorFinish | + ExecutorEnd | +(13 rows) + diff --git a/contrib/pg_tracing/expected/select.out b/contrib/pg_tracing/expected/select.out new file mode 100644 index 0000000000..3652304c5f --- /dev/null +++ b/contrib/pg_tracing/expected/select.out @@ -0,0 +1,350 @@ +-- Only trace queries with sample flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; +-- Run a simple query +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +-- Get top span id +SELECT span_id AS top_span_id from pg_tracing_peek_spans where parent_id=1 and span_type!='Parse' \gset +-- Check parameters +SELECT parameters from pg_tracing_peek_spans where span_id=:top_span_id; + parameters +------------ + $1 = 1 +(1 row) + +-- Check the number of children +SELECT count(*) from pg_tracing_peek_spans where parent_id=:'top_span_id'; + count +------- + 6 +(1 row) + +-- Check span_operation +SELECT span_type, span_operation from pg_tracing_peek_spans where trace_id=1 order by span_start, span_start_ns, span_operation; + span_type | span_operation +------------+---------------- + Parse | Parse + Post Parse | Post Parse + Select | SELECT $1; + Planner | Planner + Executor | ExecutorStart + Executor | ExecutorRun + Result | Result + Executor | ExecutorFinish + Executor | ExecutorEnd +(9 rows) + +-- Check count of query_id +SELECT count(distinct query_id) from pg_tracing_peek_spans where trace_id=1; + count +------- + 1 +(1 row) + +-- Check reported number of trace +SELECT traces from pg_tracing_info; + traces +-------- + 20 +(1 row) + +-- Trace a statement with function call +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ SELECT count(*) from current_database(); + count +------- + 1 +(1 row) + +-- Check the generated span span_type, span_operation and order of function call +SELECT span_type, span_operation from pg_tracing_consume_spans where trace_id=3 order by span_operation; + span_type | span_operation +--------------+------------------------------------------ + Aggregate | Aggregate + Executor | ExecutorEnd + Executor | ExecutorFinish + Executor | ExecutorRun + Executor | ExecutorStart + FunctionScan | FunctionScan on current_database + Parse | Parse + Planner | Planner + Post Parse | Post Parse + Select | SELECT count(*) from current_database(); +(10 rows) + +-- Trace a more complex query with multiple function calls +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000004-0000000000000004-01'*/ SELECT s.relation_size + s.index_size +FROM (SELECT + pg_relation_size(C.oid) as relation_size, + pg_indexes_size(C.oid) as index_size + FROM pg_class C) as s limit 1; + ?column? +---------- + 0 +(1 row) + +-- Check the generated span span_type, span_operation and order of query with multiple function calls +SELECT span_type, span_operation from pg_tracing_consume_spans where trace_id=4 order by span_operation; + span_type | span_operation +--------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------ + Executor | ExecutorEnd + Executor | ExecutorFinish + Executor | ExecutorRun + Executor | ExecutorStart + Limit | Limit + Parse | Parse + Planner | Planner + Post Parse | Post Parse + Select | SELECT s.relation_size + s.index_sizeFROM (SELECT pg_relation_size(C.oid) as relation_size, pg_indexes_size(C.oid) as index_size FROM pg_class C) as s limit $1; + SeqScan | SeqScan on pg_class c + SubqueryScan | SubqueryScan on s +(11 rows) + +-- Check that we're in a correct state after a timeout +set statement_timeout=200; +-- Trace query triggering a statement timeout +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000007-0000000000000007-01'*/ select * from pg_sleep(10); +ERROR: canceling statement due to statement timeout +-- Cleanup statement setting +set statement_timeout=0; +-- Trace a working query after the timeout to check we're in a consistent state +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000008-0000000000000008-01'*/ select 1; + ?column? +---------- + 1 +(1 row) + +-- Check the spans order and error code +SELECT span_type, span_operation, sql_error_code from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + span_type | span_operation | sql_error_code +--------------+-----------------------------+---------------- + Parse | Parse | 00000 + Post Parse | Post Parse | 00000 + Select | select * from pg_sleep($1); | 57014 + Planner | Planner | 00000 + Executor | ExecutorStart | 00000 + Executor | ExecutorRun | 57014 + FunctionScan | FunctionScan on pg_sleep | 57014 + Parse | Parse | 00000 + Post Parse | Post Parse | 00000 + Select | select $1; | 00000 + Planner | Planner | 00000 + Executor | ExecutorStart | 00000 + Executor | ExecutorRun | 00000 + Result | Result | 00000 + Executor | ExecutorFinish | 00000 + Executor | ExecutorEnd | 00000 +(16 rows) + +-- Create a prepare statement with comment passed as first parameter +PREPARE test_prepared (text, integer) AS /*$1*/ SELECT $2; +-- Execute prepare statement with trace context passed as a parameter +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000009-0000000000000009-01''', 1); + ?column? +---------- + 1 +(1 row) + +-- Check generated spans and order +SELECT trace_id, span_type, span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + trace_id | span_type | span_operation | parameters +----------+-----------+-----------------------------------------------------+---------------------------------------------------------------------------------------------------------------- + 9 | Select | PREPARE test_prepared (text, integer) AS SELECT $2; | $1 = 'dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000009-0000000000000009-01''', $2 = '1' + 9 | Planner | Planner | + 9 | Executor | ExecutorStart | + 9 | Executor | ExecutorRun | + 9 | Result | Result | + 9 | Executor | ExecutorFinish | + 9 | Executor | ExecutorEnd | +(7 rows) + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +-- Execute prepare statement with trace context passed as a parameter and generic plan +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000010-0000000000000010-01''', 10); + ?column? +---------- + 10 +(1 row) + +-- Cleanup +SET plan_cache_mode='auto'; +DEALLOCATE test_prepared; +-- Check spans are generated even through generic plan +SELECT trace_id, span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + trace_id | span_operation | parameters +----------+-----------------------------------------------------+------------ + 16 | ExecutorStart | + 16 | PREPARE test_prepared (text, integer) AS SELECT $2; | + 16 | ExecutorRun | + 16 | Result | + 16 | ExecutorFinish | + 16 | ExecutorEnd | +(6 rows) + +-- Run a statement with node not executed +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000a-000000000000000a-01'*/ select 1 limit 0; + ?column? +---------- +(0 rows) + +-- Not executed node should not generate any spans +SELECT trace_id, span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + trace_id | span_operation | parameters +----------+---------------------+---------------- + 10 | Parse | + 10 | Post Parse | + 10 | select $1 limit $2; | $1 = 1, $2 = 0 + 10 | Planner | + 10 | ExecutorStart | + 10 | ExecutorRun | + 10 | Limit | + 10 | ExecutorFinish | + 10 | ExecutorEnd | +(9 rows) + +-- Test multiple statements in a single query +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000011-0000000000000012-01'*/ select 1; select 2; + ?column? +---------- + 1 +(1 row) + + ?column? +---------- + 2 +(1 row) + +-- Not executed node should not generate any spans +select span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns; + span_operation | parameters +----------------+------------ + Parse | + select $1; | $1 = 1 + Post Parse | + Planner | + ExecutorStart | + ExecutorRun | + Result | + ExecutorFinish | + ExecutorEnd | +(9 rows) + +-- Check that parameters are not exported when disabled +SET pg_tracing.export_parameters=false; +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000b-000000000000000b-01'*/ select 1, 2, 3; + ?column? | ?column? | ?column? +----------+----------+---------- + 1 | 2 | 3 +(1 row) + +SELECT trace_id, span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + trace_id | span_operation | parameters +----------+--------------------+------------ + 11 | Parse | + 11 | Post Parse | + 11 | select $1, $2, $3; | + 11 | Planner | + 11 | ExecutorStart | + 11 | ExecutorRun | + 11 | Result | + 11 | ExecutorFinish | + 11 | ExecutorEnd | +(9 rows) + +-- Reset export parameters setting +SET pg_tracing.export_parameters=true; +-- Check the result of disabling deparse +SET pg_tracing.deparse_plan=false; +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000c-000000000000000c-01'*/ SELECT * from pg_tracing_test where a=1; + a | b +---+--- +(0 rows) + +SET pg_tracing.deparse_plan=true; +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000d-000000000000000d-01'*/ SELECT * from pg_tracing_test where a=1; + a | b +---+--- +(0 rows) + +SELECT trace_id, span_operation, deparse_info, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + trace_id | span_operation | deparse_info | parameters +----------+-------------------------------------------+-----------------------+------------ + 12 | Parse | | + 12 | Post Parse | | + 12 | SELECT * from pg_tracing_test where a=$1; | | $1 = 1 + 12 | Planner | | + 12 | ExecutorStart | | + 12 | ExecutorRun | | + 12 | BitmapHeapScan on pg_tracing_test | | + 12 | BitmapIndexScan on pg_tracing_index | | + 12 | ExecutorFinish | | + 12 | ExecutorEnd | | + 13 | Parse | | + 13 | Post Parse | | + 13 | SELECT * from pg_tracing_test where a=$1; | | $1 = 1 + 13 | Planner | | + 13 | ExecutorStart | | + 13 | ExecutorRun | | + 13 | BitmapHeapScan on pg_tracing_test | Recheck Cond: (a = 1) | + 13 | BitmapIndexScan on pg_tracing_index | Index Cond: (a = 1) | + 13 | ExecutorFinish | | + 13 | ExecutorEnd | | +(20 rows) + +-- Check multi statement query +SET pg_tracing.sample_rate = 1.0; +-- Force a multi-query statement with \; +SELECT 1\; SELECT 1, 2; + ?column? +---------- + 1 +(1 row) + + ?column? | ?column? +----------+---------- + 1 | 2 +(1 row) + +SELECT span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + span_operation | parameters +----------------+------------ + Parse | + Post Parse | + SELECT $1; | $1 = 1 + Planner | + ExecutorStart | + ExecutorRun | + Result | + ExecutorFinish | + ExecutorEnd | +(9 rows) + +-- Check standalone trace +SELECT 1; + ?column? +---------- + 1 +(1 row) + +-- Make sure we have unique span ids +select count(span_id) from pg_tracing_consume_spans group by span_id; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 +(9 rows) + +-- Cleanup +SET pg_tracing.sample_rate = 0.0; diff --git a/contrib/pg_tracing/expected/subxact.out b/contrib/pg_tracing/expected/subxact.out new file mode 100644 index 0000000000..a99f6d42ac --- /dev/null +++ b/contrib/pg_tracing/expected/subxact.out @@ -0,0 +1,73 @@ +-- Enable full sampling +SET pg_tracing.sample_rate = 1.0; +-- Start a transaction with subxaction +BEGIN; +SAVEPOINT s1; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s2; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s3; +SELECT 1; + ?column? +---------- + 1 +(1 row) + +COMMIT; +-- Check that subxact_count is correctly reported +select span_operation, parameters, subxact_count from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + span_operation | parameters | subxact_count +------------------------------------------------------------------+----------------------------+--------------- + Parse | | 0 + BEGIN; | | 0 + Post Parse | | 0 + ProcessUtility | | 0 + Parse | | 0 + Post Parse | | 0 + SAVEPOINT $1; | $1 = s1 | 0 + ProcessUtility | | 0 + Parse | | 0 + INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | $1 = 1, $2 = 2, $3 = 'aaa' | 0 + Post Parse | | 0 + Planner | | 0 + ExecutorStart | | 0 + ExecutorRun | | 0 + Insert on pg_tracing_test | | 1 + ProjectSet | | 1 + Result | | 1 + ExecutorFinish | | 1 + ExecutorEnd | | 1 + Parse | | 1 + Post Parse | | 1 + SAVEPOINT $1; | $1 = s2 | 1 + ProcessUtility | | 1 + Parse | | 1 + INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | $1 = 1, $2 = 2, $3 = 'aaa' | 1 + Post Parse | | 1 + Planner | | 1 + ExecutorStart | | 1 + ExecutorRun | | 1 + Insert on pg_tracing_test | | 2 + ProjectSet | | 2 + Result | | 2 + ExecutorFinish | | 2 + ExecutorEnd | | 2 + Parse | | 2 + Post Parse | | 2 + SAVEPOINT $1; | $1 = s3 | 2 + ProcessUtility | | 2 + Parse | | 2 + Post Parse | | 2 + SELECT $1; | $1 = 1 | 2 + Planner | | 2 + ExecutorStart | | 2 + ExecutorRun | | 2 + Result | | 2 + ExecutorFinish | | 2 + ExecutorEnd | | 2 + Parse | | 2 + COMMIT; | | 2 + Post Parse | | 2 + ProcessUtility | | 2 +(51 rows) + diff --git a/contrib/pg_tracing/expected/trigger.out b/contrib/pg_tracing/expected/trigger.out new file mode 100644 index 0000000000..8d9a8b1b8b --- /dev/null +++ b/contrib/pg_tracing/expected/trigger.out @@ -0,0 +1,271 @@ +-- Only trace queries with sample flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; +-- Create test tables +CREATE TABLE Employee ( + EmployeeId INT NOT NULL, + LastName VARCHAR(20) NOT NULL, + CONSTRAINT PK_Employee PRIMARY KEY (EmployeeId)); +CREATE TABLE Employee_Audit ( + EmployeeId INT NOT NULL, + LastName VARCHAR(20) NOT NULL, + EmpAdditionTime VARCHAR(20) NOT NULL); +-- Create test trigger +CREATE OR REPLACE FUNCTION employee_insert_trigger_fnc() + RETURNS trigger AS +$$ +BEGIN + INSERT INTO Employee_Audit (EmployeeId, LastName, EmpAdditionTime) + VALUES(NEW.EmployeeId,NEW.LastName,current_date); +RETURN NEW; +END; +$$ +LANGUAGE plpgsql; +-- Hook the trigger twice +CREATE TRIGGER employee_insert_trigger + AFTER INSERT + ON Employee + FOR EACH ROW + EXECUTE PROCEDURE employee_insert_trigger_fnc(); +CREATE TRIGGER employee_insert_trigger_2 + AFTER INSERT + ON Employee + FOR EACH ROW + EXECUTE PROCEDURE employee_insert_trigger_fnc(); +-- Call update +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO Employee VALUES(10,'Adams'); +-- +-- +--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- |A: Parse| B: INSERT INTO Employee | +-- +--------++----------+-+--------------------------+-----------------------------------------------------------------------------------------------------------------+---------------++ +-- |C: Planner| |D: ExecutorRun | G: ExecutorFinish |R: ExecutorEnd | +-- +----------+ ++-----------------------+-+--+--------+-------------------------------------------------------------+-+-------------------------------------+---------------+ +-- |E: Insert into employee| |H: Parse| I: INSERT INTO Employee_Audit... | | O: INSERT INTO Employee_Audit... | +-- ++---------+------------+ +--------+-+----------++----------------------------+---+--------------+ ++---------------+-----+--------------+ +-- |F: Result| |J: Planner||K: ExecutorRun |...|N: ExecutorEnd| |P: ExecutorRun | ... |Q: ExecutorEnd| +-- +---------+ +----------+++---------------------------+ +--------------+ ++--------------+ +--------------+ +-- |L: Insert on employee audit| | ... | +-- +-+---------+---------------+ +--------------+ +-- |M: Result| +-- +---------+ +-- Gather span_id, span start and span end of call with triggers +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=1 and span_type='Parse' \gset +SELECT span_id AS span_b_id, + get_span_start_ns(span_start, span_start_ns) as span_b_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_b_end + from pg_tracing_peek_spans where parent_id=1 and span_type!='Parse' \gset +SELECT span_id AS span_c_id, + get_span_start_ns(span_start, span_start_ns) as span_c_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_c_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Planner' \gset +SELECT span_id AS span_d_id, + get_span_start_ns(span_start, span_start_ns) as span_d_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_d_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Executor' and span_operation='ExecutorRun' \gset +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_d_id and span_type='Insert' \gset +SELECT span_id AS span_f_id, + get_span_start_ns(span_start, span_start_ns) as span_f_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_f_end + from pg_tracing_peek_spans where parent_id=:span_e_id and span_type='Result' \gset +-- Executor Finish, first trigger +SELECT span_id AS span_g_id, + get_span_start_ns(span_start, span_start_ns) as span_g_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_g_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Executor' and span_operation='ExecutorFinish' \gset +SELECT span_id AS span_h_id, + get_span_start_ns(span_start, span_start_ns) as span_h_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_h_end + from pg_tracing_peek_spans where parent_id=:span_g_id and span_type='Parse' \gset +SELECT span_id AS span_i_id, + get_span_start_ns(span_start, span_start_ns) as span_i_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_i_end + from pg_tracing_peek_spans where parent_id=:span_g_id and span_type='Insert' limit 1 \gset +SELECT span_id AS span_j_id, + get_span_start_ns(span_start, span_start_ns) as span_j_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_j_end + from pg_tracing_peek_spans where parent_id=:span_i_id and span_type='Planner' \gset +SELECT span_id AS span_k_id, + get_span_start_ns(span_start, span_start_ns) as span_k_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_k_end + from pg_tracing_peek_spans where parent_id=:span_i_id and span_type='Executor' and span_operation='ExecutorRun' \gset +SELECT span_id AS span_l_id, + get_span_start_ns(span_start, span_start_ns) as span_l_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_l_end + from pg_tracing_peek_spans where parent_id=:span_k_id and span_type='Insert' \gset +SELECT span_id AS span_m_id, + get_span_start_ns(span_start, span_start_ns) as span_m_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_m_end + from pg_tracing_peek_spans where parent_id=:span_l_id and span_type='Result' \gset +SELECT span_id AS span_n_id, + get_span_start_ns(span_start, span_start_ns) as span_n_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_n_end + from pg_tracing_peek_spans where parent_id=:span_i_id and span_type='Executor' and span_operation='ExecutorEnd' \gset +-- Executor Finish, second trigger +SELECT span_id AS span_o_id, + get_span_start_ns(span_start, span_start_ns) as span_o_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_o_end + from pg_tracing_peek_spans where parent_id=:span_g_id and span_type='Insert' limit 1 offset 1 \gset +SELECT span_id AS span_p_id, + get_span_start_ns(span_start, span_start_ns) as span_p_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_p_end + from pg_tracing_peek_spans where parent_id=:span_o_id and span_type='Executor' and span_operation='ExecutorRun' \gset +SELECT span_id AS span_q_id, + get_span_start_ns(span_start, span_start_ns) as span_q_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_q_end + from pg_tracing_peek_spans where parent_id=:span_o_id and span_type='Executor' and span_operation='ExecutorEnd' \gset +-- End +SELECT span_id AS span_r_id, + get_span_start_ns(span_start, span_start_ns) as span_r_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_r_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Executor' and span_operation='ExecutorEnd' \gset +-- Check that spans' start and end are within expection +SELECT +-- Root span + :span_a_end <= :span_b_start as parse_before_root_span, +-- Planner + :span_c_start >= :span_b_start as planner_starts_after_root_span, +-- ExecutorRun + :span_d_start >= :span_c_end as executor_run_starts_after_planner, + :span_d_end <= :span_g_start as executor_run_ends_before_finish, +-- First Insert + :span_e_start >= :span_d_start as insert_starts_after_executor_run, + :span_e_start <= :span_f_start as insert_ends_before_executor_run, + :span_e_end >= :span_f_end as insert_starts_before_child, + :span_e_end <= :span_d_end as insert_ends_after_child, +-- ExecutorFinish + :span_g_start <= :span_i_start as executor_finish_starts_before_child, + :span_g_end >= :span_o_end as executor_finish_ends_after_child, +-- First trigger + :span_h_end <= :span_i_start as parse_ends_before_first_trigger, + :span_i_end >= :span_n_end as first_trigger_ends_after_child, +-- Second trigger + :span_o_start >= :span_i_end as second_trigger_starts_after_first, + :span_o_end >= :span_q_end as second_trigger_ends_after_child, + :span_q_end <= :span_r_start as second_trigger_ends_before_executor_end, + :span_o_end <= :span_r_start as second_trigger_ends_before_root_executor_end, +-- Root ExecutorEnd + :span_r_start >= :span_g_end as executor_end_start_after_executor_finish, + :span_r_end <= :span_b_end as executor_end_ends_before_root; + parse_before_root_span | planner_starts_after_root_span | executor_run_starts_after_planner | executor_run_ends_before_finish | insert_starts_after_executor_run | insert_ends_before_executor_run | insert_starts_before_child | insert_ends_after_child | executor_finish_starts_before_child | executor_finish_ends_after_child | parse_ends_before_first_trigger | first_trigger_ends_after_child | second_trigger_starts_after_first | second_trigger_ends_after_child | second_trigger_ends_before_executor_end | second_trigger_ends_before_root_executor_end | executor_end_start_after_executor_finish | executor_end_ends_before_root +------------------------+--------------------------------+-----------------------------------+---------------------------------+----------------------------------+---------------------------------+----------------------------+-------------------------+-------------------------------------+----------------------------------+---------------------------------+--------------------------------+-----------------------------------+---------------------------------+-----------------------------------------+----------------------------------------------+------------------------------------------+------------------------------- + t | t | t | t | t | t | t | t | t | t | t | t | t | t | t | t | t | t +(1 row) + +-- Check span_operation +SELECT span_type, span_operation from pg_tracing_peek_spans where trace_id=1 order by span_start, span_start_ns, span_operation; + span_type | span_operation +------------+--------------------------------------------------------------------------------------------------------------------- + Parse | Parse + Insert | INSERT INTO Employee VALUES($1,$2); + Post Parse | Post Parse + Planner | Planner + Executor | ExecutorStart + Executor | ExecutorRun + Insert | Insert on employee + Result | Result + Executor | ExecutorFinish + Parse | Parse + Insert | INSERT INTO Employee_Audit (EmployeeId, LastName, EmpAdditionTime) VALUES(NEW.EmployeeId,NEW.LastName,current_date) + Post Parse | Post Parse + Planner | Planner + Executor | ExecutorStart + Executor | ExecutorRun + Insert | Insert on employee_audit + Result | Result + Executor | ExecutorFinish + Executor | ExecutorEnd + Insert | INSERT INTO Employee_Audit (EmployeeId, LastName, EmpAdditionTime) VALUES(NEW.EmployeeId,NEW.LastName,current_date) + Post Parse | Post Parse + Planner | Planner + Executor | ExecutorStart + Executor | ExecutorRun + Insert | Insert on employee_audit + Result | Result + Executor | ExecutorFinish + Executor | ExecutorEnd + Executor | ExecutorEnd +(29 rows) + +-- Check count of query_id +SELECT count(distinct query_id) from pg_tracing_consume_spans where trace_id=1; + count +------- + 2 +(1 row) + +-- Second call, parse should not be present anymore +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO Employee VALUES(11,'Adams'); +SELECT span_id AS span_b_id, + get_span_start_ns(span_start, span_start_ns) as span_b_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_b_end + from pg_tracing_peek_spans where parent_id=1 and span_type!='Parse' \gset +SELECT span_id AS span_d_id, + get_span_start_ns(span_start, span_start_ns) as span_d_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_d_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Executor' and span_operation='ExecutorRun' \gset +SELECT span_id AS span_g_id, + get_span_start_ns(span_start, span_start_ns) as span_g_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_g_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Executor' and span_operation='ExecutorFinish' \gset +-- Check we don't have Parse span anymore +SELECT count(*) = 0 from pg_tracing_peek_spans where parent_id=:span_g_id and span_type='Parse'; + ?column? +---------- + t +(1 row) + +-- Insert should be attached to the correct parent +SELECT count(*) = 1 from pg_tracing_peek_spans where parent_id=:span_g_id and span_type='Insert' limit 1 \gset +invalid variable name: "?column?" +-- Check that ExecutorRun ends before ExecutorFinish start +SELECT :span_d_end <= :span_g_start as finish_start_after_run; + finish_start_after_run +------------------------ + t +(1 row) + +-- Check generated spans for the second call with trigger +SELECT span_type, span_operation from pg_tracing_peek_spans where trace_id=1 order by span_start, span_start_ns, span_operation; + span_type | span_operation +------------+--------------------------------------------------------------------------------------------------------------------- + Parse | Parse + Insert | INSERT INTO Employee VALUES($1,$2); + Post Parse | Post Parse + Planner | Planner + Executor | ExecutorStart + Executor | ExecutorRun + Insert | Insert on employee + Result | Result + Executor | ExecutorFinish + Insert | INSERT INTO Employee_Audit (EmployeeId, LastName, EmpAdditionTime) VALUES(NEW.EmployeeId,NEW.LastName,current_date) + Planner | Planner + Executor | ExecutorStart + Executor | ExecutorRun + Insert | Insert on employee_audit + Result | Result + Executor | ExecutorFinish + Executor | ExecutorEnd + Insert | INSERT INTO Employee_Audit (EmployeeId, LastName, EmpAdditionTime) VALUES(NEW.EmployeeId,NEW.LastName,current_date) + Planner | Planner + Executor | ExecutorStart + Executor | ExecutorRun + Insert | Insert on employee_audit + Result | Result + Executor | ExecutorFinish + Executor | ExecutorEnd + Executor | ExecutorEnd +(26 rows) + +-- Check count of query_id +SELECT count(distinct query_id) from pg_tracing_consume_spans where trace_id=1; + count +------- + 2 +(1 row) + diff --git a/contrib/pg_tracing/expected/utility.out b/contrib/pg_tracing/expected/utility.out new file mode 100644 index 0000000000..c248bfa84e --- /dev/null +++ b/contrib/pg_tracing/expected/utility.out @@ -0,0 +1,243 @@ +-- Some helper functions +CREATE OR REPLACE FUNCTION get_span_start_ns(time_start timestamptz, start_ns smallint) RETURNS bigint AS +$BODY$ +BEGIN + RETURN extract(epoch from time_start) * 1000000000 + start_ns; +END; +$BODY$ +LANGUAGE plpgsql; +CREATE OR REPLACE FUNCTION get_span_end_ns(time_start timestamptz, start_ns smallint, duration bigint) RETURNS bigint AS +$BODY$ +BEGIN + RETURN get_span_start_ns(time_start, start_ns) + duration; +END; +$BODY$ +LANGUAGE plpgsql; +-- Create pg_tracing extension with sampling on +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; +-- This will create the following spans (non exhaustive list): +-- +-- +-------------------------------------------------------------------------------------+ +-- | A: Utility: Create Extension | +-- +-+-----------------------------------------------------------------------------------+ +-- +----------------------------------------------------------------------------------+ +-- |B: ProcessUtility: Create Extension | +-- +---+-----------------------------------+---+--------------------------------------+ +-- +-----------------------------------+ +-------------------------------------+ +-- |C: Utility: Create Function1 | |E: Utility: Create Function2 | +-- ++----------------------------------+ ++-----------------------------------++ +-- +----------------------------------+ +-----------------------------------+ +-- |D: ProcessUtility: Create Function| |F: ProcessUtility: Create Function2| +-- +----------------------------------+ +-----------------------------------+ +-- Extract span_ids, start and end of those spans +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=1 and span_type='Utility' \gset +SELECT span_id AS span_b_id, + get_span_start_ns(span_start, span_start_ns) as span_b_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_b_end + from pg_tracing_peek_spans where parent_id=:span_a_id and span_type='ProcessUtility' \gset +SELECT span_id AS span_c_id, + get_span_start_ns(span_start, span_start_ns) as span_c_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_c_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Utility' limit 1 \gset +SELECT span_id AS span_d_id, + get_span_start_ns(span_start, span_start_ns) as span_d_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_d_end + from pg_tracing_peek_spans where parent_id=:span_c_id and span_type='ProcessUtility' \gset +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Utility' limit 1 offset 1 \gset +-- Check that the start and end of those spans are within expectation +SELECT :span_a_start < :span_b_start AS span_a_start_first, + :span_a_end >= :span_b_end AS span_a_end_last, + :span_d_end <= :span_c_end AS nested_span_ends_before_parent, + :span_c_end <= :span_e_start AS next_utility_start_after; + span_a_start_first | span_a_end_last | nested_span_ends_before_parent | next_utility_start_after +--------------------+-----------------+--------------------------------+-------------------------- + t | t | t | t +(1 row) + +-- Clean current spans +select count(*) from pg_tracing_consume_spans; + count +------- + 47 +(1 row) + +-- +-- Test that no utility is captured with track_utility off +-- +-- Set utility off +SET pg_tracing.track_utility = off; +-- Prepare and execute a prepared statement +PREPARE test_prepared_one_param (integer) AS SELECT $1; +EXECUTE test_prepared_one_param(100); + ?column? +---------- + 100 +(1 row) + +-- Nothing should be generated +select count(*) from pg_tracing_consume_spans; + count +------- + 0 +(1 row) + +-- Force a query to start from ExecutorRun +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + ?column? +---------- + 200 +(1 row) + +-- Again, nothing should be generated +select count(*) from pg_tracing_consume_spans; + count +------- + 0 +(1 row) + +-- +-- Test that no utility is captured with track_utility off +-- +-- Enable utility tracking and track everything +SET pg_tracing.track_utility = on; +SET pg_tracing.sample_rate = 1.0; +-- Prepare and execute a prepared statement +PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; +EXECUTE test_prepared_one_param_2(100); + ?column? +---------- + 100 +(1 row) + +-- Check the number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 2 +(1 row) + +-- Check that the spans are in correct order +select span_operation, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; + span_operation | parameters +-----------------------------------------------------------+------------ + Parse | + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + Post Parse | + Parse | + ProcessUtility | + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + Post Parse | + Parse | + Post Parse | + EXECUTE test_prepared_one_param_2(100); | + ProcessUtility | + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + Planner | + ExecutorStart | + ExecutorRun | + Result | + ExecutorFinish | + ExecutorEnd | +(18 rows) + +-- Check the top span (standalone top span has trace_id=parent_id) +select span_operation, parameters from pg_tracing_consume_spans where trace_id = parent_id order by span_start, span_start_ns; + span_operation | parameters +-----------------------------------------------------------+------------ + Parse | + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + Parse | + EXECUTE test_prepared_one_param_2(100); | +(4 rows) + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + ?column? +---------- + 200 +(1 row) + +-- Check the number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 2 +(1 row) + +-- Check that the spans are in correct order +select span_operation, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; + span_operation | parameters +---------------------------------------------------------+------------ + Parse | + SET plan_cache_mode='force_generic_plan'; | + Post Parse | + ProcessUtility | + Parse | + EXECUTE test_prepared_one_param(200); | + Post Parse | + ProcessUtility | + PREPARE test_prepared_one_param (integer) AS SELECT $1; | + ExecutorStart | + ExecutorRun | + Result | + ExecutorFinish | + ExecutorEnd | +(14 rows) + +-- Check the top span (standalone top span has trace_id=parent_id) +select span_operation, parameters from pg_tracing_consume_spans where trace_id = parent_id order by span_start, span_start_ns; + span_operation | parameters +-------------------------------------------+------------ + Parse | + SET plan_cache_mode='force_generic_plan'; | + Parse | + EXECUTE test_prepared_one_param(200); | +(4 rows) + +-- Second create extension should generate an error that is captured by span +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; +ERROR: extension "pg_tracing" already exists +select span_operation, parameters, sql_error_code from pg_tracing_consume_spans order by span_start, span_start_ns; + span_operation | parameters | sql_error_code +------------------------------+------------+---------------- + Parse | | 00000 + CREATE EXTENSION pg_tracing; | | 42710 + Post Parse | | 00000 + ProcessUtility | | 42710 +(4 rows) + +-- Create test table +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE TABLE pg_tracing_test (a int, b char(20)); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ CREATE INDEX pg_tracing_index ON pg_tracing_test (a); +-- Check create table and index spans +select trace_id, span_type, span_operation from pg_tracing_consume_spans order by span_start, span_start_ns, duration desc; + trace_id | span_type | span_operation +----------+----------------+------------------------------------------------------- + 1 | Parse | Parse + 1 | Utility | CREATE TABLE pg_tracing_test (a int, b char(20)); + 1 | Post Parse | Post Parse + 1 | ProcessUtility | ProcessUtility + 2 | Parse | Parse + 2 | Utility | CREATE INDEX pg_tracing_index ON pg_tracing_test (a); + 2 | Post Parse | Post Parse + 2 | ProcessUtility | ProcessUtility +(8 rows) + +-- Cleanup +SET plan_cache_mode='auto'; +CREATE OR REPLACE FUNCTION lazy_function(IN anyarray, OUT x anyelement, OUT n int) + RETURNS SETOF RECORD + LANGUAGE sql STRICT IMMUTABLE PARALLEL SAFE + AS 'select s from pg_catalog.generate_series(1, 1, 1) as g(s)'; +select lazy_function('{1,2,3}'::int[]); +ERROR: return type mismatch in function declared to return record +DETAIL: Final statement returns too few columns. +CONTEXT: SQL function "lazy_function" during startup diff --git a/contrib/pg_tracing/expected/wal.out b/contrib/pg_tracing/expected/wal.out new file mode 100644 index 0000000000..2f405d9b49 --- /dev/null +++ b/contrib/pg_tracing/expected/wal.out @@ -0,0 +1,97 @@ +-- Enable wal instrumentation +set pg_tracing.instrument_wal = true; +-- Generate queries with wal write +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; +-- Check WAL is generated for the above statements +SELECT trace_id, span_type, span_operation, deparse_info, + wal_records > 0 as wal_records, + wal_bytes > 0 as wal_bytes +FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + trace_id | span_type | span_operation | deparse_info | wal_records | wal_bytes +----------+-----------------+------------------------------------------------------------------+-----------------------+-------------+----------- + 1 | Parse | Parse | | | + 1 | Insert | INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | | t | t + 1 | Post Parse | Post Parse | | | + 1 | Planner | Planner | | f | f + 1 | Executor | ExecutorStart | | | + 1 | Executor | ExecutorRun | | | + 1 | Insert | Insert on pg_tracing_test | | t | t + 1 | ProjectSet | ProjectSet | | f | f + 1 | Result | Result | | f | f + 1 | Executor | ExecutorFinish | | | + 1 | Executor | ExecutorEnd | | | + 2 | Parse | Parse | | | + 2 | Post Parse | Post Parse | | | + 2 | Update | UPDATE pg_tracing_test SET b = $1 WHERE a > $2; | | t | t + 2 | Planner | Planner | | f | f + 2 | Executor | ExecutorStart | | | + 2 | Executor | ExecutorRun | | | + 2 | Update | Update on pg_tracing_test | | t | t + 2 | BitmapHeapScan | BitmapHeapScan on pg_tracing_test | Recheck Cond: (a > 7) | f | f + 2 | BitmapIndexScan | BitmapIndexScan on pg_tracing_index | Index Cond: (a > 7) | f | f + 2 | Executor | ExecutorFinish | | | + 2 | Executor | ExecutorEnd | | | + 3 | Parse | Parse | | | + 3 | Delete | DELETE FROM pg_tracing_test WHERE a > $1; | | t | t + 3 | Post Parse | Post Parse | | | + 3 | Planner | Planner | | f | f + 3 | Executor | ExecutorStart | | | + 3 | Executor | ExecutorRun | | | + 3 | Delete | Delete on pg_tracing_test | | t | t + 3 | BitmapHeapScan | BitmapHeapScan on pg_tracing_test | Recheck Cond: (a > 9) | f | f + 3 | BitmapIndexScan | BitmapIndexScan on pg_tracing_index | Index Cond: (a > 9) | f | f + 3 | Executor | ExecutorFinish | | | + 3 | Executor | ExecutorEnd | | | +(33 rows) + +-- Redo the same but without wal instrumentation +set pg_tracing.instrument_wal = false; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; +-- With wal instrumentation disabled, wal counters should be set to 0 +SELECT trace_id, span_type, span_operation, deparse_info, + wal_records = 0 as wal_records, + wal_bytes = 0 as wal_bytes +FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + trace_id | span_type | span_operation | deparse_info | wal_records | wal_bytes +----------+-----------------+------------------------------------------------------------------+-----------------------+-------------+----------- + 1 | Parse | Parse | | | + 1 | Insert | INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | | t | t + 1 | Post Parse | Post Parse | | | + 1 | Planner | Planner | | t | t + 1 | Executor | ExecutorStart | | | + 1 | Executor | ExecutorRun | | | + 1 | Insert | Insert on pg_tracing_test | | t | t + 1 | ProjectSet | ProjectSet | | t | t + 1 | Result | Result | | t | t + 1 | Executor | ExecutorFinish | | | + 1 | Executor | ExecutorEnd | | | + 2 | Parse | Parse | | | + 2 | Post Parse | Post Parse | | | + 2 | Update | UPDATE pg_tracing_test SET b = $1 WHERE a > $2; | | t | t + 2 | Planner | Planner | | t | t + 2 | Executor | ExecutorStart | | | + 2 | Executor | ExecutorRun | | | + 2 | Update | Update on pg_tracing_test | | t | t + 2 | BitmapHeapScan | BitmapHeapScan on pg_tracing_test | Recheck Cond: (a > 7) | t | t + 2 | BitmapIndexScan | BitmapIndexScan on pg_tracing_index | Index Cond: (a > 7) | t | t + 2 | Executor | ExecutorFinish | | | + 2 | Executor | ExecutorEnd | | | + 3 | Parse | Parse | | | + 3 | Delete | DELETE FROM pg_tracing_test WHERE a > $1; | | t | t + 3 | Post Parse | Post Parse | | | + 3 | Planner | Planner | | t | t + 3 | Executor | ExecutorStart | | | + 3 | Executor | ExecutorRun | | | + 3 | Delete | Delete on pg_tracing_test | | t | t + 3 | BitmapHeapScan | BitmapHeapScan on pg_tracing_test | Recheck Cond: (a > 9) | t | t + 3 | BitmapIndexScan | BitmapIndexScan on pg_tracing_index | Index Cond: (a > 9) | t | t + 3 | Executor | ExecutorFinish | | | + 3 | Executor | ExecutorEnd | | | +(33 rows) + +-- Cleanup +set pg_tracing.instrument_wal = true; diff --git a/contrib/pg_tracing/meson.build b/contrib/pg_tracing/meson.build new file mode 100644 index 0000000000..ca28d579b3 --- /dev/null +++ b/contrib/pg_tracing/meson.build @@ -0,0 +1,56 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +pg_tracing_sources = files( + 'pg_tracing.c', + 'pg_tracing_query_process.c', + 'pg_tracing_explain.c', + 'pg_tracing_parallel.c', + 'pg_tracing_span.c', +) + +if host_system == 'windows' + pg_tracing_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_tracing', + '--FILEDESC', 'pg_tracing - Add support for distributed tracing',]) +endif + +pg_tracing = shared_module('pg_tracing', + pg_tracing_sources, + kwargs: contrib_mod_args + { + 'dependencies': contrib_mod_args['dependencies'], + }, +) +contrib_targets += pg_tracing + +install_data( + 'pg_tracing.control', + 'pg_tracing--1.0.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'pg_tracing', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'utility', + 'select', + 'extended', + 'insert', + 'trigger', + 'sample', + 'subxact', + 'reset', + 'nested', + 'wal', + 'parallel', + 'cleanup' + ], + 'regress_args': ['--temp-config', files('pg_tracing.conf')], + # Disabled because these tests require + # "shared_preload_libraries=pg_tracing", which typical + # runningcheck users do not have (e.g. buildfarm clients). + 'runningcheck': false, + } +} diff --git a/contrib/pg_tracing/pg_tracing--1.0.sql b/contrib/pg_tracing/pg_tracing--1.0.sql new file mode 100644 index 0000000000..0970a403e8 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing--1.0.sql @@ -0,0 +1,102 @@ +/* contrib/pg_tracing/pg_tracing--1.0.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "CREATE EXTENSION pg_tracing" to load this file. \quit + +--- Define pg_tracing_info +CREATE FUNCTION pg_tracing_info( + OUT traces bigint, + OUT spans bigint, + OUT dropped_spans bigint, + OUT failed_truncates bigint, + OUT last_consume timestamp with time zone, + OUT stats_reset timestamp with time zone +) +RETURNS record +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT VOLATILE PARALLEL SAFE; + +CREATE FUNCTION pg_tracing_reset() +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C PARALLEL SAFE; + +CREATE FUNCTION pg_tracing_spans( + IN consume bool, + OUT trace_id bigint, + OUT parent_id bigint, + OUT span_id bigint, + OUT query_id bigint, + OUT span_type text, + OUT span_operation text, + OUT span_start timestamp with time zone, + OUT span_start_ns smallint, + OUT duration bigint, + OUT sql_error_code character(5), + OUT userid oid, + OUT dbid oid, + OUT pid int4, + OUT subxact_count smallint, + +-- Plan counters + OUT plan_startup_cost float8, + OUT plan_total_cost float8, + OUT plan_rows float8, + OUT plan_width int, + +-- Node Counters + OUT rows int8, + OUT nloops int8, + + OUT shared_blks_hit int8, + OUT shared_blks_read int8, + OUT shared_blks_dirtied int8, + OUT shared_blks_written int8, + + OUT local_blks_hit int8, + OUT local_blks_read int8, + OUT local_blks_dirtied int8, + OUT local_blks_written int8, + + OUT blk_read_time float8, + OUT blk_write_time float8, + + OUT temp_blks_read int8, + OUT temp_blks_written int8, + OUT temp_blk_read_time float8, + OUT temp_blk_write_time float8, + + OUT wal_records int8, + OUT wal_fpi int8, + OUT wal_bytes numeric, + + OUT jit_functions int8, + OUT jit_generation_time float8, + OUT jit_inlining_time float8, + OUT jit_optimization_time float8, + OUT jit_emission_time float8, + +-- Span node specific data + OUT startup bigint, -- First tuple + OUT parameters text, + OUT deparse_info text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C PARALLEL SAFE; + +CREATE VIEW pg_tracing_info AS + SELECT * FROM pg_tracing_info(); + +CREATE VIEW pg_tracing_peek_spans AS + SELECT * FROM pg_tracing_spans(false); + +CREATE VIEW pg_tracing_consume_spans AS + SELECT * FROM pg_tracing_spans(true); + +GRANT SELECT ON pg_tracing_info TO PUBLIC; +GRANT SELECT ON pg_tracing_peek_spans TO pg_read_all_stats; +GRANT SELECT ON pg_tracing_consume_spans TO pg_read_all_stats; + +-- Don't want this to be available to non-superusers. +REVOKE EXECUTE ON FUNCTION pg_tracing_reset() FROM PUBLIC; diff --git a/contrib/pg_tracing/pg_tracing.c b/contrib/pg_tracing/pg_tracing.c new file mode 100644 index 0000000000..617a418bc8 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.c @@ -0,0 +1,2560 @@ +/*------------------------------------------------------------------------- + * + * pg_tracing.c + * Generate spans for distributed tracing from SQL query + * + * Spans will only be generated for sampled queries. A query is sampled if: + * - It has a tracecontext propagated throught SQLCommenter and it passes the caller_sample_rate. + * - It has no SQLCommenter but the query randomly passes the global sample_rate + * + * A query with sqlcommenter will look like: /\*dddbs='postgres.db',traceparent='00-00000000000000000000000000000009-0000000000000005-01'*\/ select 1; + * The traceparent fields are detailed in https://www.w3.org/TR/trace-context/#traceparent-header-field-values + * 00000000000000000000000000000009: trace id + * 0000000000000005: parent id + * 01: trace flags (01 == sampled) + * + * If sampled, we will generate spans for the current statement. + * A span represents an operation with a start time, a duration and useful metadatas (block stats, wal stats...). + * We will track the following operations: + * - Top Span: The top span for a statement. They are created after extracting the traceid from traceparent or to represent a nested query. + * - Planner: We track the time spent in the planner and report the planner counters + * - Node Span: Created from planstate. The name is extracted from the node type (IndexScan, SeqScan). + * - Executor: We trace the different steps of the Executor: Start, Run, Finish and End + * + * A typical traced query will generate the following spans: + * +------------------------------------------------------------------------------------------------------------------------------------------------------+ + * | Type: Select (top span) | + * | Operation: Select * pgbench_accounts WHERE aid=$1; | + * +---+------------------------+-+------------------+--+------------------------------------------------------+-+--------------------+-+----------------++ + * | Type: Planner | | Type: Executor | |Type: Executor | | Type: Executor | | Type: Executor | + * | Operation: Planner | | Operation: Start | |Operation: Run | | Operation: Finish | | Operation: End | + * +------------------------+ +------------------+ +--+--------------------------------------------------++ +--------------------+ +----------------+ + * | Type: IndexScan | + * | Operation: IndexScan using pgbench_accounts_pkey | + * | on pgbench_accounts | + * +--------------------------------------------------+ + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pg_tracing.h" +#include "pg_tracing_explain.h" +#include "pg_tracing_parallel.h" +#include "pg_tracing_query_process.h" +#include "pg_tracing_span.h" + +#include "access/xact.h" +#include "common/pg_prng.h" +#include "commands/async.h" +#include "funcapi.h" +#include "nodes/extensible.h" +#include "optimizer/planner.h" +#include "parser/analyze.h" +#include "storage/ipc.h" +#include "tcop/utility.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/ruleutils.h" +#include "utils/xid8.h" + +PG_MODULE_MAGIC; + +typedef enum +{ + PG_TRACING_TRACK_NONE, /* track no statements */ + PG_TRACING_TRACK_TOP, /* only top level statements */ + PG_TRACING_TRACK_ALL /* all statements, including nested ones */ +} pgTracingTrackLevel; + +/* + * Structure to store flexible array of spans + */ +typedef struct pgTracingSpans +{ + int end; /* Index of last element */ + int max; /* Maximum number of element */ + Span spans[FLEXIBLE_ARRAY_MEMBER]; +} pgTracingSpans; + +/* + * A span with it's position in the shared buffer + */ +typedef struct pgTracingStoredSpan +{ + int index; /* Position of the span in the + * current_trace_spans buffer */ + Span *span; /* Pointer to the span. Might be invalid if + * current_trace_spans was repalloced */ +} pgTracingStoredSpan; + +/* GUC variables */ +static int pg_tracing_max_span; /* Maximum number of spans to store */ +static int pg_tracing_max_parameter_str; /* Maximum number of spans to + * store */ +static bool pg_tracing_trace_parallel_workers = true; /* True to generate + * spans from parallel + * workers */ +static int pg_tracing_initial_allocated_spans = 25; /* Number of spans + * allocated at the + * start of a trace at + * the same time. */ +static bool pg_tracing_instrument_buffers; /* Enable buffers instrumentation */ +static bool pg_tracing_instrument_wal; /* Enable WAL instrumentation */ +static double pg_tracing_sample_rate = 0; /* Sample rate applied to queries + * without SQLCommenter */ +static double pg_tracing_caller_sample_rate = 1; /* Sample rate applied to + * queries with + * SQLCommenter */ +static bool pg_tracing_export_parameters = true; /* Export query's + * parameters as span + * metadata */ +static bool pg_tracing_deparse_plan = true; /* Deparse plan to generate more + * detailed spans */ +static int pg_tracing_track = PG_TRACING_TRACK_ALL; /* tracking level */ +static bool pg_tracing_track_utility = true; /* whether to track utility + * commands */ +static bool pg_tracing_drop_full_buffer = true; /* whether to drop buffered + * spans when full */ +static char *pg_tracing_notify_channel = NULL; /* Name of the channel to + * notify when span buffer + * exceeds a provided + * threshold */ +static double pg_tracing_notify_threshold = 0.8; /* threshold for span + * buffer usage + * notification */ + +static const struct config_enum_entry track_options[] = +{ + {"none", PG_TRACING_TRACK_NONE, false}, + {"top", PG_TRACING_TRACK_TOP, false}, + {"all", PG_TRACING_TRACK_ALL, false}, + {NULL, 0, false} +}; + +#define pg_tracking_level(level) \ + ((pg_tracing_track == PG_TRACING_TRACK_ALL || \ + (pg_tracing_track == PG_TRACING_TRACK_TOP && (level) == 0))) + +#define pg_tracing_enabled(trace_context, level) \ + (trace_context->sampled && pg_tracking_level(level)) + +PG_FUNCTION_INFO_V1(pg_tracing_info); +PG_FUNCTION_INFO_V1(pg_tracing_spans); +PG_FUNCTION_INFO_V1(pg_tracing_reset); + +/* + * Global variables + */ + +/* Memory context for pg_tracing. */ +static MemoryContext pg_tracing_mem_ctx; + +/* trace context at the root level of parse/planning hook */ +static struct pgTracingTraceContext root_trace_context; + +/* trace context used in nested levels or within executor hooks */ +static struct pgTracingTraceContext current_trace_context; + +/* Latest trace id observed */ +uint64 latest_trace_id; + +/* Latest local transaction id traced */ +LocalTransactionId latest_lxid = InvalidLocalTransactionId; + +/* Shared state with stats and file external state */ +static pgTracingSharedState * pg_tracing = NULL; + +/* + * Shared buffer storing spans. Query with sampled flag will add new spans to + * the shared state at the end of the traced query. Those spans will be consumed during calls to + * pg_tracing_spans + */ +static pgTracingSpans * shared_spans = NULL; + +/* + * Store all ongoing spans for the curent trace. + * They will be added to the shared buffer at the end of query tracing. + * + * This buffer can be dynamically resized so care needs to be taken when using span pointers. + */ +static pgTracingSpans * current_trace_spans; + +/* +* Text for spans are buffered in this +* stringinfo and written at the end of the tracing in a single write. +*/ +static StringInfo current_trace_text; + +/* + * Maximum nested level for a query to know how many top spans we need to + * copy in shared_spans + */ +static int max_nested_level = -1; + +/* Current nesting depth of planner calls */ +static int plan_nested_level = 0; + +/* Current nesting depth of ExecutorRun+ProcessUtility calls */ +static int exec_nested_level = 0; + +/* Previous nested level observed during post parse */ +static int previous_parse_nested_level = 0; + +/* Number of allocated levels. */ +static int allocated_nested_level = 0; + +/* Possible start for a nested span */ +static int64 nested_query_start_ns = 0; + +/* Per nested level buffers */ +typedef struct pgTracingPerLevelBuffer +{ + int top_span_index; /* Index of the top span in the + * current_trace_spans buffer */ + int parse_span_index; /* Index of the parse span for this level */ + uint64 executor_run_id; /* Span id of executor run spans by nested + * level Executor run is used as parent + * for spans generated from planstate */ + uint64 query_id; /* Query id by nested level when available */ +} pgTracingPerLevelBuffer; + +static pgTracingPerLevelBuffer * per_level_buffers; + +/* Timestamp of the latest statement checked for sampling. */ +static TimestampTz last_statement_check_for_sampling = 0; + +static void pg_tracing_shmem_request(void); +static void pg_tracing_shmem_startup(void); + +/* Saved hook values in case of unload */ +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL; +static planner_hook_type prev_planner_hook = NULL; +static ExecutorStart_hook_type prev_ExecutorStart = NULL; +static ExecutorRun_hook_type prev_ExecutorRun = NULL; +static ExecutorFinish_hook_type prev_ExecutorFinish = NULL; +static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; +static ProcessUtility_hook_type prev_ProcessUtility = NULL; + +static void pg_tracing_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate); +static PlannedStmt *pg_tracing_planner_hook(Query *parse, + const char *query_string, + int cursorOptions, + ParamListInfo params); +static void pg_tracing_ExecutorStart(QueryDesc *queryDesc, int eflags); +static void pg_tracing_ExecutorRun(QueryDesc *queryDesc, + ScanDirection direction, + uint64 count, bool execute_once); +static void pg_tracing_ExecutorFinish(QueryDesc *queryDesc); +static void pg_tracing_ExecutorEnd(QueryDesc *queryDesc); +static void pg_tracing_ProcessUtility(PlannedStmt *pstmt, const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, QueryCompletion *qc); + +static void generate_member_nodes(PlanState **planstates, int nplans, planstateTraceContext * planstateTraceContext, uint64 parent_id); +static Span * generate_span_from_planstate(PlanState *planstate, planstateTraceContext * planstateTraceContext, uint64 parent_id); +static pgTracingStats get_empty_pg_tracing_stats(void); + +/* + * Module load callback + */ +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + DefineCustomIntVariable("pg_tracing.max_span", + "Maximum number of spans stored in shared memory.", + NULL, + &pg_tracing_max_span, + 5000, + 0, + 500000, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_tracing.max_parameter_size", + "Maximum size of parameters. -1 to disable parameter in top span.", + NULL, + &pg_tracing_max_parameter_str, + 1024, + 0, + 10000, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.instrument_buffers", + "Instrument and add buffers usage in spans.", + NULL, + &pg_tracing_instrument_buffers, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.instrument_wal", + "Instrument and add WAL usage in spans.", + NULL, + &pg_tracing_instrument_wal, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.trace_parallel_workers", + "Whether to generate samples from parallel workers.", + NULL, + &pg_tracing_trace_parallel_workers, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_tracing.initial_allocated_spans", + "The number of allocated spans at the start of a trace. A higher number can help avoid reallocation when extending the buffer.", + NULL, + &pg_tracing_initial_allocated_spans, + 25, + 0, + 500, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomEnumVariable("pg_tracing.track", + "Selects which statements are tracked by pg_tracing.", + NULL, + &pg_tracing_track, + PG_TRACING_TRACK_ALL, + track_options, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.track_utility", + "Selects whether utility commands are tracked by pg_tracing.", + NULL, + &pg_tracing_track_utility, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.drop_on_full_buffer", + "Selects whether buffered spans should be dropped when buffer is full.", + NULL, + &pg_tracing_drop_full_buffer, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomRealVariable("pg_tracing.sample_rate", + "Fraction of queries to process.", + NULL, + &pg_tracing_sample_rate, + 0.0, + 0.0, + 1.0, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomRealVariable("pg_tracing.caller_sample_rate", + "Fraction of queries with sampled flag to process.", + NULL, + &pg_tracing_caller_sample_rate, + 1.0, + 0.0, + 1.0, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.export_parameters", + "Export query's parameters as span metadata.", + NULL, + &pg_tracing_export_parameters, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.deparse_plan", + "Deparse query plan to generate details more details on a plan node.", + NULL, + &pg_tracing_deparse_plan, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomStringVariable("pg_tracing.notify_channel", + "Name of the channel to notify when span buffer reaches a provided threshold.", + NULL, + &pg_tracing_notify_channel, + NULL, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomRealVariable("pg_tracing.notify_threshold", + "When span buffer exceeds this threshold, a notification will be sent.", + NULL, + &pg_tracing_notify_threshold, + 0.8, + 0.0, + 1.0, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + MarkGUCPrefixReserved("pg_tracing"); + + /* For jumble state */ + EnableQueryId(); + + /* Install hooks. */ + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = pg_tracing_shmem_request; + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = pg_tracing_shmem_startup; + + prev_post_parse_analyze_hook = post_parse_analyze_hook; + post_parse_analyze_hook = pg_tracing_post_parse_analyze; + + prev_planner_hook = planner_hook; + planner_hook = pg_tracing_planner_hook; + + prev_ExecutorStart = ExecutorStart_hook; + ExecutorStart_hook = pg_tracing_ExecutorStart; + prev_ExecutorRun = ExecutorRun_hook; + ExecutorRun_hook = pg_tracing_ExecutorRun; + prev_ExecutorFinish = ExecutorFinish_hook; + ExecutorFinish_hook = pg_tracing_ExecutorFinish; + prev_ExecutorEnd = ExecutorEnd_hook; + ExecutorEnd_hook = pg_tracing_ExecutorEnd; + + prev_ProcessUtility = ProcessUtility_hook; + ProcessUtility_hook = pg_tracing_ProcessUtility; +} + +/* + * Estimate shared memory space needed. + */ +static Size +pg_tracing_memsize(void) +{ + Size size; + + /* pg_tracing shared state */ + size = sizeof(pgTracingSharedState); + /* span struct */ + size = add_size(size, sizeof(pgTracingSpans)); + /* the span variable array */ + size = add_size(size, mul_size(pg_tracing_max_span, sizeof(Span))); + /* and the parallel workers context */ + size = add_size(size, mul_size(max_worker_processes, sizeof(pgTracingParallelContext))); + + return size; +} + +/* + * shmem_request hook: request additional shared resources. We'll allocate + * or attach to the shared resources in pgss_shmem_startup(). + */ +static void +pg_tracing_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + RequestAddinShmemSpace(pg_tracing_memsize()); +} + +/* + * Allocate a new span in the current_trace_spans buffer. + * current_trace_spans may be extended if needed. + */ +static pgTracingStoredSpan +get_new_stored_span(void) +{ + pgTracingStoredSpan result; + + if (current_trace_spans->end + 1 >= current_trace_spans->max) + { + MemoryContext oldcxt; + + /* Need to extend. */ + /* TODO: Have a configurable limit? */ + current_trace_spans->max *= 2; + oldcxt = MemoryContextSwitchTo(pg_tracing_mem_ctx); + current_trace_spans = repalloc(current_trace_spans, current_trace_spans->max * sizeof(Span)); + MemoryContextSwitchTo(oldcxt); + } + result.index = current_trace_spans->end++; + result.span = ¤t_trace_spans->spans[result.index]; + return result; +} + +/* + * Get span from the provided span index + */ +static pgTracingStoredSpan +get_span_from_index(int span_index) +{ + pgTracingStoredSpan stored_span; + + Assert(span_index < current_trace_spans->end); + Assert(span_index != -1); + stored_span.index = span_index; + stored_span.span = ¤t_trace_spans->spans[span_index]; + return stored_span; +} + +/* + * Get the span index of the top span for a specific nested level + */ +static pgTracingStoredSpan +get_top_span_for_nested_level(const pgTracingTraceContext * trace_context, int nested_level) +{ + Assert(nested_level <= max_nested_level); + Assert(nested_level >= 0); + /* For the root level, the top span is stored in the trace context */ + if (nested_level == 0) + return get_span_from_index(trace_context->root_span_index); + return get_span_from_index(per_level_buffers[nested_level].top_span_index); +} + +/* + * Set top span index to the provided span index + */ +static void +set_top_span_for_nested_level(pgTracingTraceContext * trace_context, int nested_level, int top_span_index) +{ + Assert(nested_level <= max_nested_level); + Assert(top_span_index < current_trace_spans->max); + Assert(top_span_index >= 0); + if (nested_level == 0) + trace_context->root_span_index = top_span_index; + else + per_level_buffers[nested_level].top_span_index = top_span_index; +} + +/* + * End the span at the provided index + */ +static void +end_span_index(int span_index, const int64 *end_span_input) +{ + pgTracingStoredSpan stored_span; + + if (span_index < 0) + return; + stored_span = get_span_from_index(span_index); + end_span(stored_span.span, end_span_input); +} + +/* + * End top span for the provided nested level + */ +static void +end_top_span(const pgTracingTraceContext * trace_context, const int64 *end_span_ns) +{ + pgTracingStoredSpan top_span; + + if (exec_nested_level > max_nested_level) + return; + top_span = get_top_span_for_nested_level(trace_context, exec_nested_level); + if (top_span.index != -1 && !top_span.span->ended) + end_span(top_span.span, end_span_ns); +} + +/* + * Allocate a new span and set its start and end + */ +static int +create_full_span(const pgTracingTraceContext * trace_context, SpanType type, + uint64 parent_id, int64 start_span_ns, const int64 *end_span_ns) +{ + pgTracingStoredSpan stored_span = get_new_stored_span(); + + begin_span(trace_context, stored_span.span, type, parent_id, + per_level_buffers[exec_nested_level].query_id, + &start_span_ns); + end_span(stored_span.span, end_span_ns); + return stored_span.index; +} + +/* + * Add span to the shared memory. Mutex must be acquired beforehand. + */ +static void +add_span_to_shared_buffer_locked(const Span * span) +{ + /* Spans must be ended before adding them to the shared buffer */ + Assert(span->ended); + if (shared_spans->end + 1 >= shared_spans->max) + pg_tracing->stats.dropped_spans++; + else + { + pg_tracing->stats.spans++; + shared_spans->spans[shared_spans->end++] = *span; + } +} + +/* + * Check if we still have available space in the shared spans. + * + * Between the moment we check and the moment we want to insert, the buffer + * may be full but we redo a check before appending the span. This is done + * early when starting a top span to bail out early if the buffer is + * already full since we don't immediately add the span in the shared buffer. + */ +static bool +check_full_shared_spans() +{ + if (shared_spans->end + 1 >= shared_spans->max) + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + bool full_buffer = false; + + SpinLockAcquire(&s->mutex); + if (pg_tracing_drop_full_buffer) + { + s->stats.dropped_spans += shared_spans->end; + shared_spans->end = 0; + s->extent = 0; + } + else + { + full_buffer = true; + s->stats.dropped_spans++; + } + SpinLockRelease(&s->mutex); + return full_buffer; + } + return false; +} + +/* + * Reset trace_context fields + */ +static void +reset_trace_context(pgTracingTraceContext * trace_context) +{ + trace_context->sampled = 0; + trace_context->trace_id = 0; + trace_context->parent_id = 0; + trace_context->root_span_index = -1; + trace_context->start_trace.ts = 0; +} + +/* + * shmem_startup hook: allocate or attach to shared memory, Also create and + * load the query-texts file, which is expected to exist (even if empty) + * while the module is enabled. + */ +static void +pg_tracing_shmem_startup(void) +{ + bool found_pg_tracing; + bool found_shared_spans; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + /* reset in case this is a restart within the postmaster */ + pg_tracing = NULL; + + /* Create or attach to the shared memory state */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + reset_trace_context(&root_trace_context); + reset_trace_context(¤t_trace_context); + pg_tracing = ShmemInitStruct("PgTracing Shared", sizeof(pgTracingSharedState), &found_pg_tracing); + shared_spans = ShmemInitStruct("PgTracing Spans", + sizeof(pgTracingSpans) + pg_tracing_max_span * sizeof(Span), + &found_shared_spans); + + /* Initialize pg_tracing memory context */ + pg_tracing_mem_ctx = AllocSetContextCreate(TopMemoryContext, + "pg_tracing memory context", + ALLOCSET_DEFAULT_SIZES); + + /* Initialize shmem for trace propagation to parallel workers */ + pg_tracing_shmem_parallel_startup(); + + /* First time, let's init shared state */ + if (!found_pg_tracing) + { + pg_tracing->stats = get_empty_pg_tracing_stats(); + SpinLockInit(&pg_tracing->mutex); + } + if (!found_shared_spans) + { + shared_spans->end = 0; + shared_spans->max = pg_tracing_max_span; + } + LWLockRelease(AddinShmemInitLock); +} + +/* + * Process a query descriptor: Gather all query instrumentation in the query + * span counters and generate span nodes from queryDesc planstate + */ +static void +process_query_desc(const pgTracingTraceContext * trace_context, QueryDesc *queryDesc, int sql_error_code) +{ + NodeCounters *node_counters = &get_top_span_for_nested_level(trace_context, exec_nested_level).span->node_counters; + + /* Process total counters */ + if (queryDesc->totaltime) + { + InstrEndLoop(queryDesc->totaltime); + node_counters->buffer_usage = queryDesc->totaltime->bufusage; + node_counters->wal_usage = queryDesc->totaltime->walusage; + } + /* Process JIT counter */ + if (queryDesc->estate->es_jit) + node_counters->jit_usage = queryDesc->estate->es_jit->instr; + node_counters->rows = queryDesc->estate->es_total_processed; + + /* Process planstate */ + if (queryDesc->planstate) + { + Bitmapset *rels_used = NULL; + planstateTraceContext planstateTraceContext; + uint64 parent_id = per_level_buffers[exec_nested_level].executor_run_id; + + planstateTraceContext.rtable_names = select_rtable_names_for_explain(queryDesc->plannedstmt->rtable, rels_used); + planstateTraceContext.trace_context = trace_context; + planstateTraceContext.ancestors = NULL; + planstateTraceContext.sql_error_code = sql_error_code; + /* Prepare the planstate context for deparsing */ + planstateTraceContext.deparse_ctx = NULL; + if (pg_tracing_deparse_plan) + planstateTraceContext.deparse_ctx = deparse_context_for_plan_tree(queryDesc->plannedstmt, planstateTraceContext.rtable_names); + + generate_span_from_planstate(queryDesc->planstate, &planstateTraceContext, parent_id); + } +} + +/* + * Create a trace id for the trace context if there's none. + * If trace was started from the global sample rate without + * a parent trace, we need to generate a random trace id. + */ +static void +set_trace_id(struct pgTracingTraceContext *trace_context) +{ + if (trace_context->trace_id != 0) + { + /* Update last lxid seen */ + latest_lxid = MyProc->lxid; + return; + } + + /* + * We want to keep the same trace id for all statements of the same + * transaction. For that, we check if we're in the same local xid. + */ + if (MyProc->lxid == latest_lxid) + { + trace_context->trace_id = latest_trace_id; + return; + } + + /* + * We leave parent_id to 0 as a way to indicate that this is a standalone + * trace. + */ + Assert(trace_context->parent_id == 0); + trace_context->trace_id = pg_prng_uint64(&pg_global_prng_state); + latest_trace_id = trace_context->trace_id; + latest_lxid = MyProc->lxid; +} + +/* + * Decide whether a query should be sampled depending on the traceparent sampled + * and the provided sample rate configurations + */ +static bool +apply_sample_rate(const struct pgTracingTraceContext *trace_context) +{ + double rand; + bool sampled; + + /* Everything is sampled */ + if (pg_tracing_sample_rate >= 1.0) + return true; + + /* No SQLCommenter sampled and no global sample rate */ + if (!trace_context->sampled && pg_tracing_sample_rate == 0.0) + return false; + + /* SQLCommenter sampled caller sample */ + if (trace_context->sampled && pg_tracing_caller_sample_rate >= 1.0) + return true; + + /* + * We have a non zero global sample rate or a sampled flag. Either way, we + * need a rand value + */ + rand = pg_prng_double(&pg_global_prng_state); + if (trace_context->sampled) + /* Sampled flag case */ + sampled = (rand < pg_tracing_caller_sample_rate); + else + /* Global rate case */ + sampled = (rand < pg_tracing_sample_rate); + return sampled; +} + +/* + * If the query was started as a prepared statement, we won't be able to + * extract traceparent during query parsing since parsing was skipped. + * + * We assume that SQLCommenter content can be passed as a text in the + * first parameter and try to extract the trace context from this first parameter. + */ +static void +extract_trace_context_from_parameter(struct pgTracingTraceContext *trace_context, ParamListInfo params) +{ + Oid typoutput; + bool typisvarlena; + char *pstring; + ParamExternData param; + + if (params == NULL || params->numParams == 0) + return; + + param = params->params[0]; + /* We only check the first parameter and if it's a text parameter */ + if (param.ptype != TEXTOID) + return; + + /* Get the text of the first string parameter */ + getTypeOutputInfo(param.ptype, &typoutput, &typisvarlena); + pstring = OidOutputFunctionCall(typoutput, param.value); + extract_trace_context_from_query(trace_context, pstring, true); +} + +/* + * Extract trace_context from either the query text or parameters. + * Sampling rate will be applied to the trace_context. + */ +static void +extract_trace_context(struct pgTracingTraceContext *trace_context, ParseState *pstate, const ParamListInfo params) +{ + TimestampTz statement_start_ts; + + /* Safety check... */ + if (!pg_tracing || !pg_tracking_level(exec_nested_level)) + return; + + /* sampling already started */ + if (trace_context->sampled) + return; + + /* Don't start tracing if we're not at the top */ + if (exec_nested_level > 0) + return; + + /* + * In a parallel worker, check the parallel context shared buffer to see + * if the leader left a trace context + */ + if (IsParallelWorker()) + { + if (pg_tracing_trace_parallel_workers) + fetch_parallel_context(trace_context); + return; + } + + Assert(trace_context->root_span_index == -1); + Assert(trace_context->trace_id == 0); + + if (params != NULL) + extract_trace_context_from_parameter(trace_context, params); + else if (pstate != NULL) + extract_trace_context_from_query(trace_context, pstate->p_sourcetext, false); + + /* + * A statement can go through this function 3 times: post parsing, planner + * and executor run. If no sampling flag was extracted from SQLCommenter + * and we've already seen this statement (using statement_start ts), don't + * try to apply sample rate and exit. + */ + statement_start_ts = GetCurrentStatementStartTimestamp(); + if (trace_context->sampled == 0 && last_statement_check_for_sampling == statement_start_ts) + return; + /* First time we see this statement, save the time */ + last_statement_check_for_sampling = statement_start_ts; + + trace_context->sampled = apply_sample_rate(trace_context); + + if (trace_context->sampled && check_full_shared_spans()) + /* Buffer is full, abort sampling */ + trace_context->sampled = 0; + + if (trace_context->sampled) + set_trace_id(trace_context); + else + /* No sampling, reset the context */ + reset_trace_context(trace_context); +} + +/* + * Adjust span's offsets with the provided file offset + */ +static void +adjust_file_offset(Span * span, Size file_position) +{ + if (span->node_type_offset != -1) + span->node_type_offset += file_position; + if (span->operation_name_offset != -1) + span->operation_name_offset += file_position; + if (span->deparse_info_offset != -1) + span->deparse_info_offset += file_position; + if (span->parameter_offset != -1) + span->parameter_offset += file_position; +} + +/* + * Reset pg_tracing memory context and global state. + */ +static void +cleanup_tracing(void) +{ + if (!root_trace_context.sampled && !current_trace_context.sampled) + /* No need for cleaning */ + return; + if (pg_tracing_trace_parallel_workers) + remove_parallel_context(); + MemoryContextReset(pg_tracing_mem_ctx); + reset_trace_context(&root_trace_context); + reset_trace_context(¤t_trace_context); + max_nested_level = -1; + previous_parse_nested_level = 0; + nested_query_start_ns = 0; + current_trace_spans = NULL; + per_level_buffers = NULL; +} + +/* + * End the query tracing and dump all spans in the shared buffer. + * This may happen either when query is finished or on a caught error. + */ +static void +end_tracing(void) +{ + Size file_position = 0; + int start_spans_number; + int end_spans_number; + + /* We're still a nested query, tracing is not finished */ + if (exec_nested_level + plan_nested_level > 0) + return; + + /* Dump all buffered texts in file */ + text_store_file(pg_tracing, current_trace_text->data, current_trace_text->len, &file_position); + + for (int i = 0; i < current_trace_spans->end; i++) + adjust_file_offset(current_trace_spans->spans + i, file_position); + + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + start_spans_number = shared_spans->end; + /* We're at the end, add all spans to the shared memory */ + for (int i = 0; i < current_trace_spans->end; i++) + add_span_to_shared_buffer_locked(¤t_trace_spans->spans[i]); + end_spans_number = shared_spans->end; + SpinLockRelease(&s->mutex); + } + + if (pg_tracing_notify_channel != NULL && !IsParallelWorker()) + { + int span_threshold = pg_tracing_max_span * pg_tracing_notify_threshold; + + if (start_spans_number < span_threshold && end_spans_number >= span_threshold) + /* We've crossed the threshold, send a notification */ + Async_Notify(pg_tracing_notify_channel, NULL); + } + + /* We can reset the memory context here */ + cleanup_tracing(); +} + +/* + * When we catch an error (timeout, cancel query), we need to flag the ongoing + * span with an error, send current spans in the shared buffer and clean + * our memory context. + * + * We provide the ongoing span index where the error was caught to attach the + * sql error code to it. + */ +static void +handle_pg_error(const pgTracingTraceContext * trace_context, + int span_index, QueryDesc *queryDesc, + const int64 *end_span_ns) +{ + pgTracingStoredSpan ongoing_span; + pgTracingStoredSpan top_span; + int sql_error_code; + + /* If we're not sampling the query, bail out */ + if (!pg_tracing_enabled(trace_context, exec_nested_level)) + return; + sql_error_code = geterrcode(); + + /* + * Order matters there. We want to process query desc before getting the + * end time. Otherwise, the span nodes will have a higher duration than + * their parents. + */ + if (queryDesc != NULL) + process_query_desc(trace_context, queryDesc, sql_error_code); + + /* Assign the error code to the latest span */ + ongoing_span = get_span_from_index(span_index); + ongoing_span.span->sql_error_code = sql_error_code; + + /* Assign the error code to the latest top span */ + top_span = get_top_span_for_nested_level(trace_context, exec_nested_level); + top_span.span->sql_error_code = sql_error_code; + + /* End all ongoing spans */ + for (int i = 0; i < current_trace_spans->end; i++) + { + ongoing_span = get_span_from_index(i); + if (!ongoing_span.span->ended) + end_span(ongoing_span.span, end_span_ns); + } + + end_tracing(); +} + +/* + * Get the parent id for the given nested level + * + * nested_level can be negative when creating the root top span. + * In this case, we use the parent id from the propagated trace context. + */ +static uint64 +get_parent_id(const pgTracingTraceContext * trace_context, int nested_level) +{ + pgTracingStoredSpan top_span; + + if (nested_level < 0) + return trace_context->parent_id; + Assert(nested_level <= allocated_nested_level); + top_span = get_top_span_for_nested_level(trace_context, nested_level); + Assert(top_span.span->span_id > 0); + return top_span.span->span_id; +} + +/* + * If we enter a new nested level, initialize all necessary buffers + * and start_trace timer. + * + * The start of a top span can vary: prepared statement will skip parsing, the use of cached + * plans will skip the planner hook. + * Thus, a top span can start in either post parse, planner hook or executor run. + */ +static bool +initialize_trace_level(pgTracingTraceContext * trace_context, bool new_root_span) +{ + /* + * Parallel workers will get the referential timers from the leader + * parallel context + */ + if (new_root_span && !IsParallelWorker()) + { + Assert(trace_context->start_trace.ts == 0); + /* Start referential timers */ + trace_context->start_trace.ts = GetCurrentStatementStartTimestamp(); + + /* + * We can't get the monotonic clock at the start of the statement so + * let's estimate it here. + */ + trace_context->start_trace.ns = get_current_ns() - (GetCurrentTimestamp() - trace_context->start_trace.ts) * NS_PER_US; + } + + /* Check if we've already created a top span for this nested level */ + if (exec_nested_level <= max_nested_level) + return false; + + /* First time */ + if (max_nested_level == -1) + { + MemoryContext oldcxt; + + Assert(pg_tracing_mem_ctx->isReset); + + /* + * We need to be able to pass information that depends on the nested + * level. + * + * executor span ids: Since an executor run becomes the parent span, + * we need subsequent created node spans to have the correct parent. + * + * query ids: used to propagate queryId to all spans in the same level + * + * top spans: We create one top span per nested level and those are + * only inserted in the shared buffer at the end + */ + oldcxt = MemoryContextSwitchTo(pg_tracing_mem_ctx); + /* initial allocation */ + allocated_nested_level = 1; + per_level_buffers = palloc0(allocated_nested_level * sizeof(pgTracingPerLevelBuffer)); + /* 0 is a valid index, initialise all indexes to -1 */ + for (int i = 0; i < allocated_nested_level; i++) + per_level_buffers[i].top_span_index = -1; + current_trace_spans = palloc0(pg_tracing_initial_allocated_spans * sizeof(Span)); + current_trace_spans->max = pg_tracing_initial_allocated_spans; + current_trace_text = makeStringInfo(); + + MemoryContextSwitchTo(oldcxt); + } + else if (exec_nested_level >= allocated_nested_level) + { + /* New nested level, allocate more memory */ + MemoryContext oldcxt; + int old_allocated_nested_level = allocated_nested_level; + + oldcxt = MemoryContextSwitchTo(pg_tracing_mem_ctx); + allocated_nested_level++; + per_level_buffers = repalloc0(per_level_buffers, old_allocated_nested_level * sizeof(pgTracingPerLevelBuffer), allocated_nested_level * sizeof(pgTracingPerLevelBuffer)); + for (int i = old_allocated_nested_level; i < allocated_nested_level; i++) + per_level_buffers[i].top_span_index = -1; + MemoryContextSwitchTo(oldcxt); + } + + max_nested_level = exec_nested_level; + if (new_root_span) + { + /* Start of a new trace */ + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->stats.traces++; + SpinLockRelease(&s->mutex); + } + return true; +} + +/* + * Add a new string to the current_trace_text stringinfo + */ +static int +add_str_to_trace_buffer(const char *str, int str_len) +{ + int position = current_trace_text->cursor; + + Assert(str_len > 0); + + appendBinaryStringInfo(current_trace_text, str, str_len); + appendStringInfoChar(current_trace_text, '\0'); + current_trace_text->cursor = current_trace_text->len; + return position; +} + +/* + * Add the worker name to the current_trace_text stringinfo + */ +static int +add_worker_name_to_trace_buffer(void) +{ + int position = current_trace_text->cursor; + + appendStringInfo(current_trace_text, "Worker %d", ParallelWorkerNumber); + appendStringInfoChar(current_trace_text, '\0'); + current_trace_text->cursor = current_trace_text->len; + return position; +} + +/* + * Start a new top span if we've entered a new nested level + * or if the previous span at the same level ended + */ +static void +begin_top_span(const pgTracingTraceContext * trace_context, + Span * top_span, CmdType commandType, + const Query *query, const JumbleState *jstate, + const PlannedStmt *pstmt, + const char *query_text, int64 start_time_ns) +{ + int query_len; + const char *normalised_query; + + /* in case of a cached plan, query might be unavailable */ + if (query != NULL) + per_level_buffers[exec_nested_level].query_id = query->queryId; + else if (trace_context->query_id > 0) + per_level_buffers[exec_nested_level].query_id = trace_context->query_id; + + begin_span(trace_context, top_span, command_type_to_span_type(commandType), + get_parent_id(trace_context, exec_nested_level - 1), + per_level_buffers[exec_nested_level].query_id, &start_time_ns); + + if (IsParallelWorker()) + { + /* + * In a parallel worker, we use the worker name as the span's + * operation + */ + top_span->operation_name_offset = add_worker_name_to_trace_buffer(); + return; + } + + if (jstate && jstate->clocations_count > 0 && query != NULL) + { + /* jstate is available, normalise query and extract parameters' values */ + char *param_str; + int param_len; + + query_len = query->stmt_len; + normalised_query = normalise_query_parameters(jstate, query_text, + query->stmt_location, + &query_len, ¶m_str, ¶m_len); + Assert(param_len > 0); + if (pg_tracing_export_parameters) + top_span->parameter_offset = add_str_to_trace_buffer(param_str, param_len); + } + else + { + /* + * No jstate available, normalise query but we won't be able to + * extract parameters + */ + int stmt_location; + + if (query != NULL && query->stmt_len > 0) + { + query_len = query->stmt_len; + stmt_location = query->stmt_location; + } + else if (pstmt != NULL && pstmt->stmt_location != -1 && pstmt->stmt_len > 0) + { + query_len = pstmt->stmt_len; + stmt_location = pstmt->stmt_location; + } + else + { + query_len = strlen(query_text); + stmt_location = 0; + } + normalised_query = normalise_query(query_text, stmt_location, &query_len); + } + top_span->operation_name_offset = add_str_to_trace_buffer(normalised_query, query_len); +} + +/* + * Initialise buffers if we are in a new nested level and start associated top span. + * If the top span already exists for the current nested level, this has no effect. + * + * This needs to be called every time a top span could be started: post parse, planner, executor start and process utility + */ +static void +initialize_trace_level_and_top_span(pgTracingTraceContext * trace_context, CmdType commandType, + Query *query, JumbleState *jstate, const PlannedStmt *pstmt, + const char *query_text, int64 start_time_ns) +{ + bool new_allocated_level; + pgTracingStoredSpan top_span; + bool new_root_span = trace_context->root_span_index == -1; + + new_allocated_level = initialize_trace_level(trace_context, new_root_span); + + if (!new_allocated_level && !new_root_span) + { + /* top span must exists, get it */ + top_span = get_top_span_for_nested_level(trace_context, exec_nested_level); + + /* + * It's possible to have multiple top spans within a nested query. If + * the previous top span was closed, we need to start a new one + */ + if (!top_span.span->ended) + return; + } + + /* current_trace_spans buffer should have been allocated */ + Assert(current_trace_spans != NULL); + + /* We need to create a new top span */ + top_span = get_new_stored_span(); + begin_top_span(trace_context, top_span.span, commandType, query, jstate, pstmt, query_text, start_time_ns); + set_top_span_for_nested_level(trace_context, exec_nested_level, top_span.index); + + return; +} + +/* + * Post-parse-analyze hook + * + * Tracing can be started here if: + * - The query has a SQLCommenter with traceparent parameter with sampled flag on and passes the caller_sample_rate + * - The query passes the sample_rate + * + * If query is sampled, create a post parse span and start the top span. + */ +static void +pg_tracing_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate) +{ + int64 start_post_parse_ns; + uint64 parent_id; + pgTracingTraceContext *trace_context = &root_trace_context; + bool new_lxid = MyProc->lxid != latest_lxid; + + if (!pg_tracing_mem_ctx->isReset && new_lxid && exec_nested_level + plan_nested_level == 0) + + /* + * Some errors can happen outside of our PG_TRY (incorrect number of + * bind parameters for example) which will leave a dirty trace + * context. We can also have sampled individual Parse command through + * extended protocol. In this case, we just drop the spans and reset + * memory context + */ + cleanup_tracing(); + + if (exec_nested_level + plan_nested_level == 0) + + /* + * At the root level, clean any leftover state of previous trace + * context + */ + reset_trace_context(&root_trace_context); + else + /* We're in a nested query, grab the ongoing trace_context */ + trace_context = ¤t_trace_context; + + if (prev_post_parse_analyze_hook) + prev_post_parse_analyze_hook(pstate, query, jstate); + + /* If disabled, don't trace utility statement */ + if (query->utilityStmt && !pg_tracing_track_utility) + return; + + /* Evaluate if query is sampled or not */ + extract_trace_context(trace_context, pstate, NULL); + + if (!trace_context->sampled) + /* Query is not sampled, nothing to do */ + return; + + /* + * We want to avoid calling get_ns at the start of post parse as it will + * impact all queries and we will only use it when the query is sampled. + */ + start_post_parse_ns = get_current_ns(); + + /* + * Either we're inside a nested sampled query or we've parsed a query with + * the sampled flag, start a new level with a top span + */ + initialize_trace_level_and_top_span(trace_context, query->commandType, query, + jstate, NULL, pstate->p_sourcetext, start_post_parse_ns); + + parent_id = get_parent_id(trace_context, exec_nested_level); + + /* + * We only create parse span when we have a good idea of when parsing + * started. Since multiple queries can be parsed at once when entering a + * new nested level, we only create a parse span when we detect a change + * in parse nested level. + */ + if ((exec_nested_level + plan_nested_level == 0) || (nested_query_start_ns > 0 && previous_parse_nested_level != exec_nested_level)) + { + int64 start_parse_span; + int parse_span_index; + + /* + * We don't have a precise time for parse end, estimate it + * + * TODO: Can we get a more precise start of parse? + */ + int64 end_parse_ns = start_post_parse_ns; + + if (nested_query_start_ns > 0 && (exec_nested_level + plan_nested_level) > 0) + start_parse_span = nested_query_start_ns; + else + start_parse_span = trace_context->start_trace.ns; + + parse_span_index = create_full_span(trace_context, SPAN_PARSE, + get_parent_id(trace_context, exec_nested_level - 1), + start_parse_span, &end_parse_ns); + per_level_buffers[exec_nested_level].parse_span_index = parse_span_index; + } + + /* Post parse span */ + create_full_span(trace_context, SPAN_POST_PARSE, + parent_id, start_post_parse_ns, NULL); + + /* keep track of the previous parse level */ + previous_parse_nested_level = exec_nested_level; +} + +/* + * Planner hook + * Tracing can start here if the query skipped parsing (prepared statement) and passes the random sample_rate. + * If query is sampled, create a plan span and start the top span if not already started. + */ +static PlannedStmt * +pg_tracing_planner_hook(Query *query, + const char *query_string, + int cursorOptions, + ParamListInfo params) +{ + PlannedStmt *result; + pgTracingStoredSpan span_planner; + pgTracingTraceContext *trace_context = &root_trace_context; + int64 start_span_ns; + + if (exec_nested_level > 0) + /* We're in a nested query, grab the ongoing trace_context */ + trace_context = ¤t_trace_context; + + /* Evaluate if query is sampled or not */ + extract_trace_context(trace_context, NULL, params); + + if (!pg_tracing_enabled(trace_context, plan_nested_level + exec_nested_level)) + { + /* No sampling */ + if (prev_planner_hook) + result = prev_planner_hook(query, query_string, cursorOptions, + params); + else + result = standard_planner(query, query_string, cursorOptions, + params); + return result; + } + + start_span_ns = get_current_ns(); + + /* + * We may have skipped parsing if statement was prepared, create a new top + * span in this case. + */ + initialize_trace_level_and_top_span(trace_context, query->commandType, query, + NULL, NULL, query_string, start_span_ns); + + /* Start planner span */ + span_planner = get_new_stored_span(); + begin_span(trace_context, span_planner.span, SPAN_PLANNER, + get_parent_id(trace_context, exec_nested_level), + per_level_buffers[exec_nested_level].query_id, &start_span_ns); + + plan_nested_level++; + PG_TRY(); + { + if (prev_planner_hook) + result = prev_planner_hook(query, query_string, cursorOptions, + params); + else + result = standard_planner(query, query_string, cursorOptions, + params); + } + PG_CATCH(); + { + plan_nested_level--; + handle_pg_error(trace_context, span_planner.index, NULL, NULL); + PG_RE_THROW(); + } + PG_END_TRY(); + plan_nested_level--; + + /* End planner span */ + end_span_index(span_planner.index, NULL); + + /* If we have a prepared statement, add bound parameters to the top span */ + if (params != NULL && pg_tracing_export_parameters) + { + char *paramStr = BuildParamLogString(params, + NULL, pg_tracing_max_parameter_str); + pgTracingStoredSpan current_top = get_top_span_for_nested_level(trace_context, exec_nested_level); + + Assert(current_top.span->parameter_offset == -1); + if (paramStr != NULL) + current_top.span->parameter_offset = add_str_to_trace_buffer(paramStr, strlen(paramStr)); + } + return result; +} + +/* + * ExecutorStart hook: Activate query instrumentation if query is sampled + * Tracing can be started here if the query used a cached plan and passes the random sample_rate. + * If query is sampled, start the top span if it doesn't already exist. + */ +static void +pg_tracing_ExecutorStart(QueryDesc *queryDesc, int eflags) +{ + int instrument_options; + pgTracingTraceContext *trace_context = ¤t_trace_context; + int64 start_span_ns = -1; + bool is_lazy_function; + + if (exec_nested_level + plan_nested_level == 0) + + /* + * We're at the root level, copy eventual trace context from + * parsing/planning + */ + *trace_context = root_trace_context; + + /* Evaluate if query is sampled or not */ + extract_trace_context(trace_context, NULL, queryDesc->params); + + /* + * We can detect the presence of lazy function through the node tag and + * the executor flags. Lazy function will go through an ExecutorRun with + * every call, possibly generating thousands of spans which is not + * manageable and not very useful. If lazy functions are detected, we + * don't instrument this node and no spans will be generated. + */ + is_lazy_function = nodeTag(queryDesc->plannedstmt->planTree) == T_FunctionScan && eflags == EXEC_FLAG_SKIP_TRIGGERS; + if (pg_tracing_enabled(trace_context, exec_nested_level) && !is_lazy_function) + { + start_span_ns = get_current_ns(); + + if (trace_context->query_id != 0) + + /* + * TODO: Can this fail? Should we drop tracing if we detect + * mismatch? + */ + Assert(trace_context->query_id == queryDesc->plannedstmt->queryId); + + /* + * In case of a cached plan, we haven't gone through neither parsing + * nor planner hook. Parallel workers will also start tracing from + * there. Create the top case in this case. + */ + initialize_trace_level_and_top_span(trace_context, queryDesc->operation, + NULL, NULL, NULL, + queryDesc->sourceText, start_span_ns); + Assert(trace_context->root_span_index != -1); + + /* + * Activate query instrumentation. Timer and rows instrumentation are + * mandatory. + */ + instrument_options = INSTRUMENT_TIMER | INSTRUMENT_ROWS; + if (pg_tracing_instrument_buffers) + instrument_options |= INSTRUMENT_BUFFERS; + if (pg_tracing_instrument_wal) + instrument_options |= INSTRUMENT_WAL; + queryDesc->instrument_options = instrument_options; + } + + if (prev_ExecutorStart) + prev_ExecutorStart(queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); + + if (start_span_ns != -1) + { + /* Allocate totaltime instrumentation in the per-query context */ + if (queryDesc->totaltime == NULL) + { + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); + queryDesc->totaltime = InstrAlloc(1, instrument_options, false); + MemoryContextSwitchTo(oldcxt); + } + + /* Executor span */ + create_full_span(trace_context, SPAN_EXECUTOR_START, + get_parent_id(trace_context, exec_nested_level), + start_span_ns, NULL); + } +} + +/* + * ExecutorRun hook: track nesting depth and create executor run span. + * If the plan needs to create parallel workers, push the trace context in the parallel shared buffer. + */ +static void +pg_tracing_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, + uint64 count, bool execute_once) +{ + const pgTracingTraceContext *trace_context = ¤t_trace_context; + bool query_sampled = pg_tracing_enabled(trace_context, exec_nested_level); + int64 start_span_ns; + int64 end_span_ns; + pgTracingStoredSpan executor_run_span; + + executor_run_span.index = -1; + + if (query_sampled) + { + start_span_ns = get_current_ns(); + nested_query_start_ns = start_span_ns; + } + + if (query_sampled && queryDesc->instrument_options > 0) + { + Assert(trace_context->root_span_index != -1); + /* Start executor run span */ + executor_run_span = get_new_stored_span(); + begin_span(trace_context, executor_run_span.span, SPAN_EXECUTOR_RUN, + get_parent_id(trace_context, exec_nested_level), + per_level_buffers[exec_nested_level].query_id, + &start_span_ns); + + /* + * Executor run is used as the parent's span for nodes, store its' id + * in the per level buffer. + */ + per_level_buffers[exec_nested_level].executor_run_id = executor_run_span.span->span_id; + + /* + * If this query starts parallel worker, push the trace context for + * the child processes + */ + if (queryDesc->plannedstmt->parallelModeNeeded && pg_tracing_trace_parallel_workers) + add_parallel_context(trace_context, executor_run_span.span->span_id, + per_level_buffers[exec_nested_level].query_id); + } + + exec_nested_level++; + PG_TRY(); + { + if (prev_ExecutorRun) + prev_ExecutorRun(queryDesc, direction, count, execute_once); + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); + } + PG_CATCH(); + { + end_span_ns = get_current_ns(); + end_top_span(trace_context, &end_span_ns); + exec_nested_level--; + + /* + * When a query is executing the pg_tracing_{peek,consume}_spans + * function, sampling will be disabled while we have an ongoing trace + * and current_trace_spans will be NULL. Checking that span_index != + * -1 isn't enough in this case, we need to check that + * current_trace_spans still exists. + */ + if (current_trace_spans != NULL && executor_run_span.index != -1) + handle_pg_error(trace_context, executor_run_span.index, queryDesc, &end_span_ns); + PG_RE_THROW(); + } + PG_END_TRY(); + end_span_ns = get_current_ns(); + end_top_span(trace_context, &end_span_ns); + exec_nested_level--; + + /* + * Same as above, tracing could have been aborted, check for + * current_trace_spans + */ + if (current_trace_spans != NULL && executor_run_span.index != -1) + end_span_index(executor_run_span.index, &end_span_ns); +} + +/* + * ExecutorFinish hook: create executor finish span and track nesting depth + * ExecutorFinish can start nested queries through triggers so we need + * to set the ExecutorFinish span as the top span + */ +static void +pg_tracing_ExecutorFinish(QueryDesc *queryDesc) +{ + int previous_top_span_index = -1; + pgTracingStoredSpan executor_finish_span; + pgTracingTraceContext *trace_context = ¤t_trace_context; + + executor_finish_span.index = -1; + + if (pg_tracing_enabled(trace_context, exec_nested_level) && queryDesc->instrument_options > 0) + { + int64 start_span_ns = get_current_ns(); + + Assert(trace_context->root_span_index != -1); + + executor_finish_span = get_new_stored_span(); + previous_top_span_index = get_top_span_for_nested_level(trace_context, exec_nested_level).index; + begin_span(trace_context, executor_finish_span.span, SPAN_EXECUTOR_FINISH, + get_parent_id(trace_context, exec_nested_level), + per_level_buffers[exec_nested_level].query_id, &start_span_ns); + + /* + * ExecutorFinish may create nested queries when executing triggers. + * We set ExecutorFinish span as the top queries for this. + */ + set_top_span_for_nested_level(trace_context, exec_nested_level, executor_finish_span.index); + /* Set a possible start for nested query */ + nested_query_start_ns = start_span_ns; + } + + exec_nested_level++; + PG_TRY(); + { + if (prev_ExecutorFinish) + prev_ExecutorFinish(queryDesc); + else + standard_ExecutorFinish(queryDesc); + } + PG_CATCH(); + { + exec_nested_level--; + if (executor_finish_span.index != -1) + handle_pg_error(trace_context, executor_finish_span.index, queryDesc, NULL); + PG_RE_THROW(); + } + PG_END_TRY(); + exec_nested_level--; + + if (executor_finish_span.index != -1) + { + /* Restore previous top span */ + set_top_span_for_nested_level(trace_context, exec_nested_level, previous_top_span_index); + end_span_index(executor_finish_span.index, NULL); + } +} + +/* + * ExecutorEnd hook will: + * - process queryDesc to generate span from planstate + * - end top span for the current nested level + * - end tracing if we're at the root nested level + */ +static void +pg_tracing_ExecutorEnd(QueryDesc *queryDesc) +{ + int64 start_span_ns = -1; + pgTracingTraceContext *trace_context = ¤t_trace_context; + + if (pg_tracing_enabled(trace_context, exec_nested_level) && queryDesc->instrument_options > 0) + { + start_span_ns = get_current_ns(); + Assert(trace_context->root_span_index != -1); + process_query_desc(trace_context, queryDesc, 0); + } + + exec_nested_level++; + if (prev_ExecutorEnd) + prev_ExecutorEnd(queryDesc); + else + standard_ExecutorEnd(queryDesc); + exec_nested_level--; + + if (start_span_ns != -1) + { + int64 end_span_ns = get_current_ns(); + bool overlapping_trace_context; + + /* + * We may go through multiple statement nested in a single + * ExecutorFinish (multiple triggers) so we need to update the + * nested_start. + */ + nested_query_start_ns = end_span_ns; + + /* Create executor_end span and end top span */ + end_top_span(trace_context, &end_span_ns); + create_full_span(trace_context, SPAN_EXECUTOR_END, + get_parent_id(trace_context, exec_nested_level), + start_span_ns, &end_span_ns); + + /* + * if root trace context has a different root span index, it means + * that we may have started to trace the next statement while still + * processing the current one. This may happen with explicit + * transaction using extended protocol, the parsing of the next + * statement happens before the ExecutorEnd of the current statement. + * In this case, we can't end tracing as we still have unfinished + * spans. + */ + overlapping_trace_context = current_trace_context.root_span_index != root_trace_context.root_span_index && root_trace_context.sampled; + if (!overlapping_trace_context) + end_tracing(); + } +} + +/* + * ProcessUtility hook + * + * Trace utility query if utility tracking is enabled and sampling was enabled + * during parse step. + * Process utility may create nested queries (for example function CALL) so we need + * to set the ProcessUtility span as the top span before going through the standard + * codepath. + */ +static void +pg_tracing_ProcessUtility(PlannedStmt *pstmt, const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, QueryEnvironment *queryEnv, + DestReceiver *dest, QueryCompletion *qc) +{ + pgTracingStoredSpan process_utility_span; + pgTracingStoredSpan previous_top_span; + int64 end_span_ns; + int64 start_span_ns; + pgTracingTraceContext *trace_context = ¤t_trace_context; + + if (exec_nested_level + plan_nested_level == 0) + /* We're at root level, use the root trace_context */ + *trace_context = root_trace_context; + + if (!pg_tracing_track_utility || !pg_tracing_enabled(trace_context, exec_nested_level)) + { + /* No sampling, just go through the standard process utility */ + if (prev_ProcessUtility) + prev_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + else + standard_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + + /* + * Tracing may have been started within the utility call without going + * through ExecutorEnd (ex: prepare statement). Check and end tracing + * here in this case. + */ + if (!pg_tracing_mem_ctx->isReset) + { + Assert(current_trace_context.sampled || root_trace_context.sampled); + Assert(exec_nested_level == 0); + end_tracing(); + } + return; + } + + /* Statement is sampled */ + start_span_ns = get_current_ns(); + + /* + * Set a possible start for nested query. ProcessUtility may start a + * nested query and we can't use the statement start like we usually do. + * We artificially start nested 1us later to avoid overlapping spans. + */ + nested_query_start_ns = start_span_ns; + + initialize_trace_level_and_top_span(trace_context, pstmt->commandType, NULL, NULL, + pstmt, queryString, start_span_ns); + + /* Build the process utility span. */ + process_utility_span = get_new_stored_span(); + begin_span(trace_context, process_utility_span.span, SPAN_PROCESS_UTILITY, + get_parent_id(trace_context, exec_nested_level), + per_level_buffers[exec_nested_level].query_id, + &start_span_ns); + + /* + * Set ProcessUtility span as the new top span for the nested queries that + * may be created. We need to keep the old top span index to end it at the + * end of the process utility hook. + */ + previous_top_span = get_top_span_for_nested_level(trace_context, exec_nested_level); + set_top_span_for_nested_level(trace_context, exec_nested_level, process_utility_span.index); + + exec_nested_level++; + PG_TRY(); + { + if (prev_ProcessUtility) + prev_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + else + standard_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + } + PG_CATCH(); + { + end_span_ns = get_current_ns(); + exec_nested_level--; + end_top_span(trace_context, &end_span_ns); + handle_pg_error(trace_context, previous_top_span.index, NULL, &end_span_ns); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* ProcessUtility may have created a top span, end it here */ + end_span_ns = get_current_ns(); + end_top_span(trace_context, &end_span_ns); + exec_nested_level--; + + /* + * current_trace_spans may have been repalloced, refresh the + * process_utility_span span + */ + process_utility_span = get_span_from_index(process_utility_span.index); + if (qc != NULL) + process_utility_span.span->node_counters.rows = qc->nprocessed; + /* ProcessUtility is the current top span, no need to call end_top_span */ + end_span(process_utility_span.span, &end_span_ns); + + /* Also end the previous top span */ + previous_top_span = get_span_from_index(previous_top_span.index); + end_span(previous_top_span.span, &end_span_ns); + + Assert(previous_top_span.span->start <= process_utility_span.span->start); + Assert(get_span_end_ns(previous_top_span.span) >= get_span_end_ns(process_utility_span.span)); + + end_tracing(); + + /* + * We may iterate through a list of nested ProcessUtility calls, update + * the nested query start. + */ + nested_query_start_ns = end_span_ns; +} + +/* + * Iterate over a list of planstate to generate span node + */ +static void +generate_member_nodes(PlanState **planstates, int nplans, planstateTraceContext * planstateTraceContext, uint64 parent_id) +{ + int j; + + for (j = 0; j < nplans; j++) + generate_span_from_planstate(planstates[j], planstateTraceContext, parent_id); +} + +/* + * Iterate over custom scan planstates to generate span node + */ +static void +generate_span_from_custom_scan(CustomScanState *css, planstateTraceContext * planstateTraceContext, uint64 parent_id) +{ + ListCell *cell; + + foreach(cell, css->custom_ps) + generate_span_from_planstate((PlanState *) lfirst(cell), planstateTraceContext, parent_id); +} + +/* + * With nested queries, the post parse hook will be called before we know + * the top span parent. We will only know and the parent when we generate span nodes from planstate. + * + * When those nodes are created, we attach the top span. + */ +static void +attach_top_span_to_new_parent(const pgTracingTraceContext * trace_context, Span const *new_parent, int64 parent_end_ns_time) +{ + pgTracingStoredSpan child_top_span; + int64 child_end_ns_time; + + if (exec_nested_level + 1 > max_nested_level) + /* No child to attach */ + return; + + child_top_span = get_top_span_for_nested_level(trace_context, exec_nested_level + 1); + Assert(child_top_span.span->start_ns_time > 0); + /* If child start after parent's end, don't link it */ + if (parent_end_ns_time < child_top_span.span->start_ns_time) + return; + child_end_ns_time = child_top_span.span->start_ns_time + child_top_span.span->duration_ns; + /* If child ends before parent's start, don't link it */ + if (child_end_ns_time < new_parent->start_ns_time) + return; + + /* Link this child to the new parent */ + child_top_span.span->parent_id = new_parent->span_id; +} + +/* + * Attach an already existing parse span to a new parent. + * + * Parse span are created in the post_parse hook but we don't know + * its parent nor its exact start, we only have a good idea of its end. + * + * When processing the queryDesc, we can find possible parent candidate and + * attach the parse span. We need to adjust the parse span start to match + * the new parent. + * + * The parent node may have another child node (ProjectSet will have a Result + * outerplan node. The parse span will need to start after this child node. + */ +static void +attach_parse_span_to_new_parent(const pgTracingTraceContext * trace_context, const Span * new_parent, + int64 new_start_ns, int64 parent_end_ns_time) +{ + int child_parse_index; + pgTracingStoredSpan child_parse_span; + int64 end_ns_time; + + Assert(new_start_ns < parent_end_ns_time); + if (exec_nested_level + 1 > max_nested_level) + /* No child to attach */ + return; + + /* Get child parse span */ + child_parse_index = per_level_buffers[exec_nested_level + 1].parse_span_index; + if (child_parse_index == -1) + return; + child_parse_span = get_span_from_index(child_parse_index); + + /* If child start after parent's end, don't link it */ + if (child_parse_span.span->start_ns_time > parent_end_ns_time) + return; + + end_ns_time = child_parse_span.span->start_ns_time + child_parse_span.span->duration_ns; + /* If child ends before parent's start, don't link it */ + if (end_ns_time < new_parent->start_ns_time) + return; + + /* Check if child ends after parent's end */ + if (end_ns_time > parent_end_ns_time) + return; + + /* Check if child ends before new start */ + if (end_ns_time < new_start_ns) + return; + + /* Adjust span start */ + set_span_start(trace_context, child_parse_span.span, new_start_ns); + + /* Adjust span duration */ + child_parse_span.span->duration_ns = end_ns_time - child_parse_span.span->start_ns_time; + Assert(child_parse_span.span->duration_ns > 0); + + /* Child span should already have ended */ + Assert(child_parse_span.span->ended); + /* Link this child to the new parent */ + child_parse_span.span->parent_id = new_parent->span_id; +} + + +/* + * Create span node for the provided planstate + */ +static Span * +create_span_node(PlanState *planstate, planstateTraceContext * planstateTraceContext, + uint64 parent_id, SpanType span_type, char *subplan_name) +{ + pgTracingStoredSpan stored_span; + Span *span; + Plan const *plan = planstate->plan; + int64 span_end_ns; + int64 start_span_ns; + + /* + * Make sure stats accumulation is done. Function is a no-op if if was + * already done. + */ + InstrEndLoop(planstate->instrument); + + /* We only create span node on node that were executed */ + Assert(!INSTR_TIME_IS_ZERO(planstate->instrument->firsttime)); + + start_span_ns = INSTR_TIME_GET_NANOSEC(planstate->instrument->firsttime); + stored_span = get_new_stored_span(); + span = stored_span.span; + + begin_span(planstateTraceContext->trace_context, span, span_type, + parent_id, per_level_buffers[exec_nested_level].query_id, + &start_span_ns); + + /* first tuple time */ + span->startup = planstate->instrument->startup * NS_PER_S; + + if (span_type == SPAN_NODE) + { + /* Generate node specific variable strings and store them */ + const char *node_type; + const char *deparse_info; + char const *operation_name; + int deparse_info_len; + + node_type = plan_to_node_type(plan); + span->node_type_offset = add_str_to_trace_buffer(node_type, strlen(node_type)); + + operation_name = plan_to_operation(planstateTraceContext, planstate, node_type); + span->operation_name_offset = add_str_to_trace_buffer(operation_name, strlen(operation_name)); + + /* deparse_ctx us NULL if deparsing was disabled */ + if (planstateTraceContext->deparse_ctx != NULL) + { + deparse_info = plan_to_deparse_info(planstateTraceContext, planstate); + deparse_info_len = strlen(deparse_info); + if (deparse_info_len > 0) + span->deparse_info_offset = add_str_to_trace_buffer(deparse_info, deparse_info_len); + } + } + else if (subplan_name != NULL) + span->operation_name_offset = add_str_to_trace_buffer(subplan_name, strlen(subplan_name)); + + span->node_counters.rows = (int64) planstate->instrument->ntuples / planstate->instrument->nloops; + span->node_counters.nloops = (int64) planstate->instrument->nloops; + span->node_counters.buffer_usage = planstate->instrument->bufusage; + span->node_counters.wal_usage = planstate->instrument->walusage; + + span->plan_counters.startup_cost = plan->startup_cost; + span->plan_counters.total_cost = plan->total_cost; + span->plan_counters.plan_rows = plan->plan_rows; + span->plan_counters.plan_width = plan->plan_width; + + if (!planstate->state->es_finished) + { + /* + * If the query is in an unfinished state, it means that we're in an + * error handler. Stop the node instrumentation to get the latest + * known state. + */ + if (!INSTR_TIME_IS_ZERO(planstate->instrument->starttime)) + /* Don't stop the node if it wasn't started */ + InstrStopNode(planstate->instrument, planstate->state->es_processed); + InstrEndLoop(planstate->instrument); + span->sql_error_code = planstateTraceContext->sql_error_code; + } + span_end_ns = INSTR_TIME_GET_NANOSEC(planstate->instrument->firsttime); + Assert(planstate->instrument->total > 0); + span_end_ns += planstate->instrument->total * NS_PER_S; + end_span(span, &span_end_ns); + + return span; +} + +/* + * Walk through the planstate tree generating a node span for each node. + * We pass possible error code to tag unfinished node with it + */ +static Span * +generate_span_from_planstate(PlanState *planstate, planstateTraceContext * planstateTraceContext, uint64 parent_id) +{ + ListCell *l; + Plan const *plan = planstate->plan; + Span *outer_span = NULL; + Span *span; + + /* The node was never executed, skip it */ + if (planstate->instrument == NULL || INSTR_TIME_IS_ZERO(planstate->instrument->firsttime)) + return NULL; + + span = create_span_node(planstate, planstateTraceContext, parent_id, SPAN_NODE, NULL); + planstateTraceContext->ancestors = lcons(planstate->plan, planstateTraceContext->ancestors); + + /* Walk the outerplan */ + if (outerPlanState(planstate)) + outer_span = generate_span_from_planstate(outerPlanState(planstate), planstateTraceContext, span->span_id); + /* Walk the innerplan */ + if (innerPlanState(planstate)) + generate_span_from_planstate(innerPlanState(planstate), planstateTraceContext, span->span_id); + + /* Handle init plans */ + foreach(l, planstate->initPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(l); + PlanState *splan = sstate->planstate; + Span *init_span; + + if (INSTR_TIME_IS_ZERO(splan->instrument->firsttime)) + continue; + init_span = create_span_node(splan, planstateTraceContext, parent_id, SPAN_NODE_INIT_PLAN, sstate->subplan->plan_name); + generate_span_from_planstate(splan, planstateTraceContext, init_span->span_id); + } + + /* Handle sub plans */ + foreach(l, planstate->subPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(l); + PlanState *splan = sstate->planstate; + Span *subplan_span; + + if (INSTR_TIME_IS_ZERO(splan->instrument->firsttime)) + continue; + subplan_span = create_span_node(splan, planstateTraceContext, parent_id, SPAN_NODE_SUBPLAN, sstate->subplan->plan_name); + generate_span_from_planstate(splan, planstateTraceContext, subplan_span->span_id); + } + + /* Handle special nodes with children nodes */ + switch (nodeTag(plan)) + { + case T_Append: + generate_member_nodes(((AppendState *) planstate)->appendplans, + ((AppendState *) planstate)->as_nplans, planstateTraceContext, span->span_id); + break; + case T_MergeAppend: + generate_member_nodes(((MergeAppendState *) planstate)->mergeplans, + ((MergeAppendState *) planstate)->ms_nplans, planstateTraceContext, span->span_id); + break; + case T_BitmapAnd: + generate_member_nodes(((BitmapAndState *) planstate)->bitmapplans, + ((BitmapAndState *) planstate)->nplans, planstateTraceContext, span->span_id); + break; + case T_BitmapOr: + generate_member_nodes(((BitmapOrState *) planstate)->bitmapplans, + ((BitmapOrState *) planstate)->nplans, planstateTraceContext, span->span_id); + break; + case T_SubqueryScan: + generate_span_from_planstate(((SubqueryScanState *) planstate)->subplan, planstateTraceContext, span->span_id); + break; + case T_CustomScan: + generate_span_from_custom_scan((CustomScanState *) planstate, planstateTraceContext, span->span_id); + break; + default: + break; + } + planstateTraceContext->ancestors = list_delete_first(planstateTraceContext->ancestors); + + /* + * If we have a Result or ProjectSet node, make it the span parent of the + * next query span if we have any + * + * TODO: Do we have other node that could be the base for nested queries? + */ + if (nodeTag(plan) == T_Result || nodeTag(plan) == T_ProjectSet) + { + int64 span_end_ns = span->start_ns_time + span->duration_ns; + + attach_top_span_to_new_parent(planstateTraceContext->trace_context, span, span_end_ns); + if (nodeTag(plan) == T_ProjectSet) + { + /* + * A parse may happen within a projectet during function exec. Try + * to attach existing child parse to the project set node. + */ + int64 outer_span_end_ns; + + /* We should have an outer span with a ProjectSet */ + Assert(outer_span != NULL); + outer_span_end_ns = outer_span->start_ns_time + outer_span->duration_ns; + attach_parse_span_to_new_parent(planstateTraceContext->trace_context, span, outer_span_end_ns, span_end_ns); + } + else + attach_parse_span_to_new_parent(planstateTraceContext->trace_context, span, span->start_ns_time, span_end_ns); + } + + return span; +} + +/* + * Add plan counters to the Datum output + */ +static int +add_plan_counters(const PlanCounters * plan_counters, int i, Datum *values) +{ + values[i++] = Float8GetDatumFast(plan_counters->startup_cost); + values[i++] = Float8GetDatumFast(plan_counters->total_cost); + values[i++] = Float8GetDatumFast(plan_counters->plan_rows); + values[i++] = Int32GetDatum(plan_counters->plan_width); + return i; +} + +/* + * Add node counters to the Datum output + */ +static int +add_node_counters(const NodeCounters * node_counters, int i, Datum *values) +{ + Datum wal_bytes; + char buf[256]; + double blk_read_time, + blk_write_time, + temp_blk_read_time, + temp_blk_write_time; + double generation_counter, + inlining_counter, + optimization_counter, + emission_counter; + int64 jit_created_functions; + + values[i++] = Int64GetDatumFast(node_counters->rows); + values[i++] = Int64GetDatumFast(node_counters->nloops); + + /* Buffer usage */ + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_hit); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_read); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_dirtied); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_written); + + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_hit); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_read); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_dirtied); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_written); + + blk_read_time = INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.shared_blk_read_time); + blk_write_time = INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.shared_blk_write_time); + temp_blk_read_time = INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.temp_blk_read_time); + temp_blk_write_time = INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.temp_blk_write_time); + + values[i++] = Float8GetDatumFast(blk_read_time); + values[i++] = Float8GetDatumFast(blk_write_time); + values[i++] = Float8GetDatumFast(temp_blk_read_time); + values[i++] = Float8GetDatumFast(temp_blk_write_time); + + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.temp_blks_read); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.temp_blks_written); + + /* WAL usage */ + values[i++] = Int64GetDatumFast(node_counters->wal_usage.wal_records); + values[i++] = Int64GetDatumFast(node_counters->wal_usage.wal_fpi); + snprintf(buf, sizeof buf, UINT64_FORMAT, node_counters->wal_usage.wal_bytes); + + /* Convert to numeric. */ + wal_bytes = DirectFunctionCall3(numeric_in, + CStringGetDatum(buf), + ObjectIdGetDatum(0), + Int32GetDatum(-1)); + values[i++] = wal_bytes; + + /* JIT usage */ + generation_counter = INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.generation_counter); + inlining_counter = INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.inlining_counter); + optimization_counter = INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.optimization_counter); + emission_counter = INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.emission_counter); + jit_created_functions = node_counters->jit_usage.created_functions; + + values[i++] = Int64GetDatumFast(jit_created_functions); + values[i++] = Float8GetDatumFast(generation_counter); + values[i++] = Float8GetDatumFast(inlining_counter); + values[i++] = Float8GetDatumFast(optimization_counter); + values[i++] = Float8GetDatumFast(emission_counter); + + return i; +} + +/* + * Build the tuple for a Span and add it to the output + */ +static void +add_result_span(ReturnSetInfo *rsinfo, Span * span, + const char *qbuffer, Size qbuffer_size) +{ +#define PG_TRACING_TRACES_COLS 45 + Datum values[PG_TRACING_TRACES_COLS] = {0}; + bool nulls[PG_TRACING_TRACES_COLS] = {0}; + const char *span_type; + const char *operation_name; + const char *sql_error_code; + int i = 0; + + span_type = get_span_name(span, qbuffer, qbuffer_size); + operation_name = get_operation_name(span, qbuffer, qbuffer_size); + sql_error_code = unpack_sql_state(span->sql_error_code); + + Assert(span_type != NULL); + Assert(operation_name != NULL); + Assert(sql_error_code != NULL); + + values[i++] = UInt64GetDatum(span->trace_id); + values[i++] = UInt64GetDatum(span->parent_id); + values[i++] = UInt64GetDatum(span->span_id); + values[i++] = UInt64GetDatum(span->query_id); + values[i++] = CStringGetTextDatum(span_type); + values[i++] = CStringGetTextDatum(operation_name); + values[i++] = Int64GetDatumFast(span->start); + values[i++] = Int16GetDatum(span->start_ns); + + values[i++] = Int64GetDatumFast(span->duration_ns); + values[i++] = CStringGetTextDatum(sql_error_code); + values[i++] = UInt32GetDatum(span->be_pid); + values[i++] = ObjectIdGetDatum(span->user_id); + values[i++] = ObjectIdGetDatum(span->database_id); + values[i++] = UInt8GetDatum(span->subxact_count); + + if ((span->type >= SPAN_NODE && span->type <= SPAN_NODE_UNKNOWN) + || span->type == SPAN_PLANNER) + { + i = add_plan_counters(&span->plan_counters, i, values); + i = add_node_counters(&span->node_counters, i, values); + + values[i++] = Int64GetDatumFast(span->startup); + if (span->parameter_offset != -1 && qbuffer_size > 0 && qbuffer_size > span->parameter_offset) + values[i++] = CStringGetTextDatum(qbuffer + span->parameter_offset); + else + nulls[i++] = 1; + if (span->deparse_info_offset != -1 && qbuffer_size > 0 && qbuffer_size > span->deparse_info_offset) + values[i++] = CStringGetTextDatum(qbuffer + span->deparse_info_offset); + } + + for (int j = i; j < PG_TRACING_TRACES_COLS; j++) + nulls[j] = 1; + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); +} + +/* + * Return spans as a result set. + * + * Accept a consume parameter. When consume is set, + * we empty the shared buffer and truncate query text. + */ +Datum +pg_tracing_spans(PG_FUNCTION_ARGS) +{ + bool consume; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Span *span; + const char *qbuffer; + Size qbuffer_size = 0; + + consume = PG_GETARG_BOOL(0); + if (!pg_tracing) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("pg_tracing must be loaded via shared_preload_libraries"))); + InitMaterializedSRF(fcinfo, 0); + + /* + * If this query was sampled and we're consuming tracing_spans buffer, the + * spans will target a query string that doesn't exist anymore in the + * query file. Better abort the sampling, clean ongoing traces and remove + * any possible parallel context pushed. Since this will be called within + * an ExecutorRun, we will need to check for current_trace_spans at the + * end of the ExecutorRun hook. + */ + cleanup_tracing(); + + qbuffer = qtext_load_file(&qbuffer_size); + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + for (int i = 0; i < shared_spans->end; i++) + { + span = shared_spans->spans + i; + add_result_span(rsinfo, span, qbuffer, qbuffer_size); + } + + /* Consume is set, remove spans from the shared buffer */ + if (consume) + { + shared_spans->end = 0; + /* We only truncate query file if there's no active writers */ + if (s->n_writers == 0) + { + s->extent = 0; + pg_truncate(PG_TRACING_TEXT_FILE, 0); + } + else + s->stats.failed_truncates++; + } + s->stats.last_consume = GetCurrentTimestamp(); + SpinLockRelease(&s->mutex); + } + return (Datum) 0; +} + +/* + * Return statistics of pg_tracing. + */ +Datum +pg_tracing_info(PG_FUNCTION_ARGS) +{ +#define PG_TRACING_INFO_COLS 6 + pgTracingStats stats; + TupleDesc tupdesc; + Datum values[PG_TRACING_INFO_COLS] = {0}; + bool nulls[PG_TRACING_INFO_COLS] = {0}; + int i = 0; + + if (!pg_tracing) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("pg_tracing must be loaded via shared_preload_libraries"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + /* Get a copy of the pg_tracing stats */ + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + stats = s->stats; + SpinLockRelease(&s->mutex); + } + + values[i++] = Int64GetDatum(stats.traces); + values[i++] = Int64GetDatum(stats.spans); + values[i++] = Int64GetDatum(stats.dropped_spans); + values[i++] = Int64GetDatum(stats.failed_truncates); + values[i++] = TimestampTzGetDatum(stats.last_consume); + values[i++] = TimestampTzGetDatum(stats.stats_reset); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + +/* + * Get an empty pgTracingStats + */ +static pgTracingStats +get_empty_pg_tracing_stats(void) +{ + pgTracingStats stats; + + stats.traces = 0; + stats.spans = 0; + stats.dropped_spans = 0; + stats.failed_truncates = 0; + stats.last_consume = 0; + stats.stats_reset = GetCurrentTimestamp(); + return stats; +} + +/* + * Reset pg_tracing statistics. + */ +Datum +pg_tracing_reset(PG_FUNCTION_ARGS) +{ + /* + * Reset statistics for pg_tracing since all entries are removed. + */ + pgTracingStats empty_stats = get_empty_pg_tracing_stats(); + + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->stats = empty_stats; + SpinLockRelease(&s->mutex); + } + PG_RETURN_VOID(); +} diff --git a/contrib/pg_tracing/pg_tracing.conf b/contrib/pg_tracing/pg_tracing.conf new file mode 100644 index 0000000000..ad6b5e7c1d --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.conf @@ -0,0 +1 @@ +shared_preload_libraries = 'pg_tracing' diff --git a/contrib/pg_tracing/pg_tracing.control b/contrib/pg_tracing/pg_tracing.control new file mode 100644 index 0000000000..efa78a6610 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.control @@ -0,0 +1,5 @@ +# pg_tracing extension +comment = 'Distributed tracing for postgres' +default_version = '1.0' +module_pathname = '$libdir/pg_tracing' +relocatable = true diff --git a/contrib/pg_tracing/pg_tracing.h b/contrib/pg_tracing/pg_tracing.h new file mode 100644 index 0000000000..90c2ff6106 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.h @@ -0,0 +1,43 @@ +/*------------------------------------------------------------------------- + * pg_tracing.h + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing.h + *------------------------------------------------------------------------- + */ +#ifndef _PG_TRACING_H_ +#define _PG_TRACING_H_ + +#include "postgres.h" +#include "pg_tracing_span.h" + +#include "storage/s_lock.h" + +/* + * Global statistics for pg_tracing + */ +typedef struct pgTracingStats +{ + int64 traces; /* number of traces processed */ + int64 spans; /* number of spans processed */ + int64 dropped_spans; /* number of dropped spans due to full buffer */ + int64 failed_truncates; /* number of failed query file truncates + * due to active writer */ + TimestampTz last_consume; /* Last time the shared spans buffer was + * consumed */ + TimestampTz stats_reset; /* Last time stats were reset */ +} pgTracingStats; + +/* + * Global shared state + */ +typedef struct pgTracingSharedState +{ + slock_t mutex; /* protects shared stats fields and + * shared_spans buffer */ + Size extent; /* current extent of query file */ + int n_writers; /* number of active writers to query file */ + pgTracingStats stats; /* global statistics for pg_tracing */ +} pgTracingSharedState; + +#endif diff --git a/contrib/pg_tracing/pg_tracing_explain.c b/contrib/pg_tracing/pg_tracing_explain.c new file mode 100644 index 0000000000..3c7d3a428f --- /dev/null +++ b/contrib/pg_tracing/pg_tracing_explain.c @@ -0,0 +1,552 @@ +/*------------------------------------------------------------------------- + * + * pg_tracing_explain.c + * + * + * Copyright (c) 2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing_explain.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pg_tracing_explain.h" + +#include "nodes/makefuncs.h" +#include "utils/builtins.h" +#include "parser/parsetree.h" +#include "nodes/extensible.h" +#include "utils/lsyscache.h" +#include "utils/ruleutils.h" + + +/* + * Add the target relation of a scan or modity node to the stringinfo + */ +static void +ExplainTargetRel(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, Index rti, StringInfo str) +{ + char *objectname = NULL; + char *namespace = NULL; + RangeTblEntry *rte; + char *refname; + + List *rtable; + + rtable = planstate->state->es_range_table; + rte = rt_fetch(rti, rtable); + + refname = (char *) list_nth(planstateTraceContext->rtable_names, rti - 1); + if (refname == NULL) + refname = rte->eref->aliasname; + + switch (nodeTag(planstate->plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_IndexScan: + case T_IndexOnlyScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_ForeignScan: + case T_CustomScan: + case T_ModifyTable: + /* Assert it's on a real relation */ + Assert(rte->rtekind == RTE_RELATION); + objectname = get_rel_name(rte->relid); + break; + case T_FunctionScan: + { + FunctionScan *fscan = (FunctionScan *) planstate->plan; + + /* Assert it's on a RangeFunction */ + Assert(rte->rtekind == RTE_FUNCTION); + + /* + * If the expression is still a function call of a single + * function, we can get the real name of the function. + * Otherwise, punt. (Even if it was a single function call + * originally, the optimizer could have simplified it away.) + */ + if (list_length(fscan->functions) == 1) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) linitial(fscan->functions); + + if (IsA(rtfunc->funcexpr, FuncExpr)) + { + FuncExpr *funcexpr = (FuncExpr *) rtfunc->funcexpr; + Oid funcid = funcexpr->funcid; + + objectname = get_func_name(funcid); + } + } + } + break; + case T_TableFuncScan: + Assert(rte->rtekind == RTE_TABLEFUNC); + objectname = "xmltable"; + break; + case T_ValuesScan: + Assert(rte->rtekind == RTE_VALUES); + break; + case T_CteScan: + /* Assert it's on a non-self-reference CTE */ + Assert(rte->rtekind == RTE_CTE); + Assert(!rte->self_reference); + objectname = rte->ctename; + break; + case T_NamedTuplestoreScan: + Assert(rte->rtekind == RTE_NAMEDTUPLESTORE); + objectname = rte->enrname; + break; + case T_WorkTableScan: + /* Assert it's on a self-reference CTE */ + Assert(rte->rtekind == RTE_CTE); + Assert(rte->self_reference); + objectname = rte->ctename; + break; + default: + break; + } + + appendStringInfoString(str, " on"); + if (namespace != NULL) + appendStringInfo(str, " %s.%s", quote_identifier(namespace), + quote_identifier(objectname)); + else if (objectname != NULL) + appendStringInfo(str, " %s", quote_identifier(objectname)); + if (objectname == NULL || strcmp(refname, objectname) != 0) + appendStringInfo(str, " %s", quote_identifier(refname)); +} + +/* + * Add the target of a Scan node to the stringinfo + */ +static void +ExplainScanTarget(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const Scan *plan, StringInfo str) +{ + ExplainTargetRel(planstateTraceContext, planstate, plan->scanrelid, str); +} + +/* + * Add the target of a ModifyTable node to the stringinfo + */ +static void +ExplainModifyTarget(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const ModifyTable *plan, StringInfo str) +{ + ExplainTargetRel(planstateTraceContext, planstate, plan->nominalRelation, str); +} + +/* + * Add some additioanl details about an IndexScan or IndexOnlyScan to the stringinfo + */ +static void +ExplainIndexScanDetails(Oid indexId, ScanDirection indexorderdir, + StringInfo str) +{ + const char *indexname = get_rel_name(indexId); + + if (ScanDirectionIsBackward(indexorderdir)) + appendStringInfoString(str, " Backward"); + appendStringInfo(str, " using %s", quote_identifier(indexname)); +} + +static void +plan_to_rel_name(const PlanState *planstate, const planstateTraceContext * planstateTraceContext, StringInfo str) +{ + const Plan *plan = planstate->plan; + + switch (nodeTag(plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_SubqueryScan: + case T_FunctionScan: + case T_TableFuncScan: + case T_ValuesScan: + case T_CteScan: + case T_WorkTableScan: + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) plan, str); + break; + case T_ForeignScan: + case T_CustomScan: + if (((Scan *) plan)->scanrelid > 0) + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) plan, str); + break; + case T_IndexScan: + { + IndexScan *indexscan = (IndexScan *) plan; + + ExplainIndexScanDetails(indexscan->indexid, + indexscan->indexorderdir, + str); + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) indexscan, str); + } + break; + case T_IndexOnlyScan: + { + IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; + + ExplainIndexScanDetails(indexonlyscan->indexid, + indexonlyscan->indexorderdir, + str); + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) indexonlyscan, str); + } + break; + case T_BitmapIndexScan: + { + BitmapIndexScan *bitmapindexscan = (BitmapIndexScan *) plan; + const char *indexname = get_rel_name(bitmapindexscan->indexid); + + appendStringInfo(str, " on %s", + quote_identifier(indexname)); + } + break; + case T_ModifyTable: + ExplainModifyTarget(planstateTraceContext, planstate, (ModifyTable *) plan, str); + break; + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + { + const char *jointype; + + switch (((Join *) plan)->jointype) + { + case JOIN_INNER: + jointype = "Inner"; + break; + case JOIN_LEFT: + jointype = "Left"; + break; + case JOIN_FULL: + jointype = "Full"; + break; + case JOIN_RIGHT: + jointype = "Right"; + break; + case JOIN_SEMI: + jointype = "Semi"; + break; + case JOIN_ANTI: + jointype = "Anti"; + break; + case JOIN_RIGHT_ANTI: + jointype = "Right Anti"; + break; + default: + jointype = "???"; + break; + } + if (((Join *) plan)->jointype != JOIN_INNER) + appendStringInfo(str, " %s Join", jointype); + else if (!IsA(plan, NestLoop)) + appendStringInfoString(str, " Join"); + } + break; + case T_SetOp: + { + const char *setopcmd; + + switch (((SetOp *) plan)->cmd) + { + case SETOPCMD_INTERSECT: + setopcmd = "Intersect"; + break; + case SETOPCMD_INTERSECT_ALL: + setopcmd = "Intersect All"; + break; + case SETOPCMD_EXCEPT: + setopcmd = "Except"; + break; + case SETOPCMD_EXCEPT_ALL: + setopcmd = "Except All"; + break; + default: + setopcmd = "???"; + break; + } + appendStringInfo(str, " %s", setopcmd); + } + break; + default: + break; + } +} + +static void +add_scan_qual(StringInfo str, const PlanState *planstate, List *qual, + const char *qlabel, List *ancestors, List *deparse_ctx, + bool useprefix) +{ + List *context; + char *exprstr; + Node *node; + + if (qual == NIL) + return; + + node = (Node *) make_ands_explicit(qual); + + if (deparse_ctx != NULL) + { + /* Set up deparsing context */ + context = set_deparse_context_plan(deparse_ctx, + planstate->plan, ancestors); + + /* Deparse the expression */ + exprstr = deparse_expression(node, context, useprefix, false); + if (str->len > 0) + appendStringInfoChar(str, '|'); + appendStringInfoString(str, qlabel); + appendStringInfoString(str, exprstr); + } +} + +const char * +plan_to_deparse_info(const planstateTraceContext * planstateTraceContext, const PlanState *planstate) +{ + StringInfo deparse_info = makeStringInfo(); + Plan const *plan = planstate->plan; + List *ancestors = planstateTraceContext->ancestors; + List *deparse_ctx = planstateTraceContext->deparse_ctx; + + switch (nodeTag(plan)) + { + case T_IndexScan: + add_scan_qual(deparse_info, planstate, ((IndexScan *) plan)->indexqualorig, "Index Cond: ", + ancestors, deparse_ctx, false); + break; + case T_IndexOnlyScan: + add_scan_qual(deparse_info, planstate, ((IndexOnlyScan *) plan)->indexqual, "Index Cond: ", + ancestors, deparse_ctx, false); + break; + case T_BitmapIndexScan: + add_scan_qual(deparse_info, planstate, ((BitmapIndexScan *) plan)->indexqualorig, "Index Cond: ", + ancestors, deparse_ctx, false); + break; + case T_BitmapHeapScan: + add_scan_qual(deparse_info, planstate, ((BitmapHeapScan *) plan)->bitmapqualorig, "Recheck Cond: ", + ancestors, deparse_ctx, false); + break; + case T_SeqScan: + case T_ValuesScan: + case T_CteScan: + case T_NamedTuplestoreScan: + case T_WorkTableScan: + case T_SubqueryScan: + case T_Gather: + case T_GatherMerge: + case T_FunctionScan: + case T_TableFuncScan: + case T_TidRangeScan: + case T_ForeignScan: + case T_CustomScan: + case T_Agg: + case T_WindowAgg: + case T_Group: + case T_Result: + add_scan_qual(deparse_info, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + case T_NestLoop: + add_scan_qual(deparse_info, planstate, ((NestLoop *) plan)->join.joinqual, "Join Filter : ", + ancestors, deparse_ctx, false); + add_scan_qual(deparse_info, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + case T_MergeJoin: + add_scan_qual(deparse_info, planstate, ((MergeJoin *) plan)->mergeclauses, "Merge Cond: ", + ancestors, deparse_ctx, false); + add_scan_qual(deparse_info, planstate, ((MergeJoin *) plan)->join.joinqual, "Join Filter: ", + ancestors, deparse_ctx, false); + add_scan_qual(deparse_info, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + case T_HashJoin: + add_scan_qual(deparse_info, planstate, ((HashJoin *) plan)->hashclauses, "Hash Cond: ", + ancestors, deparse_ctx, false); + add_scan_qual(deparse_info, planstate, ((MergeJoin *) plan)->join.joinqual, "Join Filter: ", + ancestors, deparse_ctx, false); + add_scan_qual(deparse_info, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + default: + break; + } + return deparse_info->data; +} + +char const * +plan_to_operation(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const char *span_type) +{ + StringInfo operation_name = makeStringInfo(); + Plan const *plan = planstate->plan; + + if (plan->parallel_aware) + appendStringInfoString(operation_name, "Parallel "); + if (plan->async_capable) + appendStringInfoString(operation_name, "Async "); + appendStringInfoString(operation_name, span_type); + + plan_to_rel_name(planstate, planstateTraceContext, operation_name); + + return operation_name->data; +} + +/* + * Get the node type name from a plan node + */ +const char * +plan_to_node_type(const Plan *plan) +{ + const char *custom_name; + + switch (nodeTag(plan)) + { + case T_Result: + return "Result"; + case T_ProjectSet: + return "ProjectSet"; + case T_ModifyTable: + switch (((ModifyTable *) plan)->operation) + { + case CMD_INSERT: + return "Insert"; + case CMD_UPDATE: + return "Update"; + case CMD_DELETE: + return "Delete"; + case CMD_MERGE: + return "Merge"; + default: + return "???"; + } + case T_Append: + return "Append"; + case T_MergeAppend: + return "MergeAppend"; + case T_RecursiveUnion: + return "RecursiveUnion"; + case T_BitmapAnd: + return "BitmapAnd"; + case T_BitmapOr: + return "BitmapOr"; + case T_NestLoop: + return "NestedLoop"; + case T_MergeJoin: + return "Merge"; /* "Join" gets added by jointype switch */ + case T_HashJoin: + return "Hash"; /* "Join" gets added by jointype switch */ + case T_SeqScan: + return "SeqScan"; + case T_SampleScan: + return "SampleScan"; + case T_Gather: + return "Gather"; + case T_GatherMerge: + return "GatherMerge"; + case T_IndexScan: + return "IndexScan"; + case T_IndexOnlyScan: + return "IndexOnlyScan"; + case T_BitmapIndexScan: + return "BitmapIndexScan"; + case T_BitmapHeapScan: + return "BitmapHeapScan"; + case T_TidScan: + return "TidScan"; + case T_TidRangeScan: + return "TidRangeScan"; + case T_SubqueryScan: + return "SubqueryScan"; + case T_FunctionScan: + return "FunctionScan"; + case T_TableFuncScan: + return "TableFunctionScan"; + case T_ValuesScan: + return "ValuesScan"; + case T_CteScan: + return "CTEScan"; + case T_NamedTuplestoreScan: + return "NamedTuplestoreScan"; + case T_WorkTableScan: + return "WorkTableScan"; + case T_ForeignScan: + switch (((ForeignScan *) plan)->operation) + { + case CMD_SELECT: + return "ForeignScan"; + case CMD_INSERT: + return "ForeignInsert"; + case CMD_UPDATE: + return "ForeignUpdate"; + case CMD_DELETE: + return "ForeignDelete"; + default: + return "???"; + } + case T_CustomScan: + custom_name = ((CustomScan *) plan)->methods->CustomName; + if (custom_name) + return psprintf("CustomScan (%s)", custom_name); + else + return "CustomScan"; + case T_Material: + return "Materialize"; + case T_Memoize: + return "Memoize"; + case T_Sort: + return "Sort"; + case T_IncrementalSort: + return "IncrementalSort"; + case T_Group: + return "Group"; + case T_Agg: + { + Agg *agg = (Agg *) plan; + + switch (agg->aggstrategy) + { + case AGG_PLAIN: + return "Aggregate"; + case AGG_SORTED: + return "GroupAggregate"; + case AGG_HASHED: + return "HashAggregate"; + case AGG_MIXED: + return "MixedAggregate"; + default: + return "Aggregate ???"; + } + } + case T_WindowAgg: + return "WindowAgg"; + case T_Unique: + return "Unique"; + case T_SetOp: + switch (((SetOp *) plan)->strategy) + { + case SETOP_SORTED: + return "SetOp"; + case SETOP_HASHED: + return "HashSetOp"; + default: + return "SetOp ???"; + } + case T_LockRows: + return "LockRows"; + case T_Limit: + return "Limit"; + case T_Hash: + return "Hash"; + default: + return "???"; + } +} diff --git a/contrib/pg_tracing/pg_tracing_explain.h b/contrib/pg_tracing/pg_tracing_explain.h new file mode 100644 index 0000000000..6c73c56c1b --- /dev/null +++ b/contrib/pg_tracing/pg_tracing_explain.h @@ -0,0 +1,21 @@ +#ifndef _PG_TRACING_EXPLAIN_H_ +#define _PG_TRACING_EXPLAIN_H_ + +#include "pg_tracing_span.h" +#include "executor/execdesc.h" + +/* Context needed when generating spans from planstate */ +typedef struct planstateTraceContext +{ + const pgTracingTraceContext *trace_context; + int sql_error_code; + List *ancestors; + List *deparse_ctx; + List *rtable_names; +} planstateTraceContext; + +extern const char *plan_to_node_type(const Plan *plan); +extern const char *plan_to_operation(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const char *spanName); +extern const char *plan_to_deparse_info(const planstateTraceContext * planstateTraceContext, const PlanState *planstate); + +#endif diff --git a/contrib/pg_tracing/pg_tracing_parallel.c b/contrib/pg_tracing/pg_tracing_parallel.c new file mode 100644 index 0000000000..48d09d0742 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing_parallel.c @@ -0,0 +1,103 @@ +/*------------------------------------------------------------------------- + * pg_tracing_parallel.c + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing_parallel.c + *------------------------------------------------------------------------- + */ +#include "pg_tracing_parallel.h" + +#include "storage/shmem.h" +#include "storage/spin.h" + +/* Shared buffer storing trace context for parallel workers. */ +static pgTracingParallelWorkers * pg_tracing_parallel = NULL; + +/* Index of the parallel worker context shared buffer if any */ +static int parallel_context_index = -1; + +void +pg_tracing_shmem_parallel_startup(void) +{ + bool found_parallel; + + /* We won't have more than max_parallel_workers workers */ + pg_tracing_parallel = ShmemInitStruct("PgTracing Parallel Workers Context", + sizeof(pgTracingParallelWorkers) + max_parallel_workers * sizeof(pgTracingParallelContext), + &found_parallel); + if (!found_parallel) + { + SpinLockInit(&pg_tracing_parallel->mutex); + for (int i = 0; i < max_parallel_workers; i++) + pg_tracing_parallel->trace_contexts[i].leader_backend_id = InvalidBackendId; + } +} + +/* + * Push trace context to the shared parallel worker buffer + */ +void +add_parallel_context(const struct pgTracingTraceContext *trace_context, + uint64 parent_id, uint64 query_id) +{ + volatile pgTracingParallelWorkers *p = (volatile pgTracingParallelWorkers *) pg_tracing_parallel; + + Assert(parallel_context_index == -1); + SpinLockAcquire(&p->mutex); + for (int i = 0; i < max_parallel_workers; i++) + { + volatile pgTracingParallelContext *ctx = p->trace_contexts + i; + + Assert(ctx->leader_backend_id != MyBackendId); + if (ctx->leader_backend_id != InvalidBackendId) + continue; + /* Slot is available */ + parallel_context_index = i; + ctx->leader_backend_id = MyBackendId; + ctx->trace_context = *trace_context; + /* We don't need to propagate root span index to parallel workers */ + ctx->trace_context.root_span_index = -1; + ctx->trace_context.parent_id = parent_id; + break; + } + SpinLockRelease(&p->mutex); +} + +/* + * Remove parallel context for the current leader from the shared memory. + */ +void +remove_parallel_context(void) +{ + if (parallel_context_index < 0) + /* No tracing of parallel workers */ + return; + { + volatile pgTracingParallelWorkers *p = (volatile pgTracingParallelWorkers *) pg_tracing_parallel; + + SpinLockAcquire(&p->mutex); + p->trace_contexts[parallel_context_index].leader_backend_id = InvalidBackendId; + SpinLockRelease(&p->mutex); + } + parallel_context_index = -1; +} + +/* + * If we're inside a parallel worker, check if the trace context is stored in shared memory. + * If a trace context exists, it means that the query is sampled and worker tracing is enabled. + */ +void +fetch_parallel_context(pgTracingTraceContext * trace_context) +{ + volatile pgTracingParallelWorkers *p = (volatile pgTracingParallelWorkers *) pg_tracing_parallel; + + SpinLockAcquire(&p->mutex); + for (int i = 0; i < max_parallel_workers; i++) + { + if (p->trace_contexts[i].leader_backend_id != ParallelLeaderBackendId) + continue; + /* Found a matching a trace context, fetch it */ + *trace_context = p->trace_contexts[i].trace_context; + } + SpinLockRelease(&p->mutex); +} diff --git a/contrib/pg_tracing/pg_tracing_parallel.h b/contrib/pg_tracing/pg_tracing_parallel.h new file mode 100644 index 0000000000..5e9436a10d --- /dev/null +++ b/contrib/pg_tracing/pg_tracing_parallel.h @@ -0,0 +1,44 @@ +/*------------------------------------------------------------------------- + * pg_tracing_parallel.h + * Store, retrieve and remove trace context for parallel workers. + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing_parallel.h + *------------------------------------------------------------------------- + */ +#ifndef _PG_TRACING_PARALLEL_H_ +#define _PG_TRACING_PARALLEL_H_ + +#include "postgres.h" +#include "pg_tracing_span.h" + +#include "storage/s_lock.h" +#include "storage/backendid.h" + +/* + * A trace context for a specific parallel context + */ +typedef struct pgTracingParallelContext +{ + BackendId leader_backend_id; /* Backend id of the leader, set to + * InvalidBackendId if unused */ + pgTracingTraceContext trace_context; +} pgTracingParallelContext; + +/* + * Store context for parallel workers + */ +typedef struct pgTracingParallelWorkers +{ + slock_t mutex; + pgTracingParallelContext trace_contexts[FLEXIBLE_ARRAY_MEMBER]; +} pgTracingParallelWorkers; + + +extern void pg_tracing_shmem_parallel_startup(void); +extern void add_parallel_context(const struct pgTracingTraceContext *trace_context, + uint64 parent_id, uint64 query_id); +extern void remove_parallel_context(void); +extern void fetch_parallel_context(pgTracingTraceContext * trace_context); + +#endif diff --git a/contrib/pg_tracing/pg_tracing_query_process.c b/contrib/pg_tracing/pg_tracing_query_process.c new file mode 100644 index 0000000000..7d76129b84 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing_query_process.c @@ -0,0 +1,478 @@ +#include "pg_tracing_query_process.h" +#include "pg_tracing.h" + +#include +#include +#include "parser/scanner.h" +#include "nodes/extensible.h" + +/* + * Extract traceparent parameters from SQLCommenter + * + * We're expecting the query to start with a SQLComment containing the + * traceparent parameter + * "/\*traceparent='00-00000000000000000000000000000009-0000000000000005-01'*\/ + * SELECT 1;" + * Traceparent has the following format: version-traceid-parentid-sampled + * We also accept SQLCommenter as a parameter. In this case, we don't have + * the start and end of comments. + * + * if is_parameter is true, the SQLComment is passed as a query parameter and + * won't have the comment start and end. + */ +void +extract_trace_context_from_query(pgTracingTraceContext * trace_context, const char *sqlcomment_str, bool is_parameter) +{ + const char *expected_start = "/*"; + const char *traceparent; + const char *end_sqlcomment; + char *endptr; + + if (!is_parameter) + { + /* + * Look for the start of a comment We're expecting the comment to be + * at the very start of the query + */ + for (size_t i = 0; i < strlen(expected_start); i++) + { + if (sqlcomment_str[i] != expected_start[i]) + return; + } + + /* + * Check that we have an end + */ + end_sqlcomment = strstr(sqlcomment_str, "*/"); + if (end_sqlcomment == NULL) + return; + } + else + end_sqlcomment = sqlcomment_str + strlen(sqlcomment_str); + + /* + * Locate traceparent parameter and make sure it has the expected size + * "traceparent" + "=" + '' -> 13 characters + * 00-00000000000000000000000000000009-0000000000000005-01 -> 55 + * characters + */ + traceparent = strstr(sqlcomment_str, "traceparent='"); + if (traceparent == NULL || traceparent > end_sqlcomment || end_sqlcomment - traceparent < 55 + 13) + return; + + /* + * Move to the start of the traceparent values + */ + traceparent = traceparent + 13; + + /* + * Check that '-' are at the expected places + */ + if (traceparent[2] != '-' || traceparent[35] != '-' || traceparent[52] != '-') + return; + + /* + * Parse traceparent parameters + */ + errno = 0; + trace_context->trace_id = strtol(&traceparent[3], &endptr, 16); + if (endptr != traceparent + 35 || errno) + return; + trace_context->parent_id = strtol(&traceparent[36], &endptr, 16); + if (endptr != traceparent + 52 || errno) + return; + trace_context->sampled = strtol(&traceparent[53], &endptr, 16); + if (endptr != traceparent + 55 || errno) + + /* + * Just to be sure, reset sampled on error + */ + trace_context->sampled = 0; +} + + +/* + * comp_location: comparator for qsorting LocationLen structs by location + */ +static int +comp_location(const void *a, const void *b) +{ + int l = ((const LocationLen *) a)->location; + int r = ((const LocationLen *) b)->location; + + if (l < r) + return -1; + else if (l > r) + return +1; + else + return 0; +} + + +/* + * Normalise query and fill param_str. + * Normalised query will separate tokens with a single space and + * parameters are replaced by $1, $2... + * Parameters are put in the param_str wich will contain all parameters values + * using the format: "$1 = 0, $2 = 'v'" + */ +const char * +normalise_query_parameters(const JumbleState *jstate, const char *query, + int query_loc, int *query_len_p, char **param_str, int *param_len) +{ + char *norm_query; + int query_len = *query_len_p; + int norm_query_buflen, /* Space allowed for norm_query */ + n_quer_loc = 0; + LocationLen *locs; + core_yyscan_t yyscanner; + core_yy_extra_type yyextra; + core_YYSTYPE yylval; + YYLTYPE yylloc; + int current_loc = 0; + StringInfoData buf; + + initStringInfo(&buf); + + if (query_loc == -1) + { + /* If query location is unknown, distrust query_len as well */ + query_loc = 0; + query_len = strlen(query); + } + else + { + /* Length of 0 (or -1) means "rest of string" */ + if (query_len <= 0) + query_len = strlen(query); + else + Assert(query_len <= strlen(query)); + } + + norm_query_buflen = query_len + jstate->clocations_count * 10; + Assert(norm_query_buflen > 0); + locs = jstate->clocations; + + /* Allocate result buffer */ + norm_query = palloc(norm_query_buflen + 1); + + if (jstate->clocations_count > 1) + qsort(jstate->clocations, jstate->clocations_count, + sizeof(LocationLen), comp_location); + + /* initialize the flex scanner --- should match raw_parser() */ + yyscanner = scanner_init(query + query_loc, + &yyextra, + &ScanKeywords, + ScanKeywordTokens); + + for (;;) + { + int loc = locs[current_loc].location; + int tok; + + loc -= query_loc; + + tok = core_yylex(&yylval, &yylloc, yyscanner); + + /* + * We should not hit end-of-string, but if we do, behave sanely + */ + if (tok == 0) + break; /* out of inner for-loop */ + if (yylloc > query_len) + break; + + /* + * We should find the token position exactly, but if we somehow run + * past it, work with that. + */ + if (current_loc < jstate->clocations_count && yylloc >= loc) + { + appendStringInfo(&buf, + "%s$%d = ", + current_loc > 0 ? ", " : "", + current_loc + 1); + if (query[loc] == '-') + { + /* + * It's a negative value - this is the one and only case where + * we replace more than a single token. + * + * Do not compensate for the core system's special-case + * adjustment of location to that of the leading '-' operator + * in the event of a negative constant. It is also useful for + * our purposes to start from the minus symbol. In this way, + * queries like "select * from foo where bar = 1" and "select * + * from foo where bar = -2" will have identical normalized + * query strings. + */ + appendStringInfoChar(&buf, '-'); + tok = core_yylex(&yylval, &yylloc, yyscanner); + if (tok == 0) + break; /* out of inner for-loop */ + } + if (yylloc > 0 && yyextra.scanbuf[yylloc - 1] == ' ' && n_quer_loc > 0) + { + norm_query[n_quer_loc++] = ' '; + } + + /* + * Append the current parameter $x in the normalised query + */ + n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d", + current_loc + 1 + jstate->highest_extern_param_id); + + appendStringInfoString(&buf, yyextra.scanbuf + yylloc); + + current_loc++; + } + else + { + int to_copy; + + if (yylloc > 0 && yyextra.scanbuf[yylloc - 1] == ' ' && n_quer_loc > 0) + { + norm_query[n_quer_loc++] = ' '; + } + to_copy = strlen(yyextra.scanbuf + yylloc); + Assert(n_quer_loc + to_copy < norm_query_buflen + 1); + memcpy(norm_query + n_quer_loc, yyextra.scanbuf + yylloc, to_copy); + n_quer_loc += to_copy; + } + } + scanner_finish(yyscanner); + + *query_len_p = n_quer_loc; + norm_query[n_quer_loc] = '\0'; + *param_str = buf.data; + *param_len = buf.len; + return norm_query; +} + +/* + * Normalise query: tokens will be separated by a single space + */ +const char * +normalise_query(const char *query, int query_loc, int *query_len_p) +{ + char *norm_query; + int query_len = *query_len_p; + int norm_query_buflen = query_len; + int n_quer_loc = 0; + core_yyscan_t yyscanner; + core_yy_extra_type yyextra; + core_YYSTYPE yylval; + YYLTYPE yylloc; + + /* Allocate result buffer */ + norm_query = palloc(norm_query_buflen + 2); + + /* initialize the flex scanner --- should match raw_parser() */ + yyscanner = scanner_init(query + query_loc, &yyextra, &ScanKeywords, ScanKeywordTokens); + for (;;) + { + int tok; + int to_copy; + + tok = core_yylex(&yylval, &yylloc, yyscanner); + + if (tok == 0) + break; /* out of inner for-loop */ + if (yylloc > query_len) + break; + + if (yylloc > 0 && yyextra.scanbuf[yylloc - 1] == ' ' && n_quer_loc > 0) + { + norm_query[n_quer_loc++] = ' '; + } + to_copy = strlen(yyextra.scanbuf + yylloc); + Assert(n_quer_loc + to_copy < norm_query_buflen + 2); + memcpy(norm_query + n_quer_loc, yyextra.scanbuf + yylloc, to_copy); + n_quer_loc += to_copy; + } + scanner_finish(yyscanner); + + *query_len_p = n_quer_loc; + norm_query[n_quer_loc] = '\0'; + return norm_query; +} + +/* + * Store text in the pg_tracing stat file + */ +bool +text_store_file(pgTracingSharedState * pg_tracing, const char *text, int text_len, + Size *query_offset) +{ + Size off; + int fd; + + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + off = s->extent; + s->extent += text_len + 1; + s->n_writers++; + SpinLockRelease(&s->mutex); + } + + /* + * Don't allow the file to grow larger than what qtext_load_file can + * (theoretically) handle. This has been seen to be reachable on 32-bit + * platforms. + */ + if (unlikely(text_len >= MaxAllocHugeSize - off)) + { + errno = EFBIG; /* not quite right, but it'll do */ + fd = -1; + goto error; + } + + /* Now write the data into the successfully-reserved part of the file */ + fd = OpenTransientFile(PG_TRACING_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) + goto error; + + if (pg_pwrite(fd, text, text_len, off) != text_len) + goto error; + if (pg_pwrite(fd, "\0", 1, off + text_len) != 1) + goto error; + + CloseTransientFile(fd); + + /* Mark our write complete */ + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->n_writers--; + SpinLockRelease(&s->mutex); + } + + /* + * Set offset once write was succesful + */ + *query_offset = off; + + return true; + +error: + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + + if (fd >= 0) + CloseTransientFile(fd); + + /* Mark our write complete */ + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->n_writers--; + SpinLockRelease(&s->mutex); + } + + return false; +} + +/* + * Read the external query text file into a malloc'd buffer. + * + * Returns NULL (without throwing an error) if unable to read, eg file not + * there or insufficient memory. + * + * On success, the buffer size is also returned into *buffer_size. + * + * This can be called without any lock on pgss->lock, but in that case the + * caller is responsible for verifying that the result is sane. + */ +const char * +qtext_load_file(Size *buffer_size) +{ + char *buf; + int fd; + struct stat stat; + Size nread; + + fd = OpenTransientFile(PG_TRACING_TEXT_FILE, O_RDONLY | PG_BINARY); + if (fd < 0) + { + if (errno != ENOENT) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + return NULL; + } + + /* Get file length */ + if (fstat(fd, &stat)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + CloseTransientFile(fd); + return NULL; + } + + /* Allocate buffer; beware that off_t might be wider than size_t */ + if (stat.st_size <= MaxAllocHugeSize) + buf = (char *) malloc(stat.st_size); + else + buf = NULL; + if (buf == NULL) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Could not allocate enough memory to read file \"%s\".", + PG_TRACING_TEXT_FILE))); + CloseTransientFile(fd); + return NULL; + } + + /* + * OK, slurp in the file. Windows fails if we try to read more than + * INT_MAX bytes at once, and other platforms might not like that either, + * so read a very large file in 1GB segments. + */ + nread = 0; + while (nread < stat.st_size) + { + int toread = Min(1024 * 1024 * 1024, stat.st_size - nread); + + /* + * If we get a short read and errno doesn't get set, the reason is + * probably that garbage collection truncated the file since we did + * the fstat(), so we don't log a complaint --- but we don't return + * the data, either, since it's most likely corrupt due to concurrent + * writes from garbage collection. + */ + errno = 0; + if (read(fd, buf + nread, toread) != toread) + { + if (errno) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + free(buf); + CloseTransientFile(fd); + return NULL; + } + nread += toread; + } + + if (CloseTransientFile(fd) != 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", PG_TRACING_TEXT_FILE))); + + *buffer_size = nread; + return buf; +} diff --git a/contrib/pg_tracing/pg_tracing_query_process.h b/contrib/pg_tracing/pg_tracing_query_process.h new file mode 100644 index 0000000000..bf568f97e9 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing_query_process.h @@ -0,0 +1,30 @@ +#ifndef _QUERY_PROCESS_H_ +#define _QUERY_PROCESS_H_ + +#include "pg_tracing.h" +#include "nodes/queryjumble.h" +#include "parser/parse_node.h" + +/* + * Normalise query: - Comments are removed - Constants are replaced by $x - + * All tokens are separated by a single space + */ +extern const char *normalise_query_parameters(const JumbleState *jstate, const char *query, + int query_loc, int *query_len_p, char **param_str, + int *param_len); + +extern void extract_trace_context_from_query(pgTracingTraceContext * trace_context, const char *query_str, bool is_parameter); + +/* + * Normalise simple query + */ +extern const char *normalise_query(const char *query, int query_loc, int *query_len_p); +bool text_store_file(pgTracingSharedState * pg_tracing, const char *query, + int query_len, Size *query_offset); +extern const char *qtext_load_file(Size *buffer_size); +extern const char *qtext_load_file(Size *buffer_size); + +/* Location of external text file */ +#define PG_TRACING_TEXT_FILE PG_STAT_TMP_DIR "/pg_tracing.stat" + +#endif diff --git a/contrib/pg_tracing/pg_tracing_span.c b/contrib/pg_tracing/pg_tracing_span.c new file mode 100644 index 0000000000..ce2b914ff9 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing_span.c @@ -0,0 +1,307 @@ +#include "pg_tracing_span.h" + +#include "nodes/extensible.h" +#include "common/pg_prng.h" + + +/* + * Convert a node CmdType to the matching SpanType + */ +SpanType +command_type_to_span_type(CmdType cmd_type) +{ + switch (cmd_type) + { + case CMD_SELECT: + return SPAN_NODE_SELECT; + case CMD_INSERT: + return SPAN_NODE_INSERT; + case CMD_UPDATE: + return SPAN_NODE_UPDATE; + case CMD_DELETE: + return SPAN_NODE_DELETE; + case CMD_MERGE: + return SPAN_NODE_MERGE; + case CMD_UTILITY: + return SPAN_NODE_UTILITY; + case CMD_NOTHING: + return SPAN_NODE_NOTHING; + case CMD_UNKNOWN: + return SPAN_NODE_UNKNOWN; + } + return SPAN_NODE_UNKNOWN; +} + +/* + * Initialize span fields + */ +static void +initialize_span_fields(Span * span, SpanType type, const pgTracingTraceContext * trace, uint64 parent_id, uint64 query_id) +{ + span->trace_id = trace->trace_id; + span->type = type; + + /* + * If parent id is unset, it means that there's no propagated trace + * informations from the caller. In this case, this is the top span, + * span_id == parent_id and we reuse the generated trace id for the span + * id. + */ + if (parent_id == 0) + { + span->parent_id = trace->trace_id; + } + else + { + span->parent_id = parent_id; + } + span->span_id = pg_prng_uint64(&pg_global_prng_state); + + span->node_type_offset = -1; + span->operation_name_offset = -1; + span->deparse_info_offset = -1; + span->parameter_offset = -1; + span->sql_error_code = 0; + span->startup = 0; + span->be_pid = MyProcPid; + span->database_id = MyDatabaseId; + span->user_id = GetUserId(); + span->subxact_count = MyProc->subxidStatus.count; + span->start_ns_time = 0; + span->duration_ns = 0; + span->ended = false; + span->query_id = query_id; + memset(&span->node_counters, 0, sizeof(NodeCounters)); + memset(&span->plan_counters, 0, sizeof(PlanCounters)); + + /* + * Store the starting buffer for planner and process utility spans + * + * TODO: Do we need wal buffer for planner? + */ + if (type == SPAN_PLANNER || span->type == SPAN_PROCESS_UTILITY) + { + /* TODO Do we need to track wal for planner? */ + span->node_counters.buffer_usage = pgBufferUsage; + span->node_counters.wal_usage = pgWalUsage; + } +} + +/* + * Set the start fields of a span + */ +void +set_span_start(const pgTracingTraceContext * trace_context, Span * span, int64 start_ns_time) +{ + int64 ns_since_trace_start; + + span->start_ns_time = start_ns_time; + + /* Get the ns between trace start and span start */ + ns_since_trace_start = start_ns_time - trace_context->start_trace.ns; + Assert(ns_since_trace_start >= 0); + + /* Fill span starts */ + span->start = trace_context->start_trace.ts + ns_since_trace_start / NS_PER_US; + span->start_ns = ns_since_trace_start % NS_PER_US; +} + +/* + * Initialize span and set span starting time. + * + * A span needs a start timestamp and a duration. + * Given that some spans can have a very short duration (less than 1us), we + * need to rely on monotonic clock as much as possible to have the best precision. + * + * For that, after we've established that a query was sampled, we get both the wall clock + * and the monotonic clock at the start of a trace. They are both stored in pgTracingTraceContext. + * All subsequent times will use the monotonic clock. + * When we need to get the start of a span, we use the starting wall clock as a starting point + * and add the time between the two monotonic clocks. + * + * start_span is an optional nanosecond time to set for. If nothing is provided, we use the current monotonic clock. + * This parameter is mostly used when generating spans from planstate as we need to rely on the query instrumentation to find the node start. + */ +void +begin_span(const pgTracingTraceContext * trace_context, Span * span, SpanType type, uint64 parent_id, uint64 query_id, const int64 *start_span) +{ + int64 start_ns_time; + + initialize_span_fields(span, type, trace_context, parent_id, query_id); + + /* If no start span is provided, get the current one */ + if (start_span == NULL) + start_ns_time = get_current_ns(); + else + start_ns_time = *start_span; + + set_span_start(trace_context, span, start_ns_time); +} + +/* + * Set span duration and accumulated buffers + * end_span_input is optional, if NULL is passed, we use + * the current time + */ +void +end_span(Span * span, const int64 *end_span_input) +{ + BufferUsage buffer_usage; + WalUsage wal_usage; + int64 end_span_ns; + + Assert(span->start_ns_time > 0); + Assert(span->trace_id > 0); + + /* Span should be ended only once */ + Assert(!span->ended); + span->ended = true; + + /* Set span duration with the end time before substrating the start */ + if (end_span_input == NULL) + end_span_ns = get_current_ns(); + else + end_span_ns = *end_span_input; + span->duration_ns = end_span_ns - span->start_ns_time; + + if (span->type == SPAN_PLANNER || span->type == SPAN_PROCESS_UTILITY) + { + /* calc differences of buffer counters. */ + memset(&buffer_usage, 0, sizeof(BufferUsage)); + BufferUsageAccumDiff(&buffer_usage, &pgBufferUsage, + &span->node_counters.buffer_usage); + span->node_counters.buffer_usage = buffer_usage; + + /* calc differences of WAL counters. */ + memset(&wal_usage, 0, sizeof(wal_usage)); + WalUsageAccumDiff(&wal_usage, &pgWalUsage, + &span->node_counters.wal_usage); + span->node_counters.wal_usage = wal_usage; + } +} + +/* + * Get the name of a span. + * If it is a node span, the name may be pulled from the stat file. + */ +const char * +get_span_name(const Span * span, const char *qbuffer, Size qbuffer_size) +{ + if (span->node_type_offset != -1 && qbuffer_size > 0 && span->node_type_offset <= qbuffer_size) + return qbuffer + span->node_type_offset; + + switch (span->type) + { + case SPAN_POST_PARSE: + return "Post Parse"; + case SPAN_PARSE: + return "Parse"; + case SPAN_PLANNER: + return "Planner"; + case SPAN_FUNCTION: + return "Function"; + case SPAN_PROCESS_UTILITY: + return "ProcessUtility"; + case SPAN_EXECUTOR_START: + return "Executor"; + case SPAN_EXECUTOR_RUN: + return "Executor"; + case SPAN_EXECUTOR_END: + return "Executor"; + case SPAN_EXECUTOR_FINISH: + return "Executor"; + + case SPAN_NODE_INIT_PLAN: + return "InitPlan"; + case SPAN_NODE_SUBPLAN: + return "SubPlan"; + + case SPAN_NODE_SELECT: + return "Select"; + case SPAN_NODE_INSERT: + return "Insert"; + case SPAN_NODE_UPDATE: + return "Update"; + case SPAN_NODE_DELETE: + return "Delete"; + case SPAN_NODE_MERGE: + return "Merge"; + case SPAN_NODE_UTILITY: + return "Utility"; + case SPAN_NODE_NOTHING: + return "Nothing"; + case SPAN_NODE_UNKNOWN: + return "Unknown"; + case SPAN_NODE: + return "Node"; + } + return "???"; +} + +/* + * Get the operation of a span. + * For node span, the name may be pulled from the stat file. + */ +const char * +get_operation_name(const Span * span, const char *qbuffer, Size qbuffer_size) +{ + if (span->operation_name_offset != -1 && qbuffer_size > 0 + && span->operation_name_offset <= qbuffer_size) + return qbuffer + span->operation_name_offset; + + switch (span->type) + { + case SPAN_POST_PARSE: + return "Post Parse"; + case SPAN_PARSE: + return "Parse"; + case SPAN_PLANNER: + return "Planner"; + case SPAN_FUNCTION: + return "Function"; + case SPAN_PROCESS_UTILITY: + return "ProcessUtility"; + case SPAN_EXECUTOR_START: + return "ExecutorStart"; + case SPAN_EXECUTOR_RUN: + return "ExecutorRun"; + case SPAN_EXECUTOR_END: + return "ExecutorEnd"; + case SPAN_EXECUTOR_FINISH: + return "ExecutorFinish"; + case SPAN_NODE_SELECT: + case SPAN_NODE_INSERT: + case SPAN_NODE_UPDATE: + case SPAN_NODE_DELETE: + case SPAN_NODE_MERGE: + case SPAN_NODE_UTILITY: + case SPAN_NODE_NOTHING: + case SPAN_NODE_INIT_PLAN: + case SPAN_NODE_SUBPLAN: + case SPAN_NODE_UNKNOWN: + case SPAN_NODE: + return "Node"; + } + return "Unknown type"; +} + +/* +* Get the current nanoseconds value +*/ +int64 +get_current_ns(void) +{ + instr_time t; + + INSTR_TIME_SET_CURRENT(t); + return INSTR_TIME_GET_NANOSEC(t); +} + +/* +* Get the end of a span in nanoseconds +*/ +int64 +get_span_end_ns(const Span * span) +{ + return span->start_ns_time + span->duration_ns; +} diff --git a/contrib/pg_tracing/pg_tracing_span.h b/contrib/pg_tracing/pg_tracing_span.h new file mode 100644 index 0000000000..9375972988 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing_span.h @@ -0,0 +1,167 @@ +/*------------------------------------------------------------------------- + * contrib/pg_tracing/pg_tracing_span.h + * Header for span. + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing_span.h + * + *------------------------------------------------------------------------- + */ +#ifndef _SPAN_H_ +#define _SPAN_H_ + +#include "postgres.h" + +#include "jit/jit.h" +#include "pgstat.h" +#include "access/transam.h" + +/* + * SpanType: Type of the span + */ +typedef enum SpanType +{ + SPAN_PARSE, /* Wraps query parsing */ + SPAN_POST_PARSE, /* Wraps post parsing */ + SPAN_PLANNER, /* Wraps planner execution in planner hook */ + SPAN_FUNCTION, /* Wraps function in fmgr hook */ + SPAN_PROCESS_UTILITY, /* Wraps ProcessUtility execution */ + + SPAN_EXECUTOR_START, /* Executor Spans wrapping the matching hooks */ + SPAN_EXECUTOR_RUN, + SPAN_EXECUTOR_END, + SPAN_EXECUTOR_FINISH, + + SPAN_NODE_INIT_PLAN, + SPAN_NODE_SUBPLAN, + + SPAN_NODE, /* Represents a node execution, generated from + * planstate */ + + SPAN_NODE_SELECT, /* Query Span types. They are created from the + * query cmdType */ + SPAN_NODE_INSERT, + SPAN_NODE_UPDATE, + SPAN_NODE_DELETE, + SPAN_NODE_MERGE, + SPAN_NODE_UTILITY, + SPAN_NODE_NOTHING, + SPAN_NODE_UNKNOWN, +} SpanType; + + +/* + * Counters extracted from query instrumentation + */ +typedef struct NodeCounters +{ + int64 rows; /* # of tuples processed */ + int64 nloops; /* # of cycles for this node */ + + BufferUsage buffer_usage; /* total buffer usage for this node */ + WalUsage wal_usage; /* total WAL usage for this node */ + JitInstrumentation jit_usage; /* total JIT usage for this node */ +} NodeCounters; + +/* + * Counters extracted from query's plan + */ +typedef struct PlanCounters +{ + /* + * estimated execution costs for plan (see costsize.c for more info) + */ + Cost startup_cost; /* cost expended before fetching any tuples */ + Cost total_cost; /* total cost (assuming all tuples fetched) */ + + /* + * planner's estimate of result size of this plan step + */ + Cardinality plan_rows; /* number of rows plan is expected to emit */ + int plan_width; /* average row width in bytes */ +} PlanCounters; + +/* + * Start time of a trace. We store both wall clock and monotonic clock. + */ +typedef struct pgTracingStartTime +{ + TimestampTz ts; /* Wall clock at the start of the trace */ + int64 ns; /* Monotonic clock at the start of the trace */ +} pgTracingStartTime; + +/* + * Trace context for an ongoing trace + */ +typedef struct pgTracingTraceContext +{ + uint64 trace_id; /* Id of the trace */ + uint64 parent_id; /* Span id of the parent */ + int sampled; /* Is current statement sampled? */ + uint64 query_id; /* Query id of the current statement */ + int root_span_index; /* span index for the root span */ + pgTracingStartTime start_trace; +} pgTracingTraceContext; + +/* + * The Span data structure represents an operation with a start, a duration + * and metadatas. + */ +typedef struct Span +{ + uint64 trace_id; /* Trace id extracted from the SQLCommenter's + * traceparent */ + uint64 span_id; /* Span Identifier generated from a random + * uint64 */ + uint64 parent_id; /* Span's parent id. For the top span, it's + * extracted from SQLCommenter's traceparent. + * For other spans, we pass the parent's span */ + uint64 query_id; /* QueryId of the trace query if available */ + + TimestampTz start; /* Start of the span */ + uint16 start_ns; /* Leftover nanoseconds of span start */ + int64 start_ns_time; /* Nanoseconds at start of the span. Internal + * usage only */ + + int64 duration_ns; /* Duration of the span in nanoseconds */ + SpanType type; /* Type of the span. Used to generate the + * span's name for all spans except SPAN_NODE */ + bool ended; /* Track if the span was already ended. + * Internal usage only */ + int be_pid; /* Pid of the backend process */ + Oid user_id; /* User ID when the span was created */ + Oid database_id; /* Database ID where the span was created */ + + uint8 subxact_count; /* Active count of backend's subtransaction */ + + /* + * We store variable size metadata in an external file. Those represent + * the position of NULL terminated strings in the file. Set to -1 if + * unused. + */ + Size node_type_offset; /* name of the node type */ + Size operation_name_offset; /* operation represented by the span */ + Size deparse_info_offset; /* deparsing info extracted from a + * plan node */ + Size parameter_offset; /* query parameters values */ + + PlanCounters plan_counters; /* Counters with plan costs */ + NodeCounters node_counters; /* Counters with node costs (jit, wal, + * buffers) */ + int64 startup; /* Time to the first tuple */ + int sql_error_code; /* query error code extracted from ErrorData, + * 0 if query was successful */ +} Span; + +extern void begin_span(const pgTracingTraceContext * trace_context, Span * span, SpanType type, uint64 parent_id, uint64 query_id, const int64 *start_span); +extern void end_span(Span * span, const int64 *end_time); +extern void set_span_start(const pgTracingTraceContext * trace_context, Span * span, int64 start_ns_time); + +extern SpanType command_type_to_span_type(CmdType cmd_type); +extern const char *get_span_name(const Span * span, const char *qbuffer, Size qbuffer_size); +extern const char *get_operation_name(const Span * span, const char *qbuffer, Size qbuffer_size); + +extern int64 get_current_ns(void); +extern int64 get_span_end_ns(const Span * span); + +#endif diff --git a/contrib/pg_tracing/sql/cleanup.sql b/contrib/pg_tracing/sql/cleanup.sql new file mode 100644 index 0000000000..7ac2eb43ef --- /dev/null +++ b/contrib/pg_tracing/sql/cleanup.sql @@ -0,0 +1,5 @@ +-- Drop test table and test functions +DROP TABLE pg_tracing_test; +DROP function test_function_project_set; +DROP function test_function_result; +DROP EXTENSION pg_tracing; diff --git a/contrib/pg_tracing/sql/extended.sql b/contrib/pg_tracing/sql/extended.sql new file mode 100644 index 0000000000..df198403f4 --- /dev/null +++ b/contrib/pg_tracing/sql/extended.sql @@ -0,0 +1,44 @@ +-- Trace everything +SET pg_tracing.sample_rate = 1.0; + +-- Simple query with extended protocol +SELECT $1, $2 \bind 1 2 \g + +-- Check generated spans +SELECT span_type, span_operation, parameters FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- Trigger an error due to mismatching number of parameters +BEGIN; select $1 \bind \g +ROLLBACK; + +-- Check that no spans were generated +SELECT span_type, span_operation, parameters FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- Execute queries with extended protocol within an explicit transaction +BEGIN; +SELECT $1 \bind 1 \g +SELECT $1, $2 \bind 2 3 \g +COMMIT; + +-- Spans within the same transaction should have been generated with the same trace_id +SELECT count(distinct(trace_id)) = 1 FROM pg_tracing_peek_spans; +-- Check generated spans order +SELECT span_type, span_operation, parameters FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- Mix extended protocol and simple protocol +BEGIN; +SELECT $1 \bind 1 \g +SELECT 5, 6, 7; +SELECT $1, $2 \bind 2 3 \g +COMMIT; + +-- Spans within the same transaction should have been generated with the same trace_id +SELECT count(distinct(trace_id)) = 1 FROM pg_tracing_peek_spans; +-- Check generated spans order +SELECT span_type, span_operation, parameters FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- gdesc calls a single parse command then execute a query. Make sure we handle this case +\gdesc + +-- Checking gdesc generated spans +SELECT span_type, span_operation, parameters FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; diff --git a/contrib/pg_tracing/sql/insert.sql b/contrib/pg_tracing/sql/insert.sql new file mode 100644 index 0000000000..d52343b92e --- /dev/null +++ b/contrib/pg_tracing/sql/insert.sql @@ -0,0 +1,26 @@ +-- Only trace queries with sample flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE TABLE IF NOT EXISTS pg_tracing_test_table_with_constraint (a int, b char(20), CONSTRAINT PK_tracing_test PRIMARY KEY (a)); + +-- Check create statement spans +SELECT span_type, span_operation from pg_tracing_consume_spans where trace_id=1 order by span_start, span_start_ns, span_operation; + +-- Simple insertion +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test_table_with_constraint VALUES(1, 'aaa'); + +-- Check insert spans +SELECT span_type, span_operation from pg_tracing_consume_spans where trace_id=1 order by span_start, span_start_ns, span_operation; + +-- Trigger constraint violation +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test_table_with_constraint VALUES(1, 'aaa'); + +-- Check violation spans +SELECT span_type, span_operation, sql_error_code from pg_tracing_consume_spans where trace_id=1 order by span_start, span_start_ns, span_operation; + +-- Trigger an error while calling pg_tracing_peek_spans which resets tracing +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test_table_with_constraint VALUES(length((select sql_error_code from public.pg_tracing_peek_spans)), 'aaa'); + +-- Nothing should be generated +SELECT span_type, span_operation, sql_error_code from pg_tracing_consume_spans where trace_id=1 order by span_start, span_start_ns, span_operation; diff --git a/contrib/pg_tracing/sql/nested.sql b/contrib/pg_tracing/sql/nested.sql new file mode 100644 index 0000000000..033704b7c3 --- /dev/null +++ b/contrib/pg_tracing/sql/nested.sql @@ -0,0 +1,204 @@ +-- Create test function to sample +CREATE OR REPLACE FUNCTION test_function_project_set(a int) RETURNS SETOF oid AS +$BODY$ +BEGIN + RETURN QUERY SELECT oid from pg_class where oid = a; +END; +$BODY$ +LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION test_function_result(a int, b text) RETURNS void AS +$BODY$ +BEGIN + INSERT INTO pg_tracing_test(a, b) VALUES (a, b); +END; +$BODY$ +LANGUAGE plpgsql; + + +-- Trace a statement with a function call +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000051-0000000000000051-01'*/ select test_function_project_set(1); + +-- The test function call will generate the following spans (non exhaustive list): +-- +---------------------------------------------------------------------------------------------------+ +-- | A: Select test_function_project_set(1); | +-- +-+--------++----------++----------------++--------------------------------------------------------++ +-- |B: Parse||C: Planner||D: ExecutorStart||E: ExecutorRun | +-- +--------++----------++----------------+++-----------------------------------------------------+-+ +-- |F: ProjectSet | +-- ++---------+---------+------------------------------+-+ +-- |G: Result|H: Parse | I: Select a from b where... | +-- +---------+---------+----+--------------+----------+ +-- |J: ExecutorRun| +-- +--------------+ + +-- Gather span_id, span start and span end of function call statement +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=81 and span_type!='Parse' \gset +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_a_id and span_type='Executor' and span_operation='ExecutorRun' \gset +SELECT span_id AS span_f_id, + get_span_start_ns(span_start, span_start_ns) as span_f_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_f_end + from pg_tracing_peek_spans where parent_id=:span_e_id and span_type='ProjectSet' \gset +SELECT span_id AS span_g_id, + get_span_start_ns(span_start, span_start_ns) as span_g_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_g_end + from pg_tracing_peek_spans where parent_id=:span_f_id and span_type='Result' \gset +SELECT span_id AS span_h_id, + get_span_start_ns(span_start, span_start_ns) as span_h_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_h_end + from pg_tracing_peek_spans where parent_id=:span_f_id and span_type='Parse' \gset +SELECT span_id AS span_i_id, + get_span_start_ns(span_start, span_start_ns) as span_i_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_i_end + from pg_tracing_peek_spans where parent_id=:span_f_id and span_type='Select' \gset +SELECT span_id AS span_j_id, + get_span_start_ns(span_start, span_start_ns) as span_j_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_j_end + from pg_tracing_peek_spans where parent_id=:span_i_id and span_operation='ExecutorRun' \gset + +-- Check that spans' start and end are within expection +SELECT :span_a_start < :span_e_start AS top_query_before_run, + :span_a_end >= :span_e_end AS top_ends_after_run_end, + + :span_e_start <= :span_f_start AS top_run_starts_before_project, + + :span_e_end >= :span_f_end AS top_run_ends_after_project_end, + :span_e_end >= :span_h_end AS top_run_ends_before_select_end, + :span_e_end >= :span_i_end AS top_run_ends_after_nested_run_end; + +SELECT + :span_g_end >= :span_h_start AS nested_result_ends_before_parse, + :span_h_end <= :span_i_start AS nested_parse_ends_before_select, + + :span_j_start >= :span_i_start AS run_starts_after_parent_select, + :span_j_end <= :span_i_end AS run_ends_after_select_end; + +-- Check that the root span is the longest one +WITH max_duration AS (select max(duration) from pg_tracing_peek_spans) +SELECT duration = max_duration.max from pg_tracing_peek_spans, max_duration + where span_id = :span_a_id; + +-- Check that ExecutorRun is attached to the nested top span +SELECT span_operation, deparse_info from pg_tracing_peek_spans where parent_id=:span_j_id order by span_operation; + +-- Check tracking with top tracking +set pg_tracing.track = 'top'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000052-0000000000000052-01'*/ select test_function_project_set(1); +SELECT count(*) from pg_tracing_consume_spans where trace_id=82; + +-- Check tracking with no tracking +set pg_tracing.track = 'none'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000053-0000000000000053-01'*/ select test_function_project_set(1); +SELECT count(*) from pg_tracing_consume_spans where trace_id=83; + +-- Reset tracking setting +set pg_tracing.track = 'all'; + +-- Create test procedure +CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ +DECLARE + r int; +BEGIN + SELECT (i + i)::int INTO r; +END; $$ LANGUAGE plpgsql; + +-- Test tracking of procedure with utility tracking enabled +set pg_tracing.track_utility=on; +/*traceparent='00-00000000000000000000000000000054-0000000000000054-01'*/ CALL sum_one(3); +SELECT span_operation from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + + +-- Test again with utility tracking disabled +set pg_tracing.track_utility=off; +/*traceparent='00-00000000000000000000000000000055-0000000000000055-01'*/ CALL sum_one(10); +SELECT span_operation from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- Create immutable function +CREATE OR REPLACE FUNCTION test_immutable_function(a int) RETURNS oid +AS 'SELECT oid from pg_class where oid = a;' +LANGUAGE sql IMMUTABLE; + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000056-0000000000000056-01'*/ select test_immutable_function(1); +SELECT span_operation from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- Create function with generate series + +CREATE OR REPLACE FUNCTION test_generate_series(IN anyarray, OUT x anyelement) + RETURNS SETOF anyelement + LANGUAGE sql + AS 'select * from pg_catalog.generate_series(array_lower($1, 1), array_upper($1, 1), 1)'; + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000057-0000000000000057-01'*/ select test_generate_series('{1,2,3,4}'::int[]) FROM (VALUES (1,2)); + +SELECT span_id AS span_project_set_id, + get_span_start_ns(span_start, span_start_ns) as span_project_set_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_project_set_end + from pg_tracing_peek_spans where span_type='ProjectSet' \gset + +SELECT span_id AS span_result_id, + get_span_start_ns(span_start, span_start_ns) as span_result_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_result_end + from pg_tracing_peek_spans where parent_id=:span_project_set_id and span_type='Result' \gset + +SELECT span_id AS span_parse, + get_span_start_ns(span_start, span_start_ns) as span_parse_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_parse_end + from pg_tracing_peek_spans where parent_id=:span_project_set_id and span_type='Parse' \gset + +-- Check that spans' start and end are within expection +SELECT :span_project_set_start < :span_parse_start AS project_set_starts_before_parse, + :span_result_start < :span_parse_start AS result_starts_before_parse, + :span_result_end <= :span_parse_start AS result_ends_before_parse, + :span_project_set_end > :span_parse_end AS project_set_ends_after_parse; + +SELECT span_operation from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- +---------------------------------------------------------------------------------------------------+ +-- | A: Select test_function(1); | +-- +-+--------++----------++----------------++--------------------------------------------------------++ +-- |B: Parse||C: Planner||D: ExecutorStart||E: ExecutorRun | +-- +--------++----------++----------------+++-----------------------------------------------------+-+ +-- |F: Result | +-- ++---------------+----------------------------------+-+ +-- |G: Parse | H: Insert INTO... | +-- +---------------+--------+--------------+----------+ +-- |I: ExecutorRun| +-- +--------------+ + +-- Check function with result node +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000058-0000000000000058-01'*/ select test_function_result(1, 'test'); + +-- Gather span_id, span start and span end of function call statement +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=88 and span_type!='Parse' \gset +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_a_id and span_type='Executor' and span_operation='ExecutorRun' \gset +SELECT span_id AS span_f_id, + get_span_start_ns(span_start, span_start_ns) as span_f_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_f_end + from pg_tracing_peek_spans where parent_id=:span_e_id and span_type='Result' \gset +SELECT span_id AS span_g_id, + get_span_start_ns(span_start, span_start_ns) as span_g_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_g_end + from pg_tracing_peek_spans where parent_id=:span_f_id and span_type='Parse' \gset +SELECT span_id AS span_h_id, + get_span_start_ns(span_start, span_start_ns) as span_h_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_h_end + from pg_tracing_peek_spans where parent_id=:span_f_id and span_type='Insert' \gset + +-- Check that parse span is correctly positioned +SELECT :span_g_start >= :span_f_start AS parse_start_after_result, + :span_g_end < :span_f_end AS parse_ends_before_result, + :span_g_end <= :span_h_start AS parse_ends_before_insert_node; + +SELECT span_operation from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; diff --git a/contrib/pg_tracing/sql/parallel.sql b/contrib/pg_tracing/sql/parallel.sql new file mode 100644 index 0000000000..216f813252 --- /dev/null +++ b/contrib/pg_tracing/sql/parallel.sql @@ -0,0 +1,26 @@ +begin; +-- encourage use of parallel plans +set local parallel_setup_cost=0; +set local parallel_tuple_cost=0; +set local min_parallel_table_scan_size=0; +set local max_parallel_workers_per_gather=2; + +-- Trace parallel queries +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ select 1 from pg_class limit 1; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ select 2 from pg_class limit 2; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-00'*/ select 3 from pg_class limit 3; +commit; + +-- Worker can take some additional time to end and report their spans +SELECT pg_sleep(0.2); + +-- Get root top span id +SELECT span_id as root_span_id from pg_tracing_peek_spans where span_type='Select' and trace_id=1 and parent_id=1 \gset +-- Get executor top span id +SELECT span_id as executor_span_id from pg_tracing_peek_spans where span_operation='ExecutorRun' and trace_id=1 and parent_id=:root_span_id \gset + +-- Check the select spans that are attached to the root top span +SELECT trace_id, span_type, span_operation from pg_tracing_peek_spans where span_type='Select' and parent_id=:executor_span_id order by span_operation; + +-- Check number of executor spans +SELECT count(*) from pg_tracing_consume_spans where span_type='Executor'; diff --git a/contrib/pg_tracing/sql/reset.sql b/contrib/pg_tracing/sql/reset.sql new file mode 100644 index 0000000000..531db2172d --- /dev/null +++ b/contrib/pg_tracing/sql/reset.sql @@ -0,0 +1,3 @@ +-- Check reset is working +SELECT pg_tracing_reset(); +SELECT traces from pg_tracing_info; diff --git a/contrib/pg_tracing/sql/sample.sql b/contrib/pg_tracing/sql/sample.sql new file mode 100644 index 0000000000..de713cb7a7 --- /dev/null +++ b/contrib/pg_tracing/sql/sample.sql @@ -0,0 +1,41 @@ +-- Trace nothing +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 0.0; + +-- Query with sampling flag +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000021-0000000000000021-01'*/ SELECT 1; +-- Query without trace context +SELECT 1; + +-- No spans should have been generated +select count(distinct(trace_id)) from pg_tracing_consume_spans; + +-- Enable full sampling +SET pg_tracing.sample_rate = 1.0; + +-- Generate queries with sampling flag on, off and no trace context +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000022-0000000000000022-01'*/ SELECT 1; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000023-0000000000000023-00'*/ SELECT 2; +SELECT 3; +SELECT 4; + +-- Check number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; +-- Check span order +select span_operation, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; +-- Top spans should reuse generated ids and have trace_id = parent_id +select span_operation, parameters from pg_tracing_consume_spans where trace_id = parent_id order by span_start, span_start_ns; + +-- Only trace query with sampled flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; + +-- Generate queries with sampling flag on, off and no trace context +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000024-0000000000000024-01'*/ SELECT 1; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000025-0000000000000025-00'*/ SELECT 2; +SELECT 1; + +-- Check number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; +-- Check span ordering +select span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns; diff --git a/contrib/pg_tracing/sql/select.sql b/contrib/pg_tracing/sql/select.sql new file mode 100644 index 0000000000..93be6faa2b --- /dev/null +++ b/contrib/pg_tracing/sql/select.sql @@ -0,0 +1,113 @@ +-- Only trace queries with sample flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; + +-- Run a simple query +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ SELECT 1; + +-- Get top span id +SELECT span_id AS top_span_id from pg_tracing_peek_spans where parent_id=1 and span_type!='Parse' \gset + +-- Check parameters +SELECT parameters from pg_tracing_peek_spans where span_id=:top_span_id; + +-- Check the number of children +SELECT count(*) from pg_tracing_peek_spans where parent_id=:'top_span_id'; + +-- Check span_operation +SELECT span_type, span_operation from pg_tracing_peek_spans where trace_id=1 order by span_start, span_start_ns, span_operation; + +-- Check count of query_id +SELECT count(distinct query_id) from pg_tracing_peek_spans where trace_id=1; + +-- Check reported number of trace +SELECT traces from pg_tracing_info; + +-- Trace a statement with function call +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ SELECT count(*) from current_database(); + +-- Check the generated span span_type, span_operation and order of function call +SELECT span_type, span_operation from pg_tracing_consume_spans where trace_id=3 order by span_operation; + +-- Trace a more complex query with multiple function calls +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000004-0000000000000004-01'*/ SELECT s.relation_size + s.index_size +FROM (SELECT + pg_relation_size(C.oid) as relation_size, + pg_indexes_size(C.oid) as index_size + FROM pg_class C) as s limit 1; + +-- Check the generated span span_type, span_operation and order of query with multiple function calls +SELECT span_type, span_operation from pg_tracing_consume_spans where trace_id=4 order by span_operation; + +-- Check that we're in a correct state after a timeout +set statement_timeout=200; + +-- Trace query triggering a statement timeout +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000007-0000000000000007-01'*/ select * from pg_sleep(10); +-- Cleanup statement setting +set statement_timeout=0; + +-- Trace a working query after the timeout to check we're in a consistent state +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000008-0000000000000008-01'*/ select 1; + +-- Check the spans order and error code +SELECT span_type, span_operation, sql_error_code from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- Create a prepare statement with comment passed as first parameter +PREPARE test_prepared (text, integer) AS /*$1*/ SELECT $2; +-- Execute prepare statement with trace context passed as a parameter +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000009-0000000000000009-01''', 1); + +-- Check generated spans and order +SELECT trace_id, span_type, span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +-- Execute prepare statement with trace context passed as a parameter and generic plan +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000010-0000000000000010-01''', 10); + +-- Cleanup +SET plan_cache_mode='auto'; +DEALLOCATE test_prepared; + +-- Check spans are generated even through generic plan +SELECT trace_id, span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- Run a statement with node not executed +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000a-000000000000000a-01'*/ select 1 limit 0; +-- Not executed node should not generate any spans +SELECT trace_id, span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- Test multiple statements in a single query +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000011-0000000000000012-01'*/ select 1; select 2; +-- Not executed node should not generate any spans +select span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns; + +-- Check that parameters are not exported when disabled +SET pg_tracing.export_parameters=false; +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000b-000000000000000b-01'*/ select 1, 2, 3; +SELECT trace_id, span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; +-- Reset export parameters setting +SET pg_tracing.export_parameters=true; + +-- Check the result of disabling deparse +SET pg_tracing.deparse_plan=false; +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000c-000000000000000c-01'*/ SELECT * from pg_tracing_test where a=1; +SET pg_tracing.deparse_plan=true; +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000d-000000000000000d-01'*/ SELECT * from pg_tracing_test where a=1; +SELECT trace_id, span_operation, deparse_info, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- Check multi statement query +SET pg_tracing.sample_rate = 1.0; +-- Force a multi-query statement with \; +SELECT 1\; SELECT 1, 2; +SELECT span_operation, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + + +-- Check standalone trace +SELECT 1; +-- Make sure we have unique span ids +select count(span_id) from pg_tracing_consume_spans group by span_id; + +-- Cleanup +SET pg_tracing.sample_rate = 0.0; diff --git a/contrib/pg_tracing/sql/subxact.sql b/contrib/pg_tracing/sql/subxact.sql new file mode 100644 index 0000000000..8b48f76903 --- /dev/null +++ b/contrib/pg_tracing/sql/subxact.sql @@ -0,0 +1,15 @@ +-- Enable full sampling +SET pg_tracing.sample_rate = 1.0; + +-- Start a transaction with subxaction +BEGIN; +SAVEPOINT s1; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s2; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s3; +SELECT 1; +COMMIT; + +-- Check that subxact_count is correctly reported +select span_operation, parameters, subxact_count from pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; diff --git a/contrib/pg_tracing/sql/trigger.sql b/contrib/pg_tracing/sql/trigger.sql new file mode 100644 index 0000000000..445541dabe --- /dev/null +++ b/contrib/pg_tracing/sql/trigger.sql @@ -0,0 +1,204 @@ +-- Only trace queries with sample flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; + +-- Create test tables +CREATE TABLE Employee ( + EmployeeId INT NOT NULL, + LastName VARCHAR(20) NOT NULL, + CONSTRAINT PK_Employee PRIMARY KEY (EmployeeId)); +CREATE TABLE Employee_Audit ( + EmployeeId INT NOT NULL, + LastName VARCHAR(20) NOT NULL, + EmpAdditionTime VARCHAR(20) NOT NULL); + +-- Create test trigger +CREATE OR REPLACE FUNCTION employee_insert_trigger_fnc() + RETURNS trigger AS +$$ +BEGIN + INSERT INTO Employee_Audit (EmployeeId, LastName, EmpAdditionTime) + VALUES(NEW.EmployeeId,NEW.LastName,current_date); +RETURN NEW; +END; +$$ +LANGUAGE plpgsql; + +-- Hook the trigger twice +CREATE TRIGGER employee_insert_trigger + AFTER INSERT + ON Employee + FOR EACH ROW + EXECUTE PROCEDURE employee_insert_trigger_fnc(); + +CREATE TRIGGER employee_insert_trigger_2 + AFTER INSERT + ON Employee + FOR EACH ROW + EXECUTE PROCEDURE employee_insert_trigger_fnc(); + + +-- Call update +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO Employee VALUES(10,'Adams'); + + +-- +-- +--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- |A: Parse| B: INSERT INTO Employee | +-- +--------++----------+-+--------------------------+-----------------------------------------------------------------------------------------------------------------+---------------++ +-- |C: Planner| |D: ExecutorRun | G: ExecutorFinish |R: ExecutorEnd | +-- +----------+ ++-----------------------+-+--+--------+-------------------------------------------------------------+-+-------------------------------------+---------------+ +-- |E: Insert into employee| |H: Parse| I: INSERT INTO Employee_Audit... | | O: INSERT INTO Employee_Audit... | +-- ++---------+------------+ +--------+-+----------++----------------------------+---+--------------+ ++---------------+-----+--------------+ +-- |F: Result| |J: Planner||K: ExecutorRun |...|N: ExecutorEnd| |P: ExecutorRun | ... |Q: ExecutorEnd| +-- +---------+ +----------+++---------------------------+ +--------------+ ++--------------+ +--------------+ +-- |L: Insert on employee audit| | ... | +-- +-+---------+---------------+ +--------------+ +-- |M: Result| +-- +---------+ + +-- Gather span_id, span start and span end of call with triggers +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=1 and span_type='Parse' \gset +SELECT span_id AS span_b_id, + get_span_start_ns(span_start, span_start_ns) as span_b_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_b_end + from pg_tracing_peek_spans where parent_id=1 and span_type!='Parse' \gset +SELECT span_id AS span_c_id, + get_span_start_ns(span_start, span_start_ns) as span_c_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_c_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Planner' \gset +SELECT span_id AS span_d_id, + get_span_start_ns(span_start, span_start_ns) as span_d_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_d_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Executor' and span_operation='ExecutorRun' \gset +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_d_id and span_type='Insert' \gset +SELECT span_id AS span_f_id, + get_span_start_ns(span_start, span_start_ns) as span_f_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_f_end + from pg_tracing_peek_spans where parent_id=:span_e_id and span_type='Result' \gset + +-- Executor Finish, first trigger +SELECT span_id AS span_g_id, + get_span_start_ns(span_start, span_start_ns) as span_g_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_g_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Executor' and span_operation='ExecutorFinish' \gset +SELECT span_id AS span_h_id, + get_span_start_ns(span_start, span_start_ns) as span_h_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_h_end + from pg_tracing_peek_spans where parent_id=:span_g_id and span_type='Parse' \gset +SELECT span_id AS span_i_id, + get_span_start_ns(span_start, span_start_ns) as span_i_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_i_end + from pg_tracing_peek_spans where parent_id=:span_g_id and span_type='Insert' limit 1 \gset +SELECT span_id AS span_j_id, + get_span_start_ns(span_start, span_start_ns) as span_j_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_j_end + from pg_tracing_peek_spans where parent_id=:span_i_id and span_type='Planner' \gset +SELECT span_id AS span_k_id, + get_span_start_ns(span_start, span_start_ns) as span_k_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_k_end + from pg_tracing_peek_spans where parent_id=:span_i_id and span_type='Executor' and span_operation='ExecutorRun' \gset +SELECT span_id AS span_l_id, + get_span_start_ns(span_start, span_start_ns) as span_l_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_l_end + from pg_tracing_peek_spans where parent_id=:span_k_id and span_type='Insert' \gset +SELECT span_id AS span_m_id, + get_span_start_ns(span_start, span_start_ns) as span_m_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_m_end + from pg_tracing_peek_spans where parent_id=:span_l_id and span_type='Result' \gset +SELECT span_id AS span_n_id, + get_span_start_ns(span_start, span_start_ns) as span_n_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_n_end + from pg_tracing_peek_spans where parent_id=:span_i_id and span_type='Executor' and span_operation='ExecutorEnd' \gset + +-- Executor Finish, second trigger +SELECT span_id AS span_o_id, + get_span_start_ns(span_start, span_start_ns) as span_o_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_o_end + from pg_tracing_peek_spans where parent_id=:span_g_id and span_type='Insert' limit 1 offset 1 \gset +SELECT span_id AS span_p_id, + get_span_start_ns(span_start, span_start_ns) as span_p_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_p_end + from pg_tracing_peek_spans where parent_id=:span_o_id and span_type='Executor' and span_operation='ExecutorRun' \gset +SELECT span_id AS span_q_id, + get_span_start_ns(span_start, span_start_ns) as span_q_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_q_end + from pg_tracing_peek_spans where parent_id=:span_o_id and span_type='Executor' and span_operation='ExecutorEnd' \gset + +-- End +SELECT span_id AS span_r_id, + get_span_start_ns(span_start, span_start_ns) as span_r_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_r_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Executor' and span_operation='ExecutorEnd' \gset + +-- Check that spans' start and end are within expection +SELECT +-- Root span + :span_a_end <= :span_b_start as parse_before_root_span, +-- Planner + :span_c_start >= :span_b_start as planner_starts_after_root_span, +-- ExecutorRun + :span_d_start >= :span_c_end as executor_run_starts_after_planner, + :span_d_end <= :span_g_start as executor_run_ends_before_finish, +-- First Insert + :span_e_start >= :span_d_start as insert_starts_after_executor_run, + :span_e_start <= :span_f_start as insert_ends_before_executor_run, + :span_e_end >= :span_f_end as insert_starts_before_child, + :span_e_end <= :span_d_end as insert_ends_after_child, +-- ExecutorFinish + :span_g_start <= :span_i_start as executor_finish_starts_before_child, + :span_g_end >= :span_o_end as executor_finish_ends_after_child, +-- First trigger + :span_h_end <= :span_i_start as parse_ends_before_first_trigger, + :span_i_end >= :span_n_end as first_trigger_ends_after_child, +-- Second trigger + :span_o_start >= :span_i_end as second_trigger_starts_after_first, + :span_o_end >= :span_q_end as second_trigger_ends_after_child, + :span_q_end <= :span_r_start as second_trigger_ends_before_executor_end, + :span_o_end <= :span_r_start as second_trigger_ends_before_root_executor_end, +-- Root ExecutorEnd + :span_r_start >= :span_g_end as executor_end_start_after_executor_finish, + :span_r_end <= :span_b_end as executor_end_ends_before_root; + +-- Check span_operation +SELECT span_type, span_operation from pg_tracing_peek_spans where trace_id=1 order by span_start, span_start_ns, span_operation; + +-- Check count of query_id +SELECT count(distinct query_id) from pg_tracing_consume_spans where trace_id=1; + + +-- Second call, parse should not be present anymore +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO Employee VALUES(11,'Adams'); + +SELECT span_id AS span_b_id, + get_span_start_ns(span_start, span_start_ns) as span_b_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_b_end + from pg_tracing_peek_spans where parent_id=1 and span_type!='Parse' \gset +SELECT span_id AS span_d_id, + get_span_start_ns(span_start, span_start_ns) as span_d_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_d_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Executor' and span_operation='ExecutorRun' \gset +SELECT span_id AS span_g_id, + get_span_start_ns(span_start, span_start_ns) as span_g_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_g_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Executor' and span_operation='ExecutorFinish' \gset + +-- Check we don't have Parse span anymore +SELECT count(*) = 0 from pg_tracing_peek_spans where parent_id=:span_g_id and span_type='Parse'; +-- Insert should be attached to the correct parent +SELECT count(*) = 1 from pg_tracing_peek_spans where parent_id=:span_g_id and span_type='Insert' limit 1 \gset + +-- Check that ExecutorRun ends before ExecutorFinish start +SELECT :span_d_end <= :span_g_start as finish_start_after_run; + +-- Check generated spans for the second call with trigger +SELECT span_type, span_operation from pg_tracing_peek_spans where trace_id=1 order by span_start, span_start_ns, span_operation; + +-- Check count of query_id +SELECT count(distinct query_id) from pg_tracing_consume_spans where trace_id=1; diff --git a/contrib/pg_tracing/sql/utility.sql b/contrib/pg_tracing/sql/utility.sql new file mode 100644 index 0000000000..05ce7527df --- /dev/null +++ b/contrib/pg_tracing/sql/utility.sql @@ -0,0 +1,144 @@ +-- Some helper functions +CREATE OR REPLACE FUNCTION get_span_start_ns(time_start timestamptz, start_ns smallint) RETURNS bigint AS +$BODY$ +BEGIN + RETURN extract(epoch from time_start) * 1000000000 + start_ns; +END; +$BODY$ +LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION get_span_end_ns(time_start timestamptz, start_ns smallint, duration bigint) RETURNS bigint AS +$BODY$ +BEGIN + RETURN get_span_start_ns(time_start, start_ns) + duration; +END; +$BODY$ +LANGUAGE plpgsql; + +-- Create pg_tracing extension with sampling on +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; + +-- This will create the following spans (non exhaustive list): +-- +-- +-------------------------------------------------------------------------------------+ +-- | A: Utility: Create Extension | +-- +-+-----------------------------------------------------------------------------------+ +-- +----------------------------------------------------------------------------------+ +-- |B: ProcessUtility: Create Extension | +-- +---+-----------------------------------+---+--------------------------------------+ +-- +-----------------------------------+ +-------------------------------------+ +-- |C: Utility: Create Function1 | |E: Utility: Create Function2 | +-- ++----------------------------------+ ++-----------------------------------++ +-- +----------------------------------+ +-----------------------------------+ +-- |D: ProcessUtility: Create Function| |F: ProcessUtility: Create Function2| +-- +----------------------------------+ +-----------------------------------+ + +-- Extract span_ids, start and end of those spans +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=1 and span_type='Utility' \gset + +SELECT span_id AS span_b_id, + get_span_start_ns(span_start, span_start_ns) as span_b_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_b_end + from pg_tracing_peek_spans where parent_id=:span_a_id and span_type='ProcessUtility' \gset + +SELECT span_id AS span_c_id, + get_span_start_ns(span_start, span_start_ns) as span_c_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_c_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Utility' limit 1 \gset + +SELECT span_id AS span_d_id, + get_span_start_ns(span_start, span_start_ns) as span_d_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_d_end + from pg_tracing_peek_spans where parent_id=:span_c_id and span_type='ProcessUtility' \gset + +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_b_id and span_type='Utility' limit 1 offset 1 \gset + +-- Check that the start and end of those spans are within expectation +SELECT :span_a_start < :span_b_start AS span_a_start_first, + :span_a_end >= :span_b_end AS span_a_end_last, + + :span_d_end <= :span_c_end AS nested_span_ends_before_parent, + :span_c_end <= :span_e_start AS next_utility_start_after; + +-- Clean current spans +select count(*) from pg_tracing_consume_spans; + +-- +-- Test that no utility is captured with track_utility off +-- + +-- Set utility off +SET pg_tracing.track_utility = off; + +-- Prepare and execute a prepared statement +PREPARE test_prepared_one_param (integer) AS SELECT $1; +EXECUTE test_prepared_one_param(100); + +-- Nothing should be generated +select count(*) from pg_tracing_consume_spans; + +-- Force a query to start from ExecutorRun +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + +-- Again, nothing should be generated +select count(*) from pg_tracing_consume_spans; + +-- +-- Test that no utility is captured with track_utility off +-- + +-- Enable utility tracking and track everything +SET pg_tracing.track_utility = on; +SET pg_tracing.sample_rate = 1.0; + +-- Prepare and execute a prepared statement +PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; +EXECUTE test_prepared_one_param_2(100); + +-- Check the number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; +-- Check that the spans are in correct order +select span_operation, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; +-- Check the top span (standalone top span has trace_id=parent_id) +select span_operation, parameters from pg_tracing_consume_spans where trace_id = parent_id order by span_start, span_start_ns; + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + +-- Check the number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; +-- Check that the spans are in correct order +select span_operation, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; +-- Check the top span (standalone top span has trace_id=parent_id) +select span_operation, parameters from pg_tracing_consume_spans where trace_id = parent_id order by span_start, span_start_ns; + +-- Second create extension should generate an error that is captured by span +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; +select span_operation, parameters, sql_error_code from pg_tracing_consume_spans order by span_start, span_start_ns; + +-- Create test table +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE TABLE pg_tracing_test (a int, b char(20)); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ CREATE INDEX pg_tracing_index ON pg_tracing_test (a); + +-- Check create table and index spans +select trace_id, span_type, span_operation from pg_tracing_consume_spans order by span_start, span_start_ns, duration desc; + +-- Cleanup +SET plan_cache_mode='auto'; + + +CREATE OR REPLACE FUNCTION lazy_function(IN anyarray, OUT x anyelement, OUT n int) + RETURNS SETOF RECORD + LANGUAGE sql STRICT IMMUTABLE PARALLEL SAFE + AS 'select s from pg_catalog.generate_series(1, 1, 1) as g(s)'; + +select lazy_function('{1,2,3}'::int[]); + diff --git a/contrib/pg_tracing/sql/wal.sql b/contrib/pg_tracing/sql/wal.sql new file mode 100644 index 0000000000..c2e2804395 --- /dev/null +++ b/contrib/pg_tracing/sql/wal.sql @@ -0,0 +1,29 @@ +-- Enable wal instrumentation +set pg_tracing.instrument_wal = true; + +-- Generate queries with wal write +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; + +-- Check WAL is generated for the above statements +SELECT trace_id, span_type, span_operation, deparse_info, + wal_records > 0 as wal_records, + wal_bytes > 0 as wal_bytes +FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- Redo the same but without wal instrumentation +set pg_tracing.instrument_wal = false; + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; + +-- With wal instrumentation disabled, wal counters should be set to 0 +SELECT trace_id, span_type, span_operation, deparse_info, + wal_records = 0 as wal_records, + wal_bytes = 0 as wal_bytes +FROM pg_tracing_consume_spans order by span_start, span_start_ns, span_operation; + +-- Cleanup +set pg_tracing.instrument_wal = true; diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml index ab7e38b52a..353a090a5b 100644 --- a/doc/src/sgml/contrib.sgml +++ b/doc/src/sgml/contrib.sgml @@ -160,6 +160,7 @@ CREATE EXTENSION extension_name; &pgstatstatements; &pgstattuple; &pgsurgery; + &pgtracing; &pgtrgm; &pgvisibility; &pgwalinspect; diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index 4c63a7e768..1034cc62ea 100644 --- a/doc/src/sgml/filelist.sgml +++ b/doc/src/sgml/filelist.sgml @@ -148,6 +148,7 @@ + diff --git a/doc/src/sgml/pgtracing.sgml b/doc/src/sgml/pgtracing.sgml new file mode 100644 index 0000000000..b65b16abb0 --- /dev/null +++ b/doc/src/sgml/pgtracing.sgml @@ -0,0 +1,1035 @@ + + + + pg_tracing — generate server-side spans for distributed tracing + + + pg_tracing + + + + The pg_tracing module provides a way to + generate server-side spans for distributed tracing. + + + + The module must be loaded by adding pg_tracing to + in + postgresql.conf because it requires additional shared memory. + This means that a server restart is needed to add or remove the module. + + + + When pg_tracing is active, it generates spans + on sampled queries. To access these spans, the module provides two views: + pg_tracing_consume_spans and + pg_tracing_peek_spans. + The utility functions pg_tracing_reset and + pg_tracing_info provide ways to read and reset + module's statistics. These are not available globally but + can be enabled for a specific database with + CREATE EXTENSION pg_tracing. + + + + The <structname>pg_tracing_consume_spans</structname>/<structname>pg_tracing_peek_spans</structname> Views + + + The spans generated by the module are made available via a + view named pg_tracing_consume_spans. + This view contains one row for each generated span. + The columns of the view are shown in . + The pg_tracing_peek_spans view + provides the same output as pg_tracing_consume_spans. + pg_tracing_peek_spans doesn't delete read spans. + + + + <structname>pg_tracing_consume_spans</structname> Columns + + + + + Column Type + + + Description + + + + + + + + traceid bigint + + + Trace identifier of the span + + + + + + parent_id bigint + + + Identifier of the span's parent + + + + + + span_id bigint + + + Identifier of the span + + + + + + queryid bigint + + + Hash code to identify identical normalized queries, 0 if query_id is disabled + + + + + + span_type text + + + Type of span (Parse, Planner, Executor...) + + + + + + span_operation text + + + Name of the span's operation + + + + + + span_start timestamp with time zone + + + Start time of the span + + + + + + span_start_ns smallint + + + Nanoseconds part of start time + + + + + + duration bigint + + + Duration of the span in nanoseconds + + + + + + sql_error_code bpchar + + + Error code of the query. 00000 for a succesful query + + + + + + pid integer + + + Pid of the backend process that processed the query + + + + + + subxact_count smallint + + + Active count of backend's subtransaction when the span was created + + + + + + plan_startup_cost double precision + + + Estimated cost before fetching any tuples by the query planner + + + + + + plan_total_cost double precision + + + Estimated total cost by the query planner + + + + + + plan_rows double precision + + + Estimated number of row plan is expected to emit + + + + + + plan_width integer + + + Estimated row width in bytes by the query planner + + + + + + rows bigint + + + Number of tuples processed + + + + + + nloops bigint + + + Number of cycles for this node + + + + + + shared_blks_hit bigint + + + Total number of shared block cache hits by the node + + + + + + shared_blks_read bigint + + + Total number of shared blocks read by the node + + + + + + shared_blks_dirtied bigint + + + Total number of shared blocks dirtied by the node + + + + + + shared_blks_written bigint + + + Total number of shared blocks written by the node + + + + + + local_blks_hit bigint + + + Total number of local block cache hits by the node + + + + + + local_blks_read bigint + + + Total number of local blocks read by the node + + + + + + local_blks_dirtied bigint + + + Total number of local blocks dirtied by the node + + + + + + local_blks_written bigint + + + Total number of local blocks written by the node + + + + + + blk_read_time bigint + + + Time spent reading blocks in milliseconds + + + + + + blk_write_time bigint + + + Time spent writing blocks in milliseconds + + + + + + temp_blks_read bigint + + + Total number of temp blocks read by the node + + + + + + temp_blks_written bigint + + + Total number of temp blocks written by the node + + + + + + temp_blk_read_time double precision + + + Total time the node spent reading temporary file blocks, in + milliseconds (if is enabled, + otherwise zero) + + + + + + temp_blk_write_time double precision + + + Total time the node spent writing temporary file blocks, in + milliseconds (if is enabled, + otherwise zero) + + + + + + wal_records bigint + + + Total number of WAL records generated by the node + + + + + + wal_fpi bigint + + + Total number of WAL full page images generated by the node + + + + + + wal_bytes numeric + + + Total amount of WAL generated by the node in bytes + + + + + + jit_functions bigint + + + Total number of functions JIT-compiled by the node + + + + + + jit_generation_time double precision + + + Total time spent by the node on generating JIT code, in milliseconds + + + + + + jit_inlining_time double precision + + + Total time spent by the node on inlining functions, in milliseconds + + + + + + jit_optimization_time double precision + + + Total time spent by the node on optimizing, in milliseconds + + + + + + startup bigint + + + Time to the first tuple in nanoseconds + + + + + + parameters text + + + Value of the query's parameters + + + + + + deparse_info text + + + Informations extracted from deparsing a plan node + + + + + +
+ + + For security reasons, only superusers and roles with privileges of the + pg_read_all_stats role are allowed to access the + pg_tracing_peek_spans and + pg_tracing_consume_spans. + + +
+ + + The <structname>pg_tracing_info</structname> View + + + pg_tracing_info + + + + The statistics of the pg_tracing module + itself are tracked and made available via a view named + pg_tracing_info. This view contains + only a single row. The columns of the view are shown in + . + + + + <structname>pg_tracing_info</structname> Columns + + + + + Column Type + + + Description + + + + + + + + traces bigint + + + Total number traces captured by the module + + + + + + spans bigint + + + Total number spans captured by the module + + + + + + dropped_spans bigint + + + Total number of spans dropped because more than + pg_tracing.max_span were stored + + + + + + failed_truncates bigint + + + Total number of times the module couldn't truncate the + query file due to a conflicting lock + + + + + + last_consume timestamp with time zone + + + Time at which spans were consumed + + + + + + stats_reset timestamp with time zone + + + Time at which all statistics in the + pg_tracing_info view were last reset. + + + + + +
+
+ + + Functions + + + + + pg_tracing_reset() returns void + + pg_tracing_reset + + + + + + pg_tracing_reset discards all statistics + gathered so far by pg_tracing. + Span buffer is not emptied by this function. + By default, this function can only be executed by superusers. + Access may be granted to others using GRANT. + + + + + + + pg_tracing_spans(consume boolean) returns setof record + + pg_tracing + function + + + + + + The pg_tracing_consume_spans and + pg_tracing_peek_spans views are defined in + terms of a function named pg_tracing_spans. + It is possible for clients to call the + pg_tracing_spans function directly. By + specifying consume := false, spans won't be removed + from the span buffer. With consume := true, + read spans will be deleted. + + + + + + + + Configuration Parameters + + + + + pg_tracing.max_span (integer) + + pg_tracing.max_span configuration parameter + + + + + + pg_tracing.max_span is the maximum number of + spans stored by the module (i.e., the maximum number of rows + returned by the pg_tracing_spans function). + If more spans are observed, the span buffer will be emptied if + pg_tracing.drop_on_full_buffer is true. If + pg_tracing.drop_on_full_buffer if false, the spans + will be dropped and tracing will be aborted. + The default value is 5000. This parameter can only be set at server start. + + + + + + + pg_tracing.drop_on_full_buffer (boolean) + + pg_tracing.drop_on_full_buffer configuration parameter + + + + + + pg_tracing.drop_on_full_buffer controls whether + span buffer should be emptied when pg_tracing.max_span + spans is reached. If off, new spans are dropped. + The default value is on. + + + + + + + pg_tracing.max_parameter_size (integer) + + pg_tracing.max_parameter_size configuration parameter + + + + + + pg_tracing.max_parameter_size controls the maximum + size of the parameter string. + The default value is 1024. + + + + + + + pg_tracing.instrument_buffers (boolean) + + pg_tracing.instrument_buffers configuration parameter + + + + + + pg_tracing.instrument_buffers controls whether + buffer instrumentation should be enable and exported in span metadata. + The default value is on. + + + + + + + pg_tracing.instrument_wal (boolean) + + pg_tracing.instrument_wal configuration parameter + + + + + + pg_tracing.instrument_wal controls whether + WAL instrumentation should be enable and exported in span metadata. + The default value is on. + + + + + + + pg_tracing.max_traced_parallel_workers (integer) + + pg_tracing.max_traced_parallel_workers configuration parameter + + + + + + pg_tracing.max_traced_parallel_workers controls the maximum + number of simultaneous parallel workers that can be traced. + The default value is 100. + Only superusers can change this setting. + This parameter can only be set at server start. + + + + + + + pg_tracing.initial_allocated_spans (integer) + + pg_tracing.initial_allocated_spans configuration parameter + + + + + + pg_tracing.initial_allocated_spans controls the number + of allocated spans at the start of a trace. + The default value is 25. + + + + + + + pg_tracing.track (enum) + + pg_tracing.track configuration parameter + + + + + + pg_tracing.track controls which statements + should spans should be generated. + Specify top to only generate span for top-level + statements (those issued directly by clients), all + to also generate spans for nested statements (such as statements invoked + within functions), or none to disable span generation. + The default value is all. + + + + + + + pg_tracing.track_utility (boolean) + + pg_tracing.track_utility configuration parameter + + + + + + pg_tracing.track_utility controls whether spans + should be generated for utility statements. Utility commands are + all those other than SELECT, INSERT, + UPDATE, DELETE, and MERGE. + The default value is on. + + + + + + + pg_tracing.sample_rate (real) + + pg_tracing.sample_rate configuration parameter + + + + + + pg_tracing.sample_rate controls the fraction of + statements that generate spans. Statements with tracecontext propagated + with SQLCommenter and sampled flag enabled are not impacted by this parameter. + In case of nested statements, either all will be explained or none. + The default value is 0. + + + + + + + pg_tracing.caller_sample_rate (real) + + pg_tracing.caller_sample_rate configuration parameter + + + + + + pg_tracing.caller_sample_rate controls the fraction of + statements with with tracecontext propagated with SQLCommenter and an + enabled sampled flag that will generate spans. + The default value is 1. + + + + + + + pg_tracing.export_parameters (boolean) + + pg_tracing.export_parameters configuration parameter + + + + + + pg_tracing.export_parameters controls whether + query's parameters should be exported in spans metadata. + The default value is on. + + + + + + + pg_tracing.deparse_plan (boolean) + + pg_tracing.deparse_plan configuration parameter + + + + + + pg_tracing.deparse_plan controls whether + the query plan should be deparsed. Deparsing plan allows to + add more details in span's name. A + BitmapHeapScan on pg_tracing_test node will + become BitmapHeapScan on pg_tracing_test|Recheck Cond: (a = 1) + with deparsing information. + The default value is on. + + + + + + + pg_tracing.notify_channel (text) + + pg_tracing.notify_channel configuration parameter + + + + + + pg_tracing.notify_channel defines a channel + that will receive a notification when the span buffer exceeds + the pg_tracing.notify_threshold threshold. + No channel is defined by default. + + + + + + + pg_tracing.notify_threshold (real) + + pg_tracing.notify_threshold configuration parameter + + + + + + pg_tracing.notify_threshold defines the + threhold for sending notification to the pg_tracing.notify_channel + channel. + The default value is 0.8: a notification will be sent when span buffer + crosses the 80% of usage. + + + + + + + The module requires additional shared memory proportional to + pg_tracing.max_span. Note that this + memory is consumed whenever the module is loaded, even if + no spans are generated. + + + + These parameters must be set in postgresql.conf. + Typical usage might be: + + +# postgresql.conf +shared_preload_libraries = 'pg_tracing' + +compute_query_id = on +pg_tracing.span_max = 10000 +pg_tracing.track = all + + + + + + Implementation Details + + + SQLCommenter + + A query with SQLcommenter can propagate a trace context to the database. It will have the following format: /*dddbs='postgres.db',traceparent='00-00000000000000000000000000000009-0000000000000005-01'*/ select 1;. The traceparent fields are detailed in w3c's trace-context + + + + 00000000000000000000000000000009: trace id + + + + + 0000000000000005: parent id + + + + + 01: trace flags (01 == sampled) + + + + When a query with SQLCommenter is detected, the trace context is extracted and used by pg_tracing. + + + + + Sampling Decision + + Spans will only be generated for sampled queries. A query is sampled if: + + + + It has a trace context propagated throught SQLCommenter with the sampled flag enabled and it passes the pg_tracing.caller_sample_rate + + + + + It has no SQLCommenter but the query randomly passes the global pg_tracing.sample_rate + + + + By default, pg_tracing.caller_sample_rate = 1 and pg_tracing.sample_rate = 0. + Only queries with a trace context and a sampled := 01 will be sampled, effectively leaving sampling decision to the callers. + + + + + Parse Span + + Parse spans generated by pg_tracing are an approximation. + There's no parse hook available, only post parse hook is available. + The end of the parse is estimated at the start of the post parse. + For statement at the root level, the statement start is used as parse's start. + For nested statements, the start of the new nested level is used. + + + + + Span Time Precision + + The start time of a span has two parts: span_start and + span_start_ns. span_start is a Timestamp + with time zone providing up a microsecond precision. span_start_ns + provides the nanosecond part of the span start. span_start + span_start_ns + can be used to get the start with nanosecond precision. + + + + + Query error + + On query error (statement timeout reached, query canceled...), a sampled + query will still generate spans. The planstate will be partial so + the generated spans only reflect nodes that were executed and it's + possible that some part of the plan will be missing. + + + + + Overhead + + Sampling a query will enable the same type of query instrumentation as + EXPLAIN ANALYZE and will have the same measurement overhead. + Machines with slow gettimeofday() operating-system calls + will have higher overhead. You can use the tool + to measure the overhead of timing on your system. + + + + + + + Sample Output + + +=# SET pg_tracing.sample_rate = 1.0; + +=# SELECT 1; + +=# select trace_id, parent_id, span_id, span_start, duration, span_type, span_operation from pg_tracing_consume_spans; + trace_id | parent_id | span_id | span_start | duration | span_type | span_operation +----------------------+----------------------+----------------------+-------------------------------+----------+----------------+----------------------------------- + 479062278978111363 | 479062278978111363 | 479062278978111363 | 2023-10-12 07:41:13.938561+00 | 555583 | Select | select $1; + 479062278978111363 | 479062278978111363 | 479062278978111363 | 2023-10-12 07:41:13.938036+00 | 525875 | Parse | Parse + 479062278978111363 | 479062278978111363 | -7732425595282414585 | 2023-10-12 07:41:13.938561+00 | 20083 | Post Parse | Post Parse + 479062278978111363 | 479062278978111363 | 3166770036980751499 | 2023-10-12 07:41:13.938592+00 | 397250 | Planner | Planner + 479062278978111363 | 479062278978111363 | -672650609503721668 | 2023-10-12 07:41:13.939001+00 | 59541 | Executor | ExecutorStart + 479062278978111363 | 479062278978111363 | -7168301816080933795 | 2023-10-12 07:41:13.939064+00 | 21750 | Executor | ExecutorRun + 479062278978111363 | 479062278978111363 | 2947496540612753771 | 2023-10-12 07:41:13.939087+00 | 1333 | Executor | ExecutorFinish + 479062278978111363 | 479062278978111363 | -4760927743519507589 | 2023-10-12 07:41:13.939089+00 | 28333 | Executor | ExecutorEnd + 479062278978111363 | -7168301816080933795 | 2075274914194306348 | 2023-10-12 07:41:13.939071+00 | 5332 | Result | Result +(9 rows) + +=# select trace_id, parent_id, span_id, span_start, duration, name, resource from pg_tracing_consume_spans; + trace_id | parent_id | span_id | span_start | duration | name | resource +----------+-----------+---------+------------+----------+------+---------- +(0 rows) + + + + + + Authors + + + Anthonin Bonnefoy anthonin.bonnefoy@datadoghq.com. + + + +
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index c383f34c06..f4b6b5f428 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -67,9 +67,11 @@ InstrInit(Instrumentation *instr, int instrument_options) void InstrStartNode(Instrumentation *instr) { - if (instr->need_timer && - !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) - elog(ERROR, "InstrStartNode called twice in a row"); + if (instr->need_timer) { + if (!INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) + elog(ERROR, "InstrStartNode called twice in a row"); + INSTR_TIME_SET_CURRENT_LAZY(instr->firsttime); + } /* save buffer usage totals at node entry, if needed */ if (instr->need_bufusage) diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index d5d69941c5..64098b61ed 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -74,6 +74,7 @@ typedef struct Instrumentation bool async_mode; /* true if node is in async mode */ /* Info about current plan cycle: */ bool running; /* true if we've completed first tuple */ + instr_time firsttime; /* start time of this node */ instr_time starttime; /* start time of current iteration of node */ instr_time counter; /* accumulated runtime for this node */ double firsttuple; /* time for first tuple of this cycle */ -- 2.39.3 (Apple Git-145)