From d904454f4a54a3793dcbd8812a0cbfc821fb979f Mon Sep 17 00:00:00 2001 From: Anthonin Bonnefoy Date: Mon, 15 May 2023 10:00:08 +0200 Subject: [PATCH] Add pg_tracing extension to contrib This extension adds the possibility to create server-side spans for distributed tracing. When a query is sampled, the extension will create spans tracking the process of this query: Parse, Planner, Executor... During ExecutorEnd, the planstate is also converted into spans, effectively generating an explain analyze in a trace format. The spans are exposed through a function call. Sending spans to a trace collector is outside this extension's scope. --- contrib/Makefile | 1 + contrib/meson.build | 1 + contrib/pg_tracing/Makefile | 27 + contrib/pg_tracing/expected/cleanup.out | 4 + contrib/pg_tracing/expected/create_table.out | 1 + contrib/pg_tracing/expected/nested.out | 145 ++ contrib/pg_tracing/expected/parallel.out | 82 + contrib/pg_tracing/expected/reset.out | 13 + contrib/pg_tracing/expected/sample.out | 156 ++ contrib/pg_tracing/expected/select.out | 229 ++ contrib/pg_tracing/expected/subxact.out | 75 + contrib/pg_tracing/expected/utility.out | 214 ++ contrib/pg_tracing/expected/wal.out | 96 + contrib/pg_tracing/meson.build | 52 + contrib/pg_tracing/pg_tracing--1.0.sql | 99 + contrib/pg_tracing/pg_tracing.c | 2313 ++++++++++++++++++ contrib/pg_tracing/pg_tracing.conf | 1 + contrib/pg_tracing/pg_tracing.control | 5 + contrib/pg_tracing/pg_tracing.h | 43 + contrib/pg_tracing/pg_tracing_explain.c | 527 ++++ contrib/pg_tracing/pg_tracing_explain.h | 20 + contrib/pg_tracing/query_process.c | 473 ++++ contrib/pg_tracing/query_process.h | 30 + contrib/pg_tracing/span.c | 295 +++ contrib/pg_tracing/span.h | 159 ++ contrib/pg_tracing/sql/cleanup.sql | 4 + contrib/pg_tracing/sql/nested.sql | 102 + contrib/pg_tracing/sql/parallel.sql | 16 + contrib/pg_tracing/sql/reset.sql | 3 + contrib/pg_tracing/sql/sample.sql | 41 + contrib/pg_tracing/sql/select.sql | 81 + 
contrib/pg_tracing/sql/subxact.sql | 18 + contrib/pg_tracing/sql/utility.sql | 128 + contrib/pg_tracing/sql/wal.sql | 32 + src/backend/executor/instrument.c | 1 + src/include/executor/instrument.h | 1 + 36 files changed, 5488 insertions(+) create mode 100644 contrib/pg_tracing/Makefile create mode 100644 contrib/pg_tracing/expected/cleanup.out create mode 100644 contrib/pg_tracing/expected/create_table.out create mode 100644 contrib/pg_tracing/expected/nested.out create mode 100644 contrib/pg_tracing/expected/parallel.out create mode 100644 contrib/pg_tracing/expected/reset.out create mode 100644 contrib/pg_tracing/expected/sample.out create mode 100644 contrib/pg_tracing/expected/select.out create mode 100644 contrib/pg_tracing/expected/subxact.out create mode 100644 contrib/pg_tracing/expected/utility.out create mode 100644 contrib/pg_tracing/expected/wal.out create mode 100644 contrib/pg_tracing/meson.build create mode 100644 contrib/pg_tracing/pg_tracing--1.0.sql create mode 100644 contrib/pg_tracing/pg_tracing.c create mode 100644 contrib/pg_tracing/pg_tracing.conf create mode 100644 contrib/pg_tracing/pg_tracing.control create mode 100644 contrib/pg_tracing/pg_tracing.h create mode 100644 contrib/pg_tracing/pg_tracing_explain.c create mode 100644 contrib/pg_tracing/pg_tracing_explain.h create mode 100644 contrib/pg_tracing/query_process.c create mode 100644 contrib/pg_tracing/query_process.h create mode 100644 contrib/pg_tracing/span.c create mode 100644 contrib/pg_tracing/span.h create mode 100644 contrib/pg_tracing/sql/cleanup.sql create mode 100644 contrib/pg_tracing/sql/nested.sql create mode 100644 contrib/pg_tracing/sql/parallel.sql create mode 100644 contrib/pg_tracing/sql/reset.sql create mode 100644 contrib/pg_tracing/sql/sample.sql create mode 100644 contrib/pg_tracing/sql/select.sql create mode 100644 contrib/pg_tracing/sql/subxact.sql create mode 100644 contrib/pg_tracing/sql/utility.sql create mode 100644 contrib/pg_tracing/sql/wal.sql diff --git 
a/contrib/Makefile b/contrib/Makefile index da4e2316a3..721a58fcde 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -36,6 +36,7 @@ SUBDIRS = \ pg_prewarm \ pg_stat_statements \ pg_surgery \ + pg_tracing \ pg_trgm \ pgrowlocks \ pgstattuple \ diff --git a/contrib/meson.build b/contrib/meson.build index 84d4e18561..f73caf95f5 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -47,6 +47,7 @@ subdir('pgrowlocks') subdir('pg_stat_statements') subdir('pgstattuple') subdir('pg_surgery') +subdir('pg_tracing') subdir('pg_trgm') subdir('pg_visibility') subdir('pg_walinspect') diff --git a/contrib/pg_tracing/Makefile b/contrib/pg_tracing/Makefile new file mode 100644 index 0000000000..7c169da228 --- /dev/null +++ b/contrib/pg_tracing/Makefile @@ -0,0 +1,27 @@ +# contrib/pg_tracing/Makefile + +MODULE_big = pg_tracing +OBJS = \ + $(WIN32RES) \ + pg_tracing.o \ + query_process.o \ + pg_tracing_explain.o \ + span.o + +EXTENSION = pg_tracing +DATA = pg_tracing--1.0.sql +PGFILEDESC = "pg_tracing - Distributed tracing for postgres" + +REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_tracing/pg_tracing.conf +REGRESS = utility select sample subxact reset nested wal parallel cleanup + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_tracing +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_tracing/expected/cleanup.out b/contrib/pg_tracing/expected/cleanup.out new file mode 100644 index 0000000000..beb88da53f --- /dev/null +++ b/contrib/pg_tracing/expected/cleanup.out @@ -0,0 +1,4 @@ +-- Drop test table and test functions +DROP TABLE pg_tracing_test; +DROP function test_function; +DROP EXTENSION pg_tracing; diff --git a/contrib/pg_tracing/expected/create_table.out b/contrib/pg_tracing/expected/create_table.out new file mode 100644 index 0000000000..87875a113f --- /dev/null +++ b/contrib/pg_tracing/expected/create_table.out @@ -0,0 +1 @@ +CREATE TABLE pg_tracing_test (a int, b char(20)); diff --git a/contrib/pg_tracing/expected/nested.out b/contrib/pg_tracing/expected/nested.out new file mode 100644 index 0000000000..ab0cd0c5c6 --- /dev/null +++ b/contrib/pg_tracing/expected/nested.out @@ -0,0 +1,145 @@ +-- Create test function to sample +CREATE OR REPLACE FUNCTION test_function(a int) RETURNS SETOF oid AS +$BODY$ +BEGIN + RETURN QUERY SELECT oid from pg_class where oid = a; +END; +$BODY$ +LANGUAGE plpgsql; +-- Trace a statement with a function call +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000051-0000000000000051-01'*/ select test_function(1); + test_function +--------------- +(0 rows) + +-- The test function call will generate the following spans (non exhaustive list): +-- +-----------------------------------------------------------------------------------------------+ +-- | A: Select test_function(1); | +-- +-+--------++----------++----------------++---------------------------------------------------+-+ +-- |B: Parse||C: Planner||D: ExecutorStart||E: ExecutorRun | +-- +--------++----------++----------------+++-------------------------------------------------++ +-- |F: ProjectSet | +-- ++---------+------+------------------------------++ +-- |G: Result| | H: Select a from b where... 
| +-- +---------+ +----+--------------+----------+ +-- |I: ExecutorRun| +-- +--------------+ +-- Gather span_id, span start and span end of function call statement +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=81 and name!='Parse' \gset +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_a_id and name='Executor' and resource='Run' \gset +SELECT span_id AS span_f_id, + get_span_start_ns(span_start, span_start_ns) as span_f_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_f_end + from pg_tracing_peek_spans where parent_id=:span_e_id and name='ProjectSet' \gset +SELECT span_id AS span_g_id, + get_span_start_ns(span_start, span_start_ns) as span_g_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_g_end + from pg_tracing_peek_spans where parent_id=:span_f_id and name='Result' \gset +SELECT span_id AS span_h_id, + get_span_start_ns(span_start, span_start_ns) as span_h_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_h_end + from pg_tracing_peek_spans where parent_id=:span_f_id and name='Select' \gset +SELECT span_id AS span_i_id, + get_span_start_ns(span_start, span_start_ns) as span_i_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_i_end + from pg_tracing_peek_spans where parent_id=:span_h_id and resource='Run' \gset +-- Check that spans' start and end are within expection +SELECT :span_a_start < :span_e_start AS top_query_before_run, + :span_a_end >= :span_e_end AS top_end_after_run_end, + :span_e_start <= :span_f_start AS top_run_start_before_project, + :span_e_end >= :span_f_end AS top_run_end_after_project_end, + :span_e_end >= :span_h_end AS 
top_run_end_before_select_end, + :span_e_end >= :span_i_end AS top_run_end_after_nested_run_end, + :span_i_end <= :span_h_end AS run_end_after_select_end; + top_query_before_run | top_end_after_run_end | top_run_start_before_project | top_run_end_after_project_end | top_run_end_before_select_end | top_run_end_after_nested_run_end | run_end_after_select_end +----------------------+-----------------------+------------------------------+-------------------------------+-------------------------------+----------------------------------+-------------------------- + t | t | t | t | t | t | t +(1 row) + +-- Check that the root span is the longest one +WITH max_duration AS (select max(duration) from pg_tracing_peek_spans) +SELECT duration = max_duration.max from pg_tracing_peek_spans, max_duration + where span_id = :span_a_id; + ?column? +---------- + t +(1 row) + +-- Check that ExecutorRun is attached to the nested top span +SELECT resource from pg_tracing_peek_spans where parent_id=:span_i_id order by resource; + resource +--------------------------------------------------------------------------------- + IndexOnlyScan using pg_class_oid_index on pg_class|Index Cond: (oid = '1'::oid) +(1 row) + +-- Check tracking with top tracking +set pg_tracing.track = 'top'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000052-0000000000000052-01'*/ select test_function(1); + test_function +--------------- +(0 rows) + +SELECT count(*) from pg_tracing_consume_spans where trace_id=82; + count +------- + 10 +(1 row) + +-- Check tracking with no tracking +set pg_tracing.track = 'none'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000053-0000000000000053-01'*/ select test_function(1); + test_function +--------------- +(0 rows) + +SELECT count(*) from pg_tracing_consume_spans where trace_id=83; + count +------- + 0 +(1 row) + +-- Reset tracking setting +set pg_tracing.track = 'all'; +-- Create test procedure +CREATE OR REPLACE PROCEDURE sum_one(i int) 
AS $$ +DECLARE + r int; +BEGIN + SELECT (i + i)::int INTO r; +END; $$ LANGUAGE plpgsql; +-- Test tracking of procedure with utility tracking enabled +set pg_tracing.track_utility=on; +/*traceparent='00-00000000000000000000000000000054-0000000000000054-01'*/ CALL sum_one(3); +SELECT resource from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + resource +--------------------- + CALL sum_one(3); + Parse + Post Parse + CALL sum_one(3); + Parse + SELECT (i + i)::int + Post Parse + Planner + Planner + Start + Run + Result + Finish + End +(14 rows) + +-- Test again with utility tracking disabled +set pg_tracing.track_utility=off; +/*traceparent='00-00000000000000000000000000000055-0000000000000055-01'*/ CALL sum_one(10); +SELECT resource from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + resource +---------- +(0 rows) + diff --git a/contrib/pg_tracing/expected/parallel.out b/contrib/pg_tracing/expected/parallel.out new file mode 100644 index 0000000000..9906a3e861 --- /dev/null +++ b/contrib/pg_tracing/expected/parallel.out @@ -0,0 +1,82 @@ +begin; +-- encourage use of parallel plans +set parallel_setup_cost=0; +set parallel_tuple_cost=0; +set min_parallel_table_scan_size=0; +set max_parallel_workers_per_gather=2; +-- Trace parallel queries +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ select 1 from pg_class limit 1; + ?column? +---------- + 1 +(1 row) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ select 2 from pg_class limit 2; + ?column? +---------- + 2 + 2 +(2 rows) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-00'*/ select 3 from pg_class limit 3; + ?column? 
+---------- + 3 + 3 + 3 +(3 rows) + +commit; +-- Check order and number of spans generated by parallel queries +-- We can't order by span start due to the parallelisation introducing randomness +SELECT name, resource from pg_tracing_consume_spans order by resource; + name | resource +------------+----------------------------------- + Executor | End + Executor | End + Executor | End + Executor | End + Executor | End + Executor | End + Executor | Finish + Executor | Finish + Executor | Finish + Executor | Finish + Executor | Finish + Executor | Finish + Gather | Gather + Gather | Gather + Limit | Limit + Limit | Limit + SeqScan | Parallel SeqScan on pg_class + SeqScan | Parallel SeqScan on pg_class + SeqScan | Parallel SeqScan on pg_class + SeqScan | Parallel SeqScan on pg_class + SeqScan | Parallel SeqScan on pg_class + SeqScan | Parallel SeqScan on pg_class + Parse | Parse + Parse | Parse + Planner | Planner + Planner | Planner + Post Parse | Post Parse + Post Parse | Post Parse + Executor | Run + Executor | Run + Executor | Run + Executor | Run + Executor | Run + Executor | Run + Executor | Start + Executor | Start + Executor | Start + Executor | Start + Executor | Start + Executor | Start + Select | Worker 0 + Select | Worker 0 + Select | Worker 1 + Select | Worker 1 + Select | select $1 from pg_class limit $2; + Select | select $1 from pg_class limit $2; +(46 rows) + diff --git a/contrib/pg_tracing/expected/reset.out b/contrib/pg_tracing/expected/reset.out new file mode 100644 index 0000000000..2e0c903121 --- /dev/null +++ b/contrib/pg_tracing/expected/reset.out @@ -0,0 +1,13 @@ +-- Check reset is working +SELECT pg_tracing_reset(); + pg_tracing_reset +------------------ + +(1 row) + +SELECT traces from pg_tracing_info; + traces +-------- + 0 +(1 row) + diff --git a/contrib/pg_tracing/expected/sample.out b/contrib/pg_tracing/expected/sample.out new file mode 100644 index 0000000000..c1161e0537 --- /dev/null +++ b/contrib/pg_tracing/expected/sample.out @@ -0,0 
+1,156 @@ +-- Trace nothing +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 0.0; +-- Query with sampling flag +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000021-0000000000000021-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +-- Query without trace context +SELECT 1; + ?column? +---------- + 1 +(1 row) + +-- No spans should have been generated +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 0 +(1 row) + +-- Enable full sampling +SET pg_tracing.sample_rate = 1.0; +-- Generate queries with sampling flag on, off and no trace context +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000022-0000000000000022-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000023-0000000000000023-00'*/ SELECT 2; + ?column? +---------- + 2 +(1 row) + +SELECT 3; + ?column? +---------- + 3 +(1 row) + +SELECT 4; + ?column? +---------- + 4 +(1 row) + +-- Check number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 4 +(1 row) + +-- Check span order +select resource, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; + resource | parameters +------------+------------ + SELECT $1; | $1 = 1 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | + SELECT $1; | $1 = 2 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | + SELECT $1; | $1 = 3 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | + Parse | + SELECT $1; | $1 = 4 + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | +(36 rows) + +-- Top spans should reuse generated ids and have trace_id = parent_id = span_id +select resource, parameters from pg_tracing_consume_spans where trace_id = parent_id and parent_id = span_id order by span_start, span_start_ns; + resource | parameters 
+------------+------------ + SELECT $1; | $1 = 3 + SELECT $1; | $1 = 4 +(2 rows) + +-- Only trace query with sampled flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; +-- Generate queries with sampling flag on, off and no trace context +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000024-0000000000000024-01'*/ SELECT 1; + ?column? +---------- + 1 +(1 row) + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000025-0000000000000025-00'*/ SELECT 2; + ?column? +---------- + 2 +(1 row) + +SELECT 1; + ?column? +---------- + 1 +(1 row) + +-- Check number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 2 +(1 row) + +-- Check span ordering +select resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns; + resource | parameters +-----------------------------------+------------ + SET pg_tracing.sample_rate = 0.0; | + Parse | + Post Parse | + SET pg_tracing.sample_rate = 0.0; | + SELECT $1; | $1 = 1 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | +(13 rows) + diff --git a/contrib/pg_tracing/expected/select.out b/contrib/pg_tracing/expected/select.out new file mode 100644 index 0000000000..9f52cf8d2a --- /dev/null +++ b/contrib/pg_tracing/expected/select.out @@ -0,0 +1,229 @@ +-- Only trace queries with sample flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; +-- Run a simple query +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ SELECT 1; + ?column? 
+---------- + 1 +(1 row) + +-- Get top span id +SELECT span_id AS top_span_id from pg_tracing_peek_spans where parent_id=1 and name!='Parse' \gset +-- Check parameters +SELECT parameters from pg_tracing_peek_spans where span_id=:top_span_id; + parameters +------------ + $1 = 1 +(1 row) + +-- Check the number of children +SELECT count(*) from pg_tracing_peek_spans where parent_id=:'top_span_id'; + count +------- + 7 +(1 row) + +-- Check resource and query id +SELECT name, resource, query_id from pg_tracing_peek_spans where trace_id=1 order by span_start, span_start_ns, resource; + name | resource | query_id +------------+------------+--------------------- + Parse | Parse | 6865378226349601843 + Select | SELECT $1; | 6865378226349601843 + Post Parse | Post Parse | 6865378226349601843 + Planner | Planner | 6865378226349601843 + Executor | Start | 6865378226349601843 + Executor | Run | 6865378226349601843 + Result | Result | 6865378226349601843 + Executor | Finish | 6865378226349601843 + Executor | End | 6865378226349601843 +(9 rows) + +-- Check reported number of trace +SELECT traces from pg_tracing_info; + traces +-------- + 15 +(1 row) + +-- Trace a statement with function call +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ SELECT count(*) from current_database(); + count +------- + 1 +(1 row) + +-- Check the generated span name, resource and order of function call +SELECT name, resource from pg_tracing_consume_spans where trace_id=3 order by resource; + name | resource +--------------+------------------------------------------ + Aggregate | Aggregate + Executor | End + Executor | Finish + FunctionScan | FunctionScan on current_database + Parse | Parse + Planner | Planner + Post Parse | Post Parse + Executor | Run + Select | SELECT count(*) from current_database(); + Executor | Start +(10 rows) + +-- Trace a more complex query with multiple function calls 
+/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000004-0000000000000004-01'*/ SELECT s.relation_size + s.index_size +FROM (SELECT + pg_relation_size(C.oid) as relation_size, + pg_indexes_size(C.oid) as index_size + FROM pg_class C) as s limit 1; + ?column? +---------- + 0 +(1 row) + +-- Check the generated span name, resource and order of query with multiple function calls +SELECT name, resource from pg_tracing_consume_spans where trace_id=4 order by resource; + name | resource +--------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------ + Executor | End + Executor | Finish + Limit | Limit + Parse | Parse + Planner | Planner + Post Parse | Post Parse + Executor | Run + Select | SELECT s.relation_size + s.index_sizeFROM (SELECT pg_relation_size(C.oid) as relation_size, pg_indexes_size(C.oid) as index_size FROM pg_class C) as s limit $1; + SeqScan | SeqScan on pg_class c + Executor | Start + SubqueryScan | SubqueryScan on s +(11 rows) + +-- Check that we're in a correct state after a timeout +set statement_timeout=200; +-- Trace query triggering a statement timeout +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000007-0000000000000007-01'*/ select * from pg_sleep(10); +ERROR: canceling statement due to statement timeout +-- Trace a working query after the timeout to check we're in a consistent state +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000008-0000000000000008-01'*/ select 1; + ?column? 
+---------- + 1 +(1 row) + +-- Check the spans order and error code +SELECT name, resource, sql_error_code from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + name | resource | sql_error_code +--------------+-----------------------------+---------------- + Parse | Parse | 00000 + Select | select * from pg_sleep($1); | 57014 + Post Parse | Post Parse | 00000 + Planner | Planner | 00000 + Executor | Start | 00000 + Executor | Run | 57014 + FunctionScan | FunctionScan on pg_sleep | 57014 + Parse | Parse | 00000 + Select | select $1; | 00000 + Post Parse | Post Parse | 00000 + Planner | Planner | 00000 + Executor | Start | 00000 + Executor | Run | 00000 + Result | Result | 00000 + Executor | Finish | 00000 + Executor | End | 00000 +(16 rows) + +-- Cleanup statement setting +set statement_timeout=0; +-- Create a prepare statement with comment passed as first parameter +PREPARE test_prepared (text, integer) AS /*$1*/ SELECT $2; +-- Execute prepare statement with trace context passed as a parameter +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000009-0000000000000009-01''', 1); + ?column? 
+---------- + 1 +(1 row) + +-- Check generated spans and order +SELECT trace_id, name, resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + trace_id | name | resource | parameters +----------+----------+-----------------------------------------------------+---------------------------------------------------------------------------------------------------------------- + 9 | Select | PREPARE test_prepared (text, integer) AS SELECT $2; | $1 = 'dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000009-0000000000000009-01''', $2 = '1' + 9 | Planner | Planner | + 9 | Executor | Start | + 9 | Executor | Run | + 9 | Result | Result | + 9 | Executor | Finish | + 9 | Executor | End | +(7 rows) + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +-- Execute prepare statement with trace context passed as a parameter and generic plan +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000010-0000000000000010-01''', 10); + ?column? +---------- + 10 +(1 row) + +-- Check spans are generated even through generic plan +SELECT trace_id, resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + trace_id | resource | parameters +----------+-----------------------------------------------------+------------ + 16 | PREPARE test_prepared (text, integer) AS SELECT $2; | + 16 | Start | + 16 | Run | + 16 | Result | + 16 | Finish | + 16 | End | +(6 rows) + +-- Run a statement with node not executed +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000a-000000000000000a-01'*/ select 1 limit 0; + ?column? 
+---------- +(0 rows) + +-- Not executed node should not generate any spans +SELECT trace_id, resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + trace_id | resource | parameters +----------+---------------------+---------------- + 10 | Parse | + 10 | select $1 limit $2; | $1 = 1, $2 = 0 + 10 | Post Parse | + 10 | Planner | + 10 | Start | + 10 | Run | + 10 | Limit | + 10 | Finish | + 10 | End | +(9 rows) + +-- Test multiple statements in a single query +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000011-0000000000000012-01'*/ select 1; select 2; + ?column? +---------- + 1 +(1 row) + + ?column? +---------- + 2 +(1 row) + +-- Not executed node should not generate any spans +select resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns; + resource | parameters +------------+------------ + select $1; | $1 = 1 + Parse | + Post Parse | + Planner | + Start | + Run | + Result | + Finish | + End | +(9 rows) + +-- Cleanup +SET plan_cache_mode='auto'; +DEALLOCATE test_prepared; diff --git a/contrib/pg_tracing/expected/subxact.out b/contrib/pg_tracing/expected/subxact.out new file mode 100644 index 0000000000..698778e155 --- /dev/null +++ b/contrib/pg_tracing/expected/subxact.out @@ -0,0 +1,75 @@ +-- Create test table +CREATE TABLE IF NOT EXISTS pg_tracing_test (a int, b char(20)); +-- Enable full sampling +SET pg_tracing.sample_rate = 1.0; +-- Start a transaction with subxaction +BEGIN; +SAVEPOINT s1; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s2; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s3; +SELECT 1; + ?column? 
+---------- + 1 +(1 row) + +COMMIT; +-- Check that subxact_count is correctly reported +select resource, parameters, subxact_count from pg_tracing_consume_spans order by span_start, span_start_ns; + resource | parameters | subxact_count +------------------------------------------------------------------+----------------------------+--------------- + BEGIN; | | 0 + Parse | | 0 + Post Parse | | 0 + BEGIN; | | 0 + SAVEPOINT $1; | $1 = s1 | 0 + Parse | | 0 + Post Parse | | 0 + SAVEPOINT s1; | | 0 + INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | $1 = 1, $2 = 2, $3 = 'aaa' | 0 + Parse | | 0 + Post Parse | | 0 + Planner | | 0 + Start | | 0 + Run | | 0 + Insert on pg_tracing_test | | 1 + ProjectSet | | 1 + Result | | 1 + Finish | | 1 + End | | 1 + SAVEPOINT $1; | $1 = s2 | 1 + Parse | | 1 + Post Parse | | 1 + SAVEPOINT s2; | | 1 + INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | $1 = 1, $2 = 2, $3 = 'aaa' | 1 + Parse | | 1 + Post Parse | | 1 + Planner | | 1 + Start | | 1 + Run | | 1 + Insert on pg_tracing_test | | 2 + ProjectSet | | 2 + Result | | 2 + Finish | | 2 + End | | 2 + SAVEPOINT $1; | $1 = s3 | 2 + Parse | | 2 + Post Parse | | 2 + SAVEPOINT s3; | | 2 + SELECT $1; | $1 = 1 | 2 + Parse | | 2 + Post Parse | | 2 + Planner | | 2 + Start | | 2 + Run | | 2 + Result | | 2 + Finish | | 2 + End | | 2 + COMMIT; | | 2 + Parse | | 2 + Post Parse | | 2 + COMMIT; | | 2 +(51 rows) + diff --git a/contrib/pg_tracing/expected/utility.out b/contrib/pg_tracing/expected/utility.out new file mode 100644 index 0000000000..3503c16865 --- /dev/null +++ b/contrib/pg_tracing/expected/utility.out @@ -0,0 +1,214 @@ +-- Some utility functions +CREATE OR REPLACE FUNCTION get_span_start_ns(time_start timestamptz, start_ns smallint) RETURNS bigint AS +$BODY$ +BEGIN + RETURN extract(epoch from time_start) * 1000000000 + start_ns; +END; +$BODY$ +LANGUAGE plpgsql; +CREATE OR REPLACE FUNCTION get_span_end_ns(time_start timestamptz, start_ns smallint, duration 
bigint) RETURNS bigint AS +$BODY$ +BEGIN + RETURN get_span_start_ns(time_start, start_ns) + duration; +END; +$BODY$ +LANGUAGE plpgsql; +-- Create pg_tracing extension with sampling on +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; +-- This will create the following spans (non exhaustive list): +-- +-- +-------------------------------------------------------------------------------------+ +-- | A: Utility: Create Extension | +-- +-+-----------------------------------------------------------------------------------+ +-- +----------------------------------------------------------------------------------+ +-- |B: ProcessUtility: Create Extension | +-- +---+-----------------------------------+---+--------------------------------------+ +-- +-----------------------------------+ +-------------------------------------+ +-- |C: Utility: Create Function1 | |E: Utility: Create Function2 | +-- ++----------------------------------+ ++-----------------------------------++ +-- +----------------------------------+ +-----------------------------------+ +-- |D: ProcessUtility: Create Function| |F: ProcessUtility: Create Function2| +-- +----------------------------------+ +-----------------------------------+ +-- Extract span_ids, start and end of those spans +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=1 and name='Utility' \gset +SELECT span_id AS span_b_id, + get_span_start_ns(span_start, span_start_ns) as span_b_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_b_end + from pg_tracing_peek_spans where parent_id=:span_a_id and name='ProcessUtility' \gset +SELECT span_id AS span_c_id, + get_span_start_ns(span_start, span_start_ns) as span_c_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_c_end + from 
pg_tracing_peek_spans where parent_id=:span_b_id and name='Utility' limit 1 \gset +SELECT span_id AS span_d_id, + get_span_start_ns(span_start, span_start_ns) as span_d_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_d_end + from pg_tracing_peek_spans where parent_id=:span_c_id and name='ProcessUtility' \gset +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_b_id and name='Utility' limit 1 offset 1 \gset +-- Check that the start and end of those spans are within expectation +SELECT :span_a_start < :span_b_start AS span_a_start_first, + :span_a_end >= :span_b_end AS span_a_end_last, + :span_d_end <= :span_c_end AS nested_span_ends_before_parent, + :span_c_end <= :span_e_start AS next_utility_start_after; + span_a_start_first | span_a_end_last | nested_span_ends_before_parent | next_utility_start_after +--------------------+-----------------+--------------------------------+-------------------------- + t | t | t | t +(1 row) + +-- Clean current spans +select count(*) from pg_tracing_consume_spans; + count +------- + 49 +(1 row) + +-- +-- Test that no utility is captured with track_utility off +-- +-- Set utility off +SET pg_tracing.track_utility = off; +-- Prepare and execute a prepared statement +PREPARE test_prepared_one_param (integer) AS SELECT $1; +EXECUTE test_prepared_one_param(100); + ?column? +---------- + 100 +(1 row) + +-- Nothing should be generated +select count(*) from pg_tracing_consume_spans; + count +------- + 0 +(1 row) + +-- Force a query to start from ExecutorRun +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + ?column? 
+---------- + 200 +(1 row) + +-- Again, nothing should be generated +select count(*) from pg_tracing_consume_spans; + count +------- + 0 +(1 row) + +-- +-- Test that no utility is captured with track_utility off +-- +-- Enable utility tracking and track everything +SET pg_tracing.track_utility = on; +SET pg_tracing.sample_rate = 1.0; +-- Prepare and execute a prepared statement +PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; +EXECUTE test_prepared_one_param_2(100); + ?column? +---------- + 100 +(1 row) + +-- Check the number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 2 +(1 row) + +-- Check that the spans are in correct order +select resource, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; + resource | parameters +-----------------------------------------------------------+------------ + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + Parse | + Post Parse | + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + Parse | + Post Parse | + EXECUTE test_prepared_one_param_2(100); | + Parse | + Post Parse | + EXECUTE test_prepared_one_param_2(100); | + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + Planner | + Start | + Run | + Result | + Finish | + End | +(18 rows) + +-- Check the top span (standalone top span has trace_id=parent_id=span_id) +select resource, parameters from pg_tracing_consume_spans where trace_id = parent_id and parent_id = span_id order by span_start, span_start_ns; + resource | parameters +-----------------------------------------------------------+------------ + PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; | + EXECUTE test_prepared_one_param_2(100); | +(2 rows) + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + ?column? 
+---------- + 200 +(1 row) + +-- Check the number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; + count +------- + 2 +(1 row) + +-- Check that the spans are in correct order +select resource, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; + resource | parameters +---------------------------------------------------------+------------ + SET plan_cache_mode='force_generic_plan'; | + Parse | + Post Parse | + SET plan_cache_mode='force_generic_plan'; | + EXECUTE test_prepared_one_param(200); | + Parse | + Post Parse | + EXECUTE test_prepared_one_param(200); | + PREPARE test_prepared_one_param (integer) AS SELECT $1; | + Start | + Run | + Result | + Finish | + End | +(14 rows) + +-- Check the top span (standalone top span has trace_id=parent_id=span_id) +select resource, parameters from pg_tracing_consume_spans where trace_id = parent_id and parent_id = span_id order by span_start, span_start_ns; + resource | parameters +-------------------------------------------+------------ + SET plan_cache_mode='force_generic_plan'; | + EXECUTE test_prepared_one_param(200); | +(2 rows) + +-- Second create extension should generate an error that is captured by span +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; +ERROR: extension "pg_tracing" already exists +select resource, parameters, sql_error_code from pg_tracing_consume_spans order by span_start, span_start_ns; + resource | parameters | sql_error_code +------------------------------+------------+---------------- + CREATE EXTENSION pg_tracing; | | 42710 + Parse | | 00000 + Post Parse | | 00000 + CREATE EXTENSION pg_tracing; | | 42710 +(4 rows) + +-- Cleanup +SET plan_cache_mode='auto'; diff --git a/contrib/pg_tracing/expected/wal.out b/contrib/pg_tracing/expected/wal.out new file mode 100644 index 0000000000..9a22043ea3 --- /dev/null +++ b/contrib/pg_tracing/expected/wal.out @@ -0,0 +1,96 @@ +-- 
Create test table +CREATE TABLE IF NOT EXISTS pg_tracing_test (a int, b char(20)); +NOTICE: relation "pg_tracing_test" already exists, skipping +-- Enable wal instrumentation +set pg_tracing.instrument_wal = true; +-- Generate queries with wal write +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; +-- Check WAL is generated for the above statements +SELECT trace_id, name, resource, + wal_records > 0 as wal_records, + wal_bytes > 0 as wal_bytes +FROM pg_tracing_consume_spans order by span_start, span_start_ns, resource; + trace_id | name | resource | wal_records | wal_bytes +----------+------------+------------------------------------------------------------------+-------------+----------- + 1 | Insert | INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | t | t + 1 | Parse | Parse | | + 1 | Post Parse | Post Parse | | + 1 | Planner | Planner | f | f + 1 | Executor | Start | | + 1 | Executor | Run | | + 1 | Insert | Insert on pg_tracing_test | t | t + 1 | ProjectSet | ProjectSet | f | f + 1 | Result | Result | f | f + 1 | Executor | Finish | | + 1 | Executor | End | | + 2 | Parse | Parse | | + 2 | Update | UPDATE pg_tracing_test SET b = $1 WHERE a > $2; | t | t + 2 | Post Parse | Post Parse | | + 2 | Planner | Planner | f | f + 2 | Executor | Start | | + 2 | Executor | Run | | + 2 | Update | Update on pg_tracing_test | t | t + 2 | SeqScan | SeqScan on pg_tracing_test|Filter : (a > 7) | f | f + 2 | Executor | Finish | | + 2 | Executor | End | | + 3 | Delete | DELETE FROM pg_tracing_test WHERE a > $1; | t | t + 3 | Parse | Parse | | + 3 | Post Parse | Post Parse | | + 
3 | Planner | Planner | f | f + 3 | Executor | Start | | + 3 | Executor | Run | | + 3 | Delete | Delete on pg_tracing_test | t | t + 3 | SeqScan | SeqScan on pg_tracing_test|Filter : (a > 9) | f | f + 3 | Executor | Finish | | + 3 | Executor | End | | +(31 rows) + +-- Redo the same but without wal instrumentation +set pg_tracing.instrument_wal = false; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; +-- With wal instrumentation disabled, wal counters should be set to 0 +SELECT trace_id, name, resource, + wal_records = 0 as wal_records, + wal_bytes = 0 as wal_bytes +FROM pg_tracing_consume_spans order by span_start, span_start_ns, resource; + trace_id | name | resource | wal_records | wal_bytes +----------+------------+------------------------------------------------------------------+-------------+----------- + 1 | Insert | INSERT INTO pg_tracing_test VALUES(generate_series($1, $2), $3); | t | t + 1 | Parse | Parse | | + 1 | Post Parse | Post Parse | | + 1 | Planner | Planner | t | t + 1 | Executor | Start | | + 1 | Executor | Run | | + 1 | Insert | Insert on pg_tracing_test | t | t + 1 | ProjectSet | ProjectSet | t | t + 1 | Result | Result | t | t + 1 | Executor | Finish | | + 1 | Executor | End | | + 2 | Parse | Parse | | + 2 | Update | UPDATE pg_tracing_test SET b = $1 WHERE a > $2; | t | t + 2 | Post Parse | Post Parse | | + 2 | Planner | Planner | t | t + 2 | Executor | Start | | + 2 | Executor | Run | | + 2 | Update | Update on pg_tracing_test | t | t + 2 | SeqScan | SeqScan on pg_tracing_test|Filter : (a > 7) | t | t + 2 | Executor | Finish | | + 2 | Executor | End | | 
+ 3 | Delete | DELETE FROM pg_tracing_test WHERE a > $1; | t | t + 3 | Parse | Parse | | + 3 | Post Parse | Post Parse | | + 3 | Planner | Planner | t | t + 3 | Executor | Start | | + 3 | Executor | Run | | + 3 | Delete | Delete on pg_tracing_test | t | t + 3 | SeqScan | SeqScan on pg_tracing_test|Filter : (a > 9) | t | t + 3 | Executor | Finish | | + 3 | Executor | End | | +(31 rows) + +-- Cleanup +set pg_tracing.instrument_wal = true; diff --git a/contrib/pg_tracing/meson.build b/contrib/pg_tracing/meson.build new file mode 100644 index 0000000000..1d1ae13a43 --- /dev/null +++ b/contrib/pg_tracing/meson.build @@ -0,0 +1,52 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +pg_tracing_sources = files( + 'pg_tracing.c', + 'query_process.c', + 'pg_tracing_explain.c', + 'span.c', +) + +if host_system == 'windows' + pg_tracing_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_tracing', + '--FILEDESC', 'pg_tracing - Add support for distributed tracing',]) +endif + +pg_tracing = shared_module('pg_tracing', + pg_tracing_sources, + kwargs: contrib_mod_args + { + 'dependencies': contrib_mod_args['dependencies'], + }, +) +contrib_targets += pg_tracing + +install_data( + 'pg_tracing.control', + 'pg_tracing--1.0.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'pg_tracing', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'utility', + 'select', + 'sample', + 'subxact', + 'reset', + 'nested', + 'wal', + 'parallel', + 'cleanup' + ], + 'regress_args': ['--temp-config', files('pg_tracing.conf')], + # Disabled because these tests require + # "shared_preload_libraries=pg_tracing", which typical + # runningcheck users do not have (e.g. buildfarm clients). 
+ 'runningcheck': false, + }, +} diff --git a/contrib/pg_tracing/pg_tracing--1.0.sql b/contrib/pg_tracing/pg_tracing--1.0.sql new file mode 100644 index 0000000000..544dcb71fe --- /dev/null +++ b/contrib/pg_tracing/pg_tracing--1.0.sql @@ -0,0 +1,99 @@ +/* contrib/pg_tracing/pg_tracing--1.0.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "CREATE EXTENSION pg_tracing" to load this file. \quit + +--- Define pg_tracing_info +CREATE FUNCTION pg_tracing_info( + OUT traces bigint, + OUT spans bigint, + OUT dropped_spans bigint, + OUT failed_truncates bigint, + OUT last_consume timestamp with time zone, + OUT stats_reset timestamp with time zone, + OUT sampling_rate double precision +) +RETURNS record +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT VOLATILE PARALLEL SAFE; + +CREATE FUNCTION pg_tracing_reset() +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C PARALLEL SAFE; + +CREATE FUNCTION pg_tracing_spans( + IN consume bool, + OUT trace_id bigint, + OUT parent_id bigint, + OUT span_id bigint, + OUT query_id bigint, + OUT name text, + OUT resource text, + OUT span_start timestamp with time zone, + OUT span_start_ns smallint, + OUT duration bigint, + OUT sql_error_code character(5), + OUT pid int4, + OUT nested_level smallint, + OUT subxact_count smallint, + OUT top_span bool, + +-- Plan counters + OUT startup_cost float8, + OUT total_cost float8, + OUT plan_rows float8, + OUT plan_width int, + +-- Node Counters + OUT rows int8, + OUT nloops int8, + + OUT shared_blks_hit int8, + OUT shared_blks_read int8, + OUT shared_blks_dirtied int8, + OUT shared_blks_written int8, + + OUT local_blks_hit int8, + OUT local_blks_read int8, + OUT local_blks_dirtied int8, + OUT local_blks_written int8, + + OUT blk_read_time float8, + OUT blk_write_time float8, + + OUT temp_blks_read int8, + OUT temp_blks_written int8, + OUT temp_blk_read_time float8, + OUT temp_blk_write_time float8, + + OUT wal_records int8, + OUT wal_fpi int8, + OUT wal_bytes 
numeric, + + OUT jit_functions int8, + OUT jit_generation_time float8, + OUT jit_inlining_time float8, + OUT jit_optimization_time float8, + OUT jit_emission_time float8, + +-- SpanNode specific data + OUT startup bigint, -- First tuple + OUT parameters text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C PARALLEL SAFE; + +CREATE VIEW pg_tracing_info AS + SELECT * FROM pg_tracing_info(); + +CREATE VIEW pg_tracing_peek_spans AS + SELECT * FROM pg_tracing_spans(false); + +CREATE VIEW pg_tracing_consume_spans AS + SELECT * FROM pg_tracing_spans(true); + +GRANT SELECT ON pg_tracing_info TO PUBLIC; +GRANT SELECT ON pg_tracing_peek_spans TO PUBLIC; +GRANT SELECT ON pg_tracing_consume_spans TO PUBLIC; diff --git a/contrib/pg_tracing/pg_tracing.c b/contrib/pg_tracing/pg_tracing.c new file mode 100644 index 0000000000..35e62abcbd --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.c @@ -0,0 +1,2313 @@ +/*------------------------------------------------------------------------- + * + * pg_tracing.c + * Generate spans for distributed tracing from SQL query + * + * Spans will only be generated for sampled queries. A query is sampled if: + * - It has a tracecontext propagated throught SQLCommenter and it passes the caller_sample_rate. + * - It has no SQLCommenter but the query randomly passes the global sample_rate + * + * A query with sqlcommenter will look like: /\*dddbs='postgres.db',traceparent='00-00000000000000000000000000000009-0000000000000005-01'*\/ select 1; + * The traceparent fields are detailed in https://www.w3.org/TR/trace-context/#traceparent-header-field-values + * 00000000000000000000000000000009: trace id + * 0000000000000005: parent id + * 01: trace flags (01 == sampled) + * + * If sampled, we will generate spans for the ongoing query. + * A span represents an operation with a start time, a duration and useful metadatas (block stats, wal stats...). + * We will track the following operations: + * - Top Span: The top span for a query. 
They are created after extracting the traceid from traceparent or to represent a nested query. + * - Planner: We track the time spent in the planner and report the planner counters + * - Node Span: Created from planstate. The name is extracted from the node type (IndexScan, SeqScan). + * - Executor: We trace the different steps of the Executor: Start, Run, Finish and End + * + * A typical traced query will generate the following spans: + * +------------------------------------------------------------------------------------------------------------------------------------------------------+ + * | Name: Select (top span) | + * | Operation: Select * pgbench_accounts WHERE aid=$1; | + * +---+------------------------+-+------------------+--+------------------------------------------------------+-+--------------------+-+----------------++ + * | Name: Planner | | Name: Executor | |Name: Executor | | Name: Executor | | Name: Executor | + * | Operation: Planner | | Operation: Start | |Operation: Run | | Operation: Finish | | Operation: End | + * +------------------------+ +------------------+ +--+--------------------------------------------------++ +--------------------+ +----------------+ + * | Name: IndexScan | + * | Operation: IndexScan using pgbench_accounts_pkey | + * | on pgbench_accounts | + * +--------------------------------------------------+ + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pg_tracing_explain.h" +#include "pg_tracing.h" +#include "query_process.h" +#include "span.h" + +#include "utils/ruleutils.h" +#include "access/xact.h" +#include "common/pg_prng.h" +#include "funcapi.h" +#include "nodes/extensible.h" +#include "optimizer/planner.h" +#include "parser/analyze.h" +#include "storage/ipc.h" +#include "tcop/utility.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/xid8.h" + +PG_MODULE_MAGIC; + 
+typedef enum +{ + PG_TRACING_TRACK_NONE, /* track no statements */ + PG_TRACING_TRACK_TOP, /* only top level statements */ + PG_TRACING_TRACK_ALL /* all statements, including nested ones */ +} pgTracingTrackLevel; + +/* + * Structure to store flexible array of spans + */ +typedef struct pgTracingSpans +{ + int end; /* Index of last element */ + int max; /* Maximum number of elements */ + Span spans[FLEXIBLE_ARRAY_MEMBER]; +} pgTracingSpans; + +/* + * A trace context for a specific parallel context + */ +typedef struct pgTracingParallelContext +{ + BackendId leader_backend_id; /* Backend id of the leader, set to + * InvalidBackendId if unused */ + uint64 trace_id; /* Trace id of the sampled query */ + uint64 parent_id; /* Id of the parent Gather span */ + uint64 query_id; /* Query id from the parent worker */ + pgTracingStartTime start_trace; + +} pgTracingParallelContext; + +/* + * Store context for parallel workers + */ +typedef struct pgTracingParallelWorkers +{ + slock_t mutex; + pgTracingParallelContext trace_contexts[FLEXIBLE_ARRAY_MEMBER]; +} pgTracingParallelWorkers; + +/* GUC variables */ +static int pg_tracing_max_span; /* Maximum number of spans to store */ +static int pg_tracing_max_parameter_str; /* Maximum size of span + * parameters */ +static int pg_tracing_max_traced_parallel_workers; /* Maximum number of + * parallel workers traced */ +static int pg_tracing_initial_allocated_spans = 25; /* Number of spans + * allocated at the + * start of a trace at + * the same time. 
*/ +static bool pg_tracing_instrument_buffers; /* Enable buffers instrumentation */ +static bool pg_tracing_instrument_wal; /* Enable WAL instrumentation */ +static double pg_tracing_sample_rate = 0; /* Sample rate applied to queries + * without SQLCommenter */ +static double pg_tracing_caller_sample_rate = 1; /* Sample rate applied to + * queries with + * SQLCommenter */ +static int pg_tracing_track = PG_TRACING_TRACK_ALL; /* tracking level */ +static bool pg_tracing_track_utility = true; /* whether to track utility + * commands */ +static bool pg_tracing_drop_full_buffer = true; /* whether to drop buffered + * spans when full */ + + +int64 nested_query_start_ns = 0; +int64 executor_start_start_ns = 0; + +static const struct config_enum_entry track_options[] = +{ + {"none", PG_TRACING_TRACK_NONE, false}, + {"top", PG_TRACING_TRACK_TOP, false}, + {"all", PG_TRACING_TRACK_ALL, false}, + {NULL, 0, false} +}; + +#define pg_tracing_enabled(level) \ + ((pg_tracing_track == PG_TRACING_TRACK_ALL || \ + (pg_tracing_track == PG_TRACING_TRACK_TOP && (level) == 0))) + +PG_FUNCTION_INFO_V1(pg_tracing_info); +PG_FUNCTION_INFO_V1(pg_tracing_spans); +PG_FUNCTION_INFO_V1(pg_tracing_reset); + +/* + * Global variables + */ + +/* Memory context for pg_tracing. */ +static MemoryContext pg_tracing_mem_ctx; + +/* Current trace state */ +static struct pgTracingTrace current_trace; + +/* Shared state with stats and file external state */ +static pgTracingSharedState * pg_tracing = NULL; + +/* + * Shared buffer storing spans. Query with sampled flag will add new spans to + * the shared state. Those spans will be consumed during calls to + * pg_tracing_spans + */ +static pgTracingSpans * shared_spans = NULL; + +/* + * Store all spans for the curent trace except top spans and parse spans. + * They will be added to the shared buffer at the end of query tracing. + * + * This buffer can be dynamically resized so care needs to be taken when using span pointers. 
+ */ +static pgTracingSpans * current_trace_spans; + +/* + * Shared buffer storing trace context for parallel workers. + */ +static pgTracingParallelWorkers * pg_tracing_parallel = NULL; + +/* Index of the parallel worker context shared buffer if any */ +static int parallel_context_index = -1; + +/* +* Text for spans are buffered in this +* stringinfo and written at the end of the tracing in a single write. +*/ +static StringInfo current_trace_text; + +/* + * Maximum nested level for a query to know how many top spans we need to + * copy in shared_spans + */ +static int max_nested_level = -1; + +/* Current nesting depth of planner calls */ +static int plan_nested_level = 0; + +/* Current nesting depth of ExecutorRun+ProcessUtility calls */ +static int exec_nested_level = 0; + +/* Number of allocated levels. */ +static int allocated_nested_level = 0; + +/* + * Per nested level buffers + */ +typedef struct pgTracingPerLevelBuffer +{ + int top_span_index; /* Index of the top span in the + * current_trace_spans buffer */ + uint64 executor_id; /* Span id of executor run spans by nested + * level Executor run is used as parent for + * spans generated from planstate */ + uint64_t query_id; /* Query id by nested level when available */ +} pgTracingPerLevelBuffer; + +static pgTracingPerLevelBuffer * per_level_buffers; + +/* + * Timestamp of the latest statement checked for sampling. 
+ */ +static TimestampTz last_statement_check_for_sampling = 0; + +static void pg_tracing_shmem_request(void); +static void pg_tracing_shmem_startup(void); + +/* Saved hook values in case of unload */ +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL; +static planner_hook_type prev_planner_hook = NULL; +static ExecutorStart_hook_type prev_ExecutorStart = NULL; +static ExecutorRun_hook_type prev_ExecutorRun = NULL; +static ExecutorFinish_hook_type prev_ExecutorFinish = NULL; +static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; +static ProcessUtility_hook_type prev_ProcessUtility = NULL; + +static void pg_tracing_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate); +static PlannedStmt *pg_tracing_planner_hook(Query *parse, + const char *query_string, + int cursorOptions, + ParamListInfo params); +static void pg_tracing_ExecutorStart(QueryDesc *queryDesc, int eflags); +static void pg_tracing_ExecutorRun(QueryDesc *queryDesc, + ScanDirection direction, + uint64 count, bool execute_once); +static void pg_tracing_ExecutorFinish(QueryDesc *queryDesc); +static void pg_tracing_ExecutorEnd(QueryDesc *queryDesc); +static void pg_tracing_ProcessUtility(PlannedStmt *pstmt, const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, QueryCompletion *qc); + +static void generate_member_nodes(PlanState **planstates, int nplans, planstateTraceContext * planstateTraceContext, uint64 parent_id); +static void generate_span_from_planstate(PlanState *planstate, planstateTraceContext * planstateTraceContext, uint64 parent_id); +static pgTracingStats get_empty_pg_tracing_stats(void); + +#define PG_TRACING_INFO_COLS 7 +#define PG_TRACING_TRACES_COLS 44 + +/* + * Module load callback + */ +void +_PG_init(void) +{ + if 
(!process_shared_preload_libraries_in_progress) + return; + + DefineCustomIntVariable("pg_tracing.max_span", + "Maximum number of spans stored in shared memory.", + NULL, + &pg_tracing_max_span, + 1000, + 0, + 500000, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_tracing.max_parameter_size", + "Maximum size of parameters. -1 to disable parameter in top span.", + NULL, + &pg_tracing_max_parameter_str, + 1024, + 0, + 10000, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.instrument_buffers", + "Instrument and add buffers usage in spans.", + NULL, + &pg_tracing_instrument_buffers, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.instrument_wal", + "Instrument and add WAL usage in spans.", + NULL, + &pg_tracing_instrument_wal, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_tracing.max_traced_parallel_workers", + "Maximum number of parallel workers traced. -1 to disable tracing of parallel workers.", + NULL, + &pg_tracing_max_traced_parallel_workers, + 100, + 0, + 10000, + PGC_POSTMASTER, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_tracing.initial_allocated_spans", + "The number of allocated spans at the start of a trace. 
A higher number can help avoid reallocation when extending the buffer.", + NULL, + &pg_tracing_initial_allocated_spans, + 25, + 0, + 500, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomEnumVariable("pg_tracing.track", + "Selects which statements are tracked by pg_tracing.", + NULL, + &pg_tracing_track, + PG_TRACING_TRACK_ALL, + track_options, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.track_utility", + "Selects whether utility commands are tracked by pg_tracing.", + NULL, + &pg_tracing_track_utility, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_tracing.drop_on_full_buffer", + "Selects whether buffered spans should be dropped when buffer is full.", + NULL, + &pg_tracing_drop_full_buffer, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomRealVariable("pg_tracing.sample_rate", + "Fraction of queries to process.", + NULL, + &pg_tracing_sample_rate, + 0.0, + 0.0, + 1.0, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomRealVariable("pg_tracing.caller_sample_rate", + "Fraction of queries with sampled flag to process.", + NULL, + &pg_tracing_caller_sample_rate, + 1.0, + 0.0, + 1.0, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + MarkGUCPrefixReserved("pg_tracing"); + + /* For jumble state */ + EnableQueryId(); + + /* Install hooks. 
*/ + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = pg_tracing_shmem_request; + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = pg_tracing_shmem_startup; + + prev_post_parse_analyze_hook = post_parse_analyze_hook; + post_parse_analyze_hook = pg_tracing_post_parse_analyze; + + prev_planner_hook = planner_hook; + planner_hook = pg_tracing_planner_hook; + + prev_ExecutorStart = ExecutorStart_hook; + ExecutorStart_hook = pg_tracing_ExecutorStart; + prev_ExecutorRun = ExecutorRun_hook; + ExecutorRun_hook = pg_tracing_ExecutorRun; + prev_ExecutorFinish = ExecutorFinish_hook; + ExecutorFinish_hook = pg_tracing_ExecutorFinish; + prev_ExecutorEnd = ExecutorEnd_hook; + ExecutorEnd_hook = pg_tracing_ExecutorEnd; + + prev_ProcessUtility = ProcessUtility_hook; + ProcessUtility_hook = pg_tracing_ProcessUtility; +} + +/* + * Estimate shared memory space needed. + */ +static Size +pg_tracing_memsize(void) +{ + Size size; + + /* pg_tracing shared state */ + size = sizeof(pgTracingSharedState); + /* span struct */ + size = add_size(size, sizeof(pgTracingSpans)); + /* the span variable array */ + size = add_size(size, mul_size(pg_tracing_max_span, sizeof(Span))); + /* and the parallel workers context */ + size = add_size(size, mul_size(pg_tracing_max_traced_parallel_workers, sizeof(pgTracingParallelContext))); + + return size; +} + +/* + * shmem_request hook: request additional shared resources. We'll allocate + * or attach to the shared resources in pgss_shmem_startup(). 
+ */ +static void +pg_tracing_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + RequestAddinShmemSpace(pg_tracing_memsize()); +} + +/* + * Push trace context to the shared parallel worker buffer + */ +static void +add_parallel_context(uint64 parent_id, uint64 query_id) +{ + volatile pgTracingParallelWorkers *p = (volatile pgTracingParallelWorkers *) pg_tracing_parallel; + + if (pg_tracing_max_traced_parallel_workers <= 0) + /* No tracing of parallel workers */ + return; + + Assert(parallel_context_index == -1); + SpinLockAcquire(&p->mutex); + for (int i = 0; i < pg_tracing_max_traced_parallel_workers; i++) + { + volatile pgTracingParallelContext *ctx = p->trace_contexts + i; + + Assert(ctx->leader_backend_id != MyBackendId); + if (ctx->leader_backend_id != InvalidBackendId) + continue; + /* Slot is available */ + parallel_context_index = i; + ctx->leader_backend_id = MyBackendId; + ctx->trace_id = current_trace.trace_id; + ctx->parent_id = parent_id; + ctx->query_id = query_id; + ctx->start_trace = current_trace.start_trace; + break; + } + SpinLockRelease(&p->mutex); +} + +/* + * Remove parallel context for the current leader from the shared memory. + */ +static void +remove_parallel_context(void) +{ + if (parallel_context_index < 0) + /* No tracing of parallel workers */ + return; + { + volatile pgTracingParallelWorkers *p = (volatile pgTracingParallelWorkers *) pg_tracing_parallel; + + SpinLockAcquire(&p->mutex); + p->trace_contexts[parallel_context_index].leader_backend_id = InvalidBackendId; + SpinLockRelease(&p->mutex); + } + parallel_context_index = -1; +} + +/* + * If we're inside a parallel worker, check if the trace context is stored in shared memory. + * If a trace context exists, it means that the query is sampled and worker tracing is enabled. 
+ */ +static void +fetch_parallel_context(pgTracingTrace * trace) +{ + volatile pgTracingParallelWorkers *p = (volatile pgTracingParallelWorkers *) pg_tracing_parallel; + + if (pg_tracing_max_traced_parallel_workers <= 0) + { + /* No tracing of parallel workers */ + return; + } + SpinLockAcquire(&p->mutex); + for (int i = 0; i < pg_tracing_max_traced_parallel_workers; i++) + { + if (p->trace_contexts[i].leader_backend_id != ParallelLeaderBackendId) + continue; + /* Found a matching trace context, fetch it */ + trace->trace_id = p->trace_contexts[i].trace_id; + trace->parent_id = p->trace_contexts[i].parent_id; + trace->start_trace = p->trace_contexts[i].start_trace; + trace->query_id = p->trace_contexts[i].query_id; + trace->sampled = true; + } + SpinLockRelease(&p->mutex); +} + +/* + * Return a slot from the current_trace_spans buffer. + * current_trace_spans may be extended if needed. + */ +static int +get_index_from_trace_spans(void) +{ + if (current_trace_spans->end + 1 >= current_trace_spans->max) + { + MemoryContext oldcxt; + + /* Need to extend. */ + /* TODO: Have a configurable limit? */ + current_trace_spans->max *= 2; + oldcxt = MemoryContextSwitchTo(pg_tracing_mem_ctx); + current_trace_spans = repalloc(current_trace_spans, current_trace_spans->max * sizeof(Span)); + MemoryContextSwitchTo(oldcxt); + } + return current_trace_spans->end++; +} + +/* + * Get current_span from provided index + */ +static Span * +get_span_from_index(int span_index) +{ + Assert(span_index < current_trace_spans->end); + return &current_trace_spans->spans[span_index]; +} + +/* + * Get top span for the provided nested level. + * If top_span doesn't exist, create it. 
+ */ +static Span * +get_top_span_for_nested_level(int nested_level) +{ + Span *top_span; + + Assert(nested_level <= max_nested_level); + if (per_level_buffers[nested_level].top_span_index == -1) + per_level_buffers[nested_level].top_span_index = get_index_from_trace_spans(); + top_span = get_span_from_index(per_level_buffers[nested_level].top_span_index); + return top_span; +} + +/* + * Set top span to the provided span index + */ +static void +set_top_span(int nested_level, int top_span_index) +{ + Assert(nested_level <= max_nested_level); + Assert(top_span_index < current_trace_spans->max); + per_level_buffers[nested_level].top_span_index = top_span_index; +} + +/* + * End the span at the provided index + */ +static void +end_span_index(int span_index, const int64 *end_span_input) +{ + Span *span; + + if (span_index < 0) + return; + span = get_span_from_index(span_index); + end_span(span, end_span_input); +} + +/* + * End top span for the provided nested level + */ +static void +end_top_span(int nested_level, int64 end_span_ns) +{ + Assert(nested_level <= max_nested_level); + Assert(per_level_buffers[nested_level].top_span_index < current_trace_spans->max); + Assert(per_level_buffers[nested_level].top_span_index != -1); + end_span_index(per_level_buffers[nested_level].top_span_index, &end_span_ns); +} + +/* + * shmem_startup hook: allocate or attach to shared memory, Also create and + * load the query-texts file, which is expected to exist (even if empty) + * while the module is enabled. 
+ */ +static void +pg_tracing_shmem_startup(void) +{ + bool found_pg_tracing; + bool found_shared_spans; + bool found_parallel; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + /* reset in case this is a restart within the postmaster */ + pg_tracing = NULL; + + /* Create or attach to the shared memory state */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + current_trace.sampled = 0; + pg_tracing = ShmemInitStruct("PgTracing Shared", sizeof(pgTracingSharedState), &found_pg_tracing); + shared_spans = ShmemInitStruct("PgTracing Spans", + sizeof(pgTracingSpans) + pg_tracing_max_span * sizeof(Span), + &found_shared_spans); + pg_tracing_parallel = ShmemInitStruct("PgTracing Parallel Workers Context", + sizeof(pgTracingParallelWorkers) + pg_tracing_max_traced_parallel_workers * sizeof(pgTracingParallelContext), + &found_parallel); + + /* Initialize pg_tracing memory context */ + pg_tracing_mem_ctx = AllocSetContextCreate(TopMemoryContext, + "pg_tracing memory context", + ALLOCSET_DEFAULT_SIZES); + + /* First time, let's init shared state */ + if (!found_pg_tracing) + { + pg_tracing->stats = get_empty_pg_tracing_stats(); + SpinLockInit(&pg_tracing->mutex); + SpinLockInit(&pg_tracing_parallel->mutex); + } + if (!found_shared_spans) + { + shared_spans->end = 0; + shared_spans->max = pg_tracing_max_span; + } + if (!found_parallel) + { + for (int i = 0; i < pg_tracing_max_traced_parallel_workers; i++) + pg_tracing_parallel->trace_contexts[i].leader_backend_id = InvalidBackendId; + } + LWLockRelease(AddinShmemInitLock); +} + +/* + * Add span to the shared memory. Mutex must be acquired beforehand. 
+ */ +static void +add_span_to_shared_buffer_locked(const Span * span) +{ + /* Spans must be ended before adding them to the shared buffer */ + Assert(span->ended); + if (shared_spans->end + 1 >= shared_spans->max) + pg_tracing->stats.dropped_spans++; + else + { + pg_tracing->stats.spans++; + shared_spans->spans[shared_spans->end++] = *span; + } +} + +/* + * Check if we still have available space in the shared spans. + * + * Between the moment we check and the moment we want to insert, the buffer + * may be full but we redo a check before appending the span. This is done + * early when starting a top span to bail out early if the buffer is + * already full since we don't immediately add the span in the shared buffer. + */ +static bool +check_full_shared_spans() +{ + if (shared_spans->end + 1 >= shared_spans->max) + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + bool full_buffer = false; + + SpinLockAcquire(&s->mutex); + if (pg_tracing_drop_full_buffer) + { + s->stats.dropped_spans += shared_spans->end; + shared_spans->end = 0; + s->extent = 0; + } + else + { + full_buffer = true; + s->stats.dropped_spans++; + } + SpinLockRelease(&s->mutex); + return full_buffer; + } + return false; +} + +/* + * Process a query descriptor: Gather all query instrumentation in the query + * span counters and generate span nodes from queryDesc planstate + */ +static void +process_query_desc(QueryDesc *queryDesc, int sql_error_code) +{ + NodeCounters *node_counters = &get_top_span_for_nested_level(exec_nested_level)->node_counters; + + /* Process total counters */ + if (queryDesc->totaltime) + { + InstrEndLoop(queryDesc->totaltime); + node_counters->buffer_usage = queryDesc->totaltime->bufusage; + node_counters->wal_usage = queryDesc->totaltime->walusage; + } + /* Process JIT counter */ + if (queryDesc->estate->es_jit) + node_counters->jit_usage = queryDesc->estate->es_jit->instr; + node_counters->rows = queryDesc->estate->es_total_processed; + + 
/* Process planstate */ + if (queryDesc->planstate) + { + Bitmapset *rels_used = NULL; + planstateTraceContext planstateTraceContext; + uint64 parent_id = per_level_buffers[exec_nested_level].executor_id; + + /* Prepare the planstate context for deparsing */ + planstateTraceContext.rtable_names = select_rtable_names_for_explain(queryDesc->plannedstmt->rtable, rels_used); + planstateTraceContext.trace = ¤t_trace; + planstateTraceContext.ancestors = NULL; + planstateTraceContext.sql_error_code = sql_error_code; + planstateTraceContext.deparse_ctx = deparse_context_for_plan_tree(queryDesc->plannedstmt, planstateTraceContext.rtable_names); + + generate_span_from_planstate(queryDesc->planstate, &planstateTraceContext, parent_id); + } +} + +/* + * If trace was started from the global sample rate without + * a parent trace, we need to generate a random trace id + */ +static void +set_traceparent_trace_id(struct pgTracingTrace *traceparent_parameter) +{ + if (traceparent_parameter->trace_id != 0) + return; + + /* + * We leave parent_id to 0 as a way to indicate that this is a standalone + * trace. The top span created with a parent_id == 0 will have trace_id == + * parent_id == span_id. 
+ */ + Assert(traceparent_parameter->parent_id == 0); + traceparent_parameter->trace_id = pg_prng_uint64(&pg_global_prng_state); +} + +/* + * Decide whether a query should be sampled depending on the traceparent sampled + * and the provided sample rate configurations + */ +static bool +apply_sample_rate(struct pgTracingTrace const *traceparent_parameter) +{ + double rand; + bool sampled; + + /* Everything is sampled */ + if (pg_tracing_sample_rate >= 1.0) + return true; + + /* No SQLCommenter sampled and no global sample rate */ + if (traceparent_parameter->sampled == 0 && pg_tracing_sample_rate == 0.0) + return false; + + /* SQLCommenter sampled caller sample */ + if (traceparent_parameter->sampled == 1 && pg_tracing_caller_sample_rate >= 1.0) + return true; + + /* + * We have a non zero global sample rate or a sampled flag. Either way, we + * need a rand value + */ + rand = pg_prng_double(&pg_global_prng_state); + if (traceparent_parameter->sampled == 1) + /* Sampled flag case */ + sampled = (rand < pg_tracing_caller_sample_rate); + else + /* Global rate case */ + sampled = (rand < pg_tracing_sample_rate); + return sampled; +} + +/* + * If the query was started as a prepared statement, we won't be able to + * extract traceparent during query parsing since parsing was skipped. + * + * We assume that SQLCommenter content can be passed as a text in the + * first parameter. 
+ */ +static void +check_traceparameter_in_parameter(ParamListInfo params) +{ + Oid typoutput; + bool typisvarlena; + char *pstring; + ParamExternData param; + + if (params == NULL || params->numParams == 0) + return; + + param = params->params[0]; + /* Check first parameter type */ + if (param.ptype != TEXTOID) + return; + + /* Get the text of the first string parameter */ + getTypeOutputInfo(param.ptype, &typoutput, &typisvarlena); + pstring = OidOutputFunctionCall(typoutput, param.value); + extract_traceparent(¤t_trace, pstring, true); +} + +/* + * Check whether the current query should be sampled or not + */ +static void +check_query_sampled(ParseState *pstate, const ParamListInfo params) +{ + TimestampTz statement_start_ts; + + /* Safety check... */ + if (!pg_tracing || !pg_tracing_enabled(exec_nested_level)) + return; + + /* sampling already started */ + if (current_trace.sampled > 0) + return; + + /* Don't start tracing if we're not at the top */ + if (exec_nested_level > 0) + return; + + /* + * In a parallel worker, check the parallel context shared buffer to see + * if the leader left a trace context + */ + if (IsParallelWorker()) + { + fetch_parallel_context(¤t_trace); + return; + } + + Assert(max_nested_level = -1); + + current_trace.trace_id = 0; + current_trace.parent_id = 0; + if (params != NULL) + check_traceparameter_in_parameter(params); + else if (pstate != NULL) + extract_traceparent(¤t_trace, pstate->p_sourcetext, false); + + /* + * A statement can go through this function 3 times: post parsing, planner + * and executor run. If no sampling flag was extracted from SQLCommenter + * and we've already seen this statement (using statement_start ts), don't + * try to apply sample rate and exit. 
+ */ + statement_start_ts = GetCurrentStatementStartTimestamp(); + if (current_trace.sampled == 0 && last_statement_check_for_sampling == statement_start_ts) + return; + /* First time we see this statement, save the time */ + last_statement_check_for_sampling = statement_start_ts; + + current_trace.sampled = apply_sample_rate(¤t_trace); + + if (current_trace.sampled > 0 && check_full_shared_spans()) + /* Buffer is full, abort sampling */ + current_trace.sampled = 0; + + if (current_trace.sampled > 0) + set_traceparent_trace_id(¤t_trace); +} + +/* + * Adjust span's offsets with the provided file offset + */ +static void +adjust_file_offset(Span * span, Size file_position) +{ + if (span->name_offset != -1) + span->name_offset += file_position; + if (span->operation_name_offset != -1) + span->operation_name_offset += file_position; + if (span->parameter_offset != -1) + span->parameter_offset += file_position; +} + +/* + * Get the start time for a new top span + */ +static int64 +get_top_span_start(int64 *start_time_ns) +{ + if (exec_nested_level == 0) + { + /* For the top span of parallel worker, we get the current ns */ + if (IsParallelWorker()) + { + if (start_time_ns != NULL) + return *start_time_ns; + return get_current_ns(); + } + /* For the root top span, use the trace start */ + Assert(current_trace.start_trace.ns > 0); + return current_trace.start_trace.ns; + } + + /* + * We're in a nested query, if nested_query_start is not set, we haven't + * gone through ProcessUtility. Set the start at the current time in this + * case. + */ + if (nested_query_start_ns == 0) + { + if (start_time_ns != NULL) + return *start_time_ns; + return get_current_ns(); + } + + /* + * nested_query_start is available, use it + */ + return nested_query_start_ns; +} + +/* + * Reset pg_tracing memory context and global state. 
+ */ +static void +cleanup_tracing(void) +{ + if (current_trace.sampled == 0) + /* No need for cleaning */ + return; + remove_parallel_context(); + MemoryContextReset(pg_tracing_mem_ctx); + current_trace.sampled = 0; + current_trace.trace_id = 0; + current_trace.parent_id = 0; + max_nested_level = -1; + current_trace_spans = NULL; + per_level_buffers = NULL; +} + +/* + * End the query tracing and dump all spans in the shared buffer. + * This may happen either when query is finished or on a caught error. + */ +static void +end_tracing(const int64 *end_span_ns) +{ + Size file_position = 0; + + /* We're still a nested query, tracing is not finished */ + if (exec_nested_level > 0) + return; + + /* Dump all buffered texts in file */ + text_store_file(pg_tracing, current_trace_text->data, current_trace_text->len, &file_position); + + for (int i = 0; i < current_trace_spans->end; i++) + adjust_file_offset(current_trace_spans->spans + i, file_position); + + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + /* We're at the end, add all spans to the shared memory */ + for (int i = 0; i < current_trace_spans->end; i++) + add_span_to_shared_buffer_locked(¤t_trace_spans->spans[i]); + SpinLockRelease(&s->mutex); + } + + /* We can reset the memory context here */ + cleanup_tracing(); +} + +/* + * When we catch an error (timeout, cancel query), we need to flag the ongoing + * span with an error, send current spans in the shared buffer and clean + * our memory context + */ +static void +handle_pg_error(int span_index, QueryDesc *queryDesc) +{ + Span *span; + Span *top_span; + int64 end_span_ns; + int sql_error_code; + + /* If we're not sampling the query, bail out */ + if (current_trace.sampled == 0 && !pg_tracing_enabled(exec_nested_level)) + return; + sql_error_code = geterrcode(); + + /* + * Order matters there. We want to process query desc before getting the + * end time. 
Otherwise, the span nodes will have a higher duration than + * their parents. + */ + if (queryDesc != NULL) + process_query_desc(queryDesc, sql_error_code); + + /* Assign the error code to the latest span */ + span = get_span_from_index(span_index); + span->sql_error_code = sql_error_code; + + /* Assign the error code to the latest top span */ + top_span = get_top_span_for_nested_level(exec_nested_level); + top_span->sql_error_code = sql_error_code; + + end_span_ns = get_current_ns(); + /* End current span, ongoing top spans and tracing */ + for (int i = 0; i <= exec_nested_level; i++) + end_top_span(i, end_span_ns); + end_span(span, &end_span_ns); + end_tracing(&end_span_ns); +} + +/* + * Get the parent id for the given nested level + * + * nested_level can be negative when creating the root top span. + * In this case, we use the parent id from the propagated trace context. + */ +static uint64 +get_parent_id(int nested_level) +{ + Span *top_span; + + if (nested_level < 0) + return current_trace.parent_id; + Assert(nested_level <= allocated_nested_level); + top_span = get_top_span_for_nested_level(nested_level); + Assert(top_span->span_id > 0); + return top_span->span_id; +} + +/* + * If we enter a new nested level, initialize all necessary buffers + * and start_trace timer. + * + * The start of a top span can vary: prepared statement will skip parsing, the use of cached + * plans will skip the planner hook. + * Thus, a top span can start in either post parse, planner hook or executor run. + * + * Since this is called after the we've detected the start of a trace, we check for available + * space in the buffer. If the buffer is full, we abort tracing by setting sampled to false. + * Callers need to check that tracing was aborted. 
+ */ +static bool +initialize_trace_level(void) +{ + /* Check if we've already created a top span for this nested level */ + if (exec_nested_level <= max_nested_level) + return false; + + /* First time */ + if (max_nested_level == -1) + { + MemoryContext oldcxt; + + Assert(pg_tracing_mem_ctx->isReset); + + /* + * We need to be able to pass information that depends on the nested + * level. + * + * executor span ids: Since an executor run becomes the parent span, + * we need subsequent created node spans to have the correct parent. + * + * query ids: used to propagate queryId to all spans in the same level + * + * top spans: We create one top span per nested level and those are + * only inserted in the shared buffer at the end + */ + oldcxt = MemoryContextSwitchTo(pg_tracing_mem_ctx); + /* initial allocation */ + allocated_nested_level = 1; + per_level_buffers = palloc0(allocated_nested_level * sizeof(pgTracingPerLevelBuffer)); + /* 0 is a valid index, initialise all indexes to -1 */ + for (int i = 0; i < allocated_nested_level; i++) + per_level_buffers[i].top_span_index = -1; + current_trace_spans = palloc0(pg_tracing_initial_allocated_spans * sizeof(Span)); + current_trace_spans->max = pg_tracing_initial_allocated_spans; + current_trace_text = makeStringInfo(); + + MemoryContextSwitchTo(oldcxt); + + /* + * Parallel workers will get the referential timers from the leader + * parallel context + */ + if (!IsParallelWorker()) + { + /* Start referential timers */ + current_trace.start_trace.ts = GetCurrentStatementStartTimestamp(); + + /* + * We can't get the monotonic clock at the start of the statement + * so let's estimate it here. 
+ */ + current_trace.start_trace.ns = get_current_ns() - (GetCurrentTimestamp() - current_trace.start_trace.ts) * NS_PER_US; + } + } + else if (exec_nested_level >= allocated_nested_level) + { + /* New nested level, allocate more memory */ + MemoryContext oldcxt; + int old_allocated_nested_level = allocated_nested_level; + + oldcxt = MemoryContextSwitchTo(pg_tracing_mem_ctx); + allocated_nested_level++; + per_level_buffers = repalloc0(per_level_buffers, old_allocated_nested_level * sizeof(pgTracingPerLevelBuffer), allocated_nested_level * sizeof(pgTracingPerLevelBuffer)); + for (int i = old_allocated_nested_level; i < allocated_nested_level; i++) + per_level_buffers[i].top_span_index = -1; + MemoryContextSwitchTo(oldcxt); + } + + max_nested_level = exec_nested_level; + if (exec_nested_level == 0) + { + /* Start of a new trace */ + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->stats.traces++; + SpinLockRelease(&s->mutex); + } + return true; +} + +/* + * Add a new string to the current_trace_text stringinfo + */ +static int +add_str_to_trace_buffer(const char *str, int str_len) +{ + int position = current_trace_text->cursor; + + appendBinaryStringInfo(current_trace_text, str, str_len); + appendStringInfoChar(current_trace_text, '\0'); + current_trace_text->cursor = current_trace_text->len; + return position; +} + +/* + * Add the worker name to the current_trace_text stringinfo + */ +static int +add_worker_name_to_trace_buffer() +{ + int position = current_trace_text->cursor; + + appendStringInfo(current_trace_text, "Worker %d", ParallelWorkerNumber); + appendStringInfoChar(current_trace_text, '\0'); + current_trace_text->cursor = current_trace_text->len; + return position; +} + +/* + * Start a new top span if we've entered a new nested level + */ +static void +begin_top_span(Span * top_span, CmdType commandType, + const Query *query, const JumbleState *jstate, + const char *query_text, int64 
start_time_ns) +{ + int query_len; + const char *normalised_query; + + /* in case of a cached plan, query might be unavailable */ + if (query != NULL) + per_level_buffers[exec_nested_level].query_id = query->queryId; + else if (current_trace.query_id > 0) + per_level_buffers[exec_nested_level].query_id = current_trace.query_id; + + begin_span(top_span, command_type_to_span_type(commandType), + ¤t_trace, get_parent_id(exec_nested_level - 1), per_level_buffers[exec_nested_level].query_id, &start_time_ns, exec_nested_level); + top_span->is_top_span = true; + + if (IsParallelWorker()) + { + /* + * In a parallel worker, we use the worker name as the span's + * operation + */ + top_span->operation_name_offset = add_worker_name_to_trace_buffer(); + return; + } + + if (jstate && jstate->clocations_count > 0 && query != NULL) + { + /* jstate is available, normalise query and extract parameters' values */ + char *paramStr; + + query_len = query->stmt_len; + normalised_query = normalise_query_parameters(jstate, query_text, + query->stmt_location, + &query_len, ¶mStr); + top_span->parameter_offset = add_str_to_trace_buffer(paramStr, strlen(paramStr)); + } + else + { + /* + * No jstate available, normalise query but we won't be able to + * extract parameters + */ + int stmt_location; + + if (query != NULL && query->stmt_len > 0) + { + query_len = query->stmt_len; + stmt_location = query->stmt_location; + } + else + { + query_len = strlen(query_text); + stmt_location = 0; + } + normalised_query = normalise_query(query_text, stmt_location, &query_len); + } + top_span->operation_name_offset = add_str_to_trace_buffer(normalised_query, query_len); +} + +/* + * Initialise buffer if we are in a new nested level and start top span. + * If the top span already exists for the current nested level, this has no effect. 
+ */ +static int64 +initialize_trace_level_and_top_span(CmdType commandType, Query *query, JumbleState *jstate, const char *query_text, int64 *start_time_ns) +{ + bool new_nested_level = initialize_trace_level(); + int64 top_span_start_ns = get_top_span_start(start_time_ns); + Span *current_top = get_top_span_for_nested_level(exec_nested_level); + + /* current_trace_spans buffer should have been allocated */ + Assert(current_trace_spans != NULL); + if (new_nested_level) + { + begin_top_span(current_top, commandType, query, jstate, query_text, top_span_start_ns); + } + else if (current_top->ended == true) + { + /* + * It's possible to have multiple top spans within a nested query. If + * the previous top span was closed, start a new one + */ + int span_index = get_index_from_trace_spans(); + Span *span = get_span_from_index(span_index); + + set_top_span(exec_nested_level, span_index); + begin_top_span(span, commandType, query, jstate, query_text, top_span_start_ns); + } + return top_span_start_ns; +} + +/* + * Post-parse-analyze hook + * + * Tracing can be started here if: + * - The query has a SQLCommenter with traceparent parameter with sampled flag on and passes the caller_sample_rate + * - The query passes the sample_rate + * + * If query is sampled, create a post parse span and start the top span. + */ +static void +pg_tracing_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate) +{ + Span *post_parse_span; + int post_parse_index = -1; + int64 start_post_parse_ns; + uint64 parent_id; + int64 top_span_start_ns; + + if (prev_post_parse_analyze_hook) + prev_post_parse_analyze_hook(pstate, query, jstate); + + /* + * In case there's an multi statement transaction using extended protocol, + * we will have the PREPARE of the next statement before we go through the + * ExecutorEnd of the previous statement. + * + * For now, we skip parse span and will only start the next statement + * tracing during planner hook. 
+ * + * TODO: Find a way to keep parse span for the next statement + */ + if (!pg_tracing_mem_ctx->isReset && exec_nested_level == 0) + return; + + /* If disabled, don't trace utility statement */ + if (query->utilityStmt && !pg_tracing_track_utility) + return; + + /* Evaluate if query is sampled or not */ + check_query_sampled(pstate, NULL); + + if (current_trace.sampled == 0) + /* Query is not sampled, nothing to do */ + return; + + /* + * We want to avoid calling get_ns at the start of post parse as it will + * impact all queries and we will only use it when the query is sampled. + */ + start_post_parse_ns = get_current_ns(); + + /* + * Either we're inside a nested sampled query or we've parsed a query with + * the sampled flag, start a new level with a top span + */ + top_span_start_ns = initialize_trace_level_and_top_span(query->commandType, query, jstate, pstate->p_sourcetext, &start_post_parse_ns); + + parent_id = get_parent_id(exec_nested_level); + Assert(parent_id > 0); + + /* + * We only create parse span when we have a good idea of when the parse + * start. For nested queries created from the queryDesc + * (nested_query_start_ns == 0), we can't get a reliable time for the + * parse start. We don't create Parse span in this case for now. 
+ */ + if (exec_nested_level == 0 || nested_query_start_ns > 0) + { + Span *parse_span = NULL; + + /* We don't have a precise time for parse end, estimate it */ + int64 end_parse_ns = start_post_parse_ns - 1000; + int parse_index = get_index_from_trace_spans(); + + parse_span = get_span_from_index(parse_index); + begin_span(parse_span, SPAN_PARSE, + ¤t_trace, parent_id, per_level_buffers[exec_nested_level].query_id, + &top_span_start_ns, exec_nested_level); + end_span(parse_span, &end_parse_ns); + } + + /* Post parse span */ + post_parse_index = get_index_from_trace_spans(); + post_parse_span = get_span_from_index(post_parse_index); + begin_span(post_parse_span, SPAN_POST_PARSE, + ¤t_trace, + parent_id, per_level_buffers[exec_nested_level].query_id, + &start_post_parse_ns, exec_nested_level); + end_span(post_parse_span, NULL); +} + +/* + * Planner hook + * Tracing can start here if the query skipped parsing (prepared query) and passes the random sample_rate. + * If query is sampled, create a plan span and start the top span if not already started. + */ +static PlannedStmt * +pg_tracing_planner_hook(Query *query, + const char *query_string, + int cursorOptions, + ParamListInfo params) +{ + PlannedStmt *result; + int span_planner_index = -1; + Span *span_planner; + + /* Evaluate if query is sampled or not */ + check_query_sampled(NULL, params); + + if (current_trace.sampled == 0 || !pg_tracing_enabled(plan_nested_level + exec_nested_level)) + { + /* No sampling */ + if (prev_planner_hook) + result = prev_planner_hook(query, query_string, cursorOptions, + params); + else + result = standard_planner(query, query_string, cursorOptions, + params); + return result; + } + + /* + * We may have skipped parsing if statement was prepared, create a new top + * span in this case. 
+ */ + if (current_trace.sampled == 1 && pg_tracing_enabled(plan_nested_level + exec_nested_level)) + initialize_trace_level_and_top_span(query->commandType, query, NULL, query_string, NULL); + + /* Start planner span */ + span_planner_index = get_index_from_trace_spans(); + span_planner = get_span_from_index(span_planner_index); + begin_span(span_planner, SPAN_PLANNER, ¤t_trace, + get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, NULL, exec_nested_level); + + plan_nested_level++; + PG_TRY(); + { + if (prev_planner_hook) + result = prev_planner_hook(query, query_string, cursorOptions, + params); + else + result = standard_planner(query, query_string, cursorOptions, + params); + } + PG_CATCH(); + { + plan_nested_level--; + handle_pg_error(span_planner_index, NULL); + PG_RE_THROW(); + } + PG_END_TRY(); + plan_nested_level--; + + /* End planner span */ + end_span_index(span_planner_index, NULL); + + /* If we have a prepared statement, add bound parameters to the top span */ + if (params != NULL) + { + char *paramStr = BuildParamLogString(params, + NULL, pg_tracing_max_parameter_str); + Span *current_top = get_top_span_for_nested_level(exec_nested_level); + + Assert(current_top->parameter_offset == -1); + if (paramStr != NULL) + current_top->parameter_offset = add_str_to_trace_buffer(paramStr, strlen(paramStr)); + } + return result; +} + +/* + * ExecutorStart hook: Activate query instrumentation if query is sampled + * Tracing can be started here if the query used a cached plan and passes the random sample_rate + * If query is sampled, start the top span if not already started. 
+ */ +static void +pg_tracing_ExecutorStart(QueryDesc *queryDesc, int eflags) +{ + int executor_start_span_index = -1; + Span *executor_span; + int instrument_options = INSTRUMENT_TIMER | INSTRUMENT_ROWS; + + /* Evaluate if query is sampled or not */ + check_query_sampled(NULL, queryDesc->params); + + if (pg_tracing_instrument_buffers) + instrument_options |= INSTRUMENT_BUFFERS; + if (pg_tracing_instrument_wal) + instrument_options |= INSTRUMENT_WAL; + + if (current_trace.sampled > 0 && pg_tracing_enabled(exec_nested_level)) + { + /* + * In case of a cached plan, we haven't gone through neither parsing + * nor planner hook. Parallel workers will also start tracing from + * there. Create the top case in this case. + */ + initialize_trace_level_and_top_span(queryDesc->operation, NULL, NULL, queryDesc->sourceText, NULL); + + /* Start executorStart Span */ + executor_start_span_index = get_index_from_trace_spans(); + executor_span = get_span_from_index(executor_start_span_index); + begin_span(executor_span, SPAN_EXECUTOR_START, ¤t_trace, get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, NULL, exec_nested_level); + + /* + * Activate query instrumentation. Timer and rows instrumentation are + * mandatory. + */ + queryDesc->instrument_options = instrument_options; + } + + if (prev_ExecutorStart) + prev_ExecutorStart(queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); + + if (executor_start_span_index != -1) + { + /* Allocate instrumentation in the per-query context */ + if (queryDesc->totaltime == NULL) + { + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); + queryDesc->totaltime = InstrAlloc(1, instrument_options, false); + queryDesc->planstate->instrument = InstrAlloc(1, instrument_options, false); + MemoryContextSwitchTo(oldcxt); + } + end_span_index(executor_start_span_index, NULL); + } +} + +/* + * ExecutorRun hook: track nesting depth and create executor run span. 
+ * If the plan needs to create parallel workers, push the trace context in the parallel shared buffer. + */ +static void +pg_tracing_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, + uint64 count, bool execute_once) +{ + int span_index = -1; + + if (current_trace.sampled > 0 && pg_tracing_enabled(exec_nested_level)) + { + Span *executor_run_span; + + /* Start executor run span */ + span_index = get_index_from_trace_spans(); + executor_run_span = get_span_from_index(span_index); + begin_span(executor_run_span, SPAN_EXECUTOR_RUN, ¤t_trace, get_parent_id(exec_nested_level), + per_level_buffers[exec_nested_level].query_id, NULL, exec_nested_level); + + /* + * Executor run is used as the parent's span for nodes, store its' id + * in the per level buffer. + */ + per_level_buffers[exec_nested_level].executor_id = executor_run_span->span_id; + + /* + * If this query starts parallel worker, push the trace context for + * the child processes + */ + if (queryDesc->plannedstmt->parallelModeNeeded) + add_parallel_context(executor_run_span->span_id, per_level_buffers[exec_nested_level].query_id); + } + + exec_nested_level++; + PG_TRY(); + { + if (prev_ExecutorRun) + prev_ExecutorRun(queryDesc, direction, count, execute_once); + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); + } + PG_CATCH(); + { + exec_nested_level--; + handle_pg_error(span_index, queryDesc); + PG_RE_THROW(); + } + PG_END_TRY(); + exec_nested_level--; + + /* + * When a query is executing the pg_tracing_spans function, sampling will + * be disabled while we have an ongoing trace and current_trace_spans will + * be NULL. Check that span_index != -1 isn't enough in this case, we need + * to check that current_trace_spans still exists. 
+ */ + if (current_trace_spans != NULL) + end_span_index(span_index, NULL); +} + +/* + * ExecutorFinish hook: create executor finish span and track nesting depth + */ +static void +pg_tracing_ExecutorFinish(QueryDesc *queryDesc) +{ + int span_index = -1; + Span *span = NULL; + + if (current_trace.sampled > 0 && pg_tracing_enabled(exec_nested_level)) + { + span_index = get_index_from_trace_spans(); + span = get_span_from_index(span_index); + begin_span(span, SPAN_EXECUTOR_FINISH, ¤t_trace, get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, NULL, exec_nested_level); + } + + exec_nested_level++; + PG_TRY(); + { + if (prev_ExecutorFinish) + prev_ExecutorFinish(queryDesc); + else + standard_ExecutorFinish(queryDesc); + } + PG_CATCH(); + { + exec_nested_level--; + handle_pg_error(span_index, queryDesc); + PG_RE_THROW(); + } + PG_END_TRY(); + exec_nested_level--; + + end_span_index(span_index, NULL); +} + +/* + * ExecutorEnd hook: + * - process queryDesc to generate span from planstate + * - end top span for the current nested level + * - end tracing if we're at the root nested level + */ +static void +pg_tracing_ExecutorEnd(QueryDesc *queryDesc) +{ + int executor_end_index = -1; + + if (current_trace.sampled > 0 && pg_tracing_enabled(exec_nested_level)) + { + Span *executor_end_span; + + /* Create executor end span */ + executor_end_index = get_index_from_trace_spans(); + executor_end_span = get_span_from_index(executor_end_index); + begin_span(executor_end_span, SPAN_EXECUTOR_END, ¤t_trace, get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, + NULL, exec_nested_level); + + /* Query finished normally, send 0 as error code */ + process_query_desc(queryDesc, 0); + } + + if (prev_ExecutorEnd) + prev_ExecutorEnd(queryDesc); + else + standard_ExecutorEnd(queryDesc); + + if (executor_end_index > -1) + { + int64 end_span_ns = get_current_ns(); + + /* End top span, executor_end span and tracing */ + 
end_top_span(exec_nested_level, end_span_ns); + end_span_index(executor_end_index, &end_span_ns); + end_tracing(&end_span_ns); + } +} + +/* + * ProcessUtility hook + * + * Trace utility query if utility tracking is enabled and sampling was enabled + * during parse step. + */ +static void +pg_tracing_ProcessUtility(PlannedStmt *pstmt, const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, QueryEnvironment *queryEnv, + DestReceiver *dest, QueryCompletion *qc) +{ + Span *process_utility_span; + Span *previous_top_span; + int process_utility_span_index = -1; + int previous_top_span_index = -1; + int64 end_span_ns; + int64 start_span_ns; + + if (!pg_tracing_track_utility || current_trace.sampled == 0 || !pg_tracing_enabled(exec_nested_level)) + { + if (prev_ProcessUtility) + prev_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + else + standard_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + + /* + * Tracing may have been started within the utility call without going + * through ExecutorEnd (ex: prepare statement). Check and end tracing + * here in this case. + */ + if (!pg_tracing_mem_ctx->isReset) + { + Assert(current_trace.sampled); + Assert(exec_nested_level == 0); + end_tracing(NULL); + } + return; + } + + start_span_ns = get_current_ns(); + + /* + * Set a possible start for nested query. ProcessUtility may start a + * nested query and we can't use the statement start like we usually do. + * We artificially start nested 1us later to avoid overlapping spans. + */ + nested_query_start_ns = start_span_ns + 1000; + + /* Build the process utility span. 
*/ + process_utility_span_index = get_index_from_trace_spans(); + process_utility_span = get_span_from_index(process_utility_span_index); + begin_span(process_utility_span, SPAN_PROCESS_UTILITY, ¤t_trace, + get_parent_id(exec_nested_level), per_level_buffers[exec_nested_level].query_id, &start_span_ns, exec_nested_level); + + /* Extract and normalise the query */ + if (queryString != NULL && pstmt->stmt_location != -1 && pstmt->stmt_len > 0) + { + int query_len = pstmt->stmt_len; + const char *normalised_query = normalise_query(queryString, pstmt->stmt_location, &query_len); + + process_utility_span->operation_name_offset = add_str_to_trace_buffer(normalised_query, query_len); + } + + /* + * Set ProcessUtility span as the new top span for the nested queries that + * may be created. We need to keep the old top span index to end it at the + * end of the process utility hook. + */ + previous_top_span_index = per_level_buffers[exec_nested_level].top_span_index; + set_top_span(exec_nested_level, process_utility_span_index); + + exec_nested_level++; + PG_TRY(); + { + if (prev_ProcessUtility) + prev_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + else + standard_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + } + PG_CATCH(); + { + exec_nested_level--; + handle_pg_error(previous_top_span_index, NULL); + PG_RE_THROW(); + } + PG_END_TRY(); + end_span_ns = get_current_ns(); + + /* ProcessUtility may have created a top span, end it. 
*/ + if (exec_nested_level <= max_nested_level && per_level_buffers[exec_nested_level].top_span_index != -1) + { + Span *top_span = get_top_span_for_nested_level(exec_nested_level); + + if (top_span->ended == false) + end_top_span(exec_nested_level, end_span_ns); + } + exec_nested_level--; + + /* + * buffer may have been repalloced, refresh the process_utility_span + * variable + */ + process_utility_span = get_span_from_index(process_utility_span_index); + if (qc != NULL) + process_utility_span->node_counters.rows = qc->nprocessed; + /* ProcessUtility is the current top span, no need to call end_top_span */ + end_span(process_utility_span, &end_span_ns); + + /* Also end the previous top span */ + previous_top_span = get_span_from_index(previous_top_span_index); + end_span(previous_top_span, &end_span_ns); + + Assert(previous_top_span->start < process_utility_span->start); + Assert(get_span_end_ns(previous_top_span) >= get_span_end_ns(process_utility_span)); + + end_tracing(&end_span_ns); + + /* + * We may iterate through a list of nested ProcessUtility calls, update + * the nested query start. + */ + nested_query_start_ns = end_span_ns + 1000; +} + +/* + * Iterate over a list of planstate to generate span node + */ +static void +generate_member_nodes(PlanState **planstates, int nplans, planstateTraceContext * planstateTraceContext, uint64 parent_id) +{ + int j; + + for (j = 0; j < nplans; j++) + generate_span_from_planstate(planstates[j], planstateTraceContext, parent_id); +} + +/* + * Iterate over custom scan planstates to generate span node + */ +static void +generate_span_from_custom_scan(CustomScanState *css, planstateTraceContext * planstateTraceContext, uint64 parent_id) +{ + ListCell *cell; + + foreach(cell, css->custom_ps) + generate_span_from_planstate((PlanState *) lfirst(cell), planstateTraceContext, parent_id); +} + +/* + * With nested queries, the post parse hook will be called before we know + * the top span parent. 
We will only know and the parent when we generate span nodes from planstate. + * + * When those nodes are created, we attach the top span. + */ +static void +attach_top_span_to_new_parent(Span const *new_parent, int64 parent_end_ns_time) +{ + Span *child_top_span; + int64 child_end_ns_time; + + if (exec_nested_level + 1 > max_nested_level) + /* No child to attach */ + return; + + child_top_span = get_top_span_for_nested_level(exec_nested_level + 1); + Assert(child_top_span->start_ns_time > 0); + /* If child start after parent's end, don't link it */ + if (parent_end_ns_time < child_top_span->start_ns_time) + return; + child_end_ns_time = child_top_span->start_ns_time + child_top_span->duration_ns; + /* If child ends before parent's start, don't link it */ + if (child_end_ns_time < new_parent->start_ns_time) + return; + + /* Child span should already have ended */ + Assert(child_top_span->ended == true); + /* Link this child to the new parent */ + child_top_span->parent_id = new_parent->span_id; +} + +/* + * Create span node for the provided planstate + */ +static uint64 +create_span_node(PlanState *planstate, planstateTraceContext * planstateTraceContext, uint64 parent_id, + SpanType span_type, char *subplan_name) +{ + Span *span; + int span_index = -1; + char const *span_name; + char const *operation_name; + Plan const *plan = planstate->plan; + int64 span_end_ns; + int64 start_span_ns; + + /* + * Make sure stats accumulation is done. Function is a no-op if if was + * already done. 
+ */ + InstrEndLoop(planstate->instrument); + + /* We only create span node on node that were executed */ + Assert(!INSTR_TIME_IS_ZERO(planstate->instrument->firsttime)); + + start_span_ns = INSTR_TIME_GET_NANOSEC(planstate->instrument->firsttime); + span_index = get_index_from_trace_spans(); + span = get_span_from_index(span_index); + + begin_span(span, span_type, planstateTraceContext->trace, parent_id, per_level_buffers[exec_nested_level].query_id, + &start_span_ns, exec_nested_level); + + /* first tuple time */ + span->startup = planstate->instrument->startup * NS_PER_S; + + if (span_type == SPAN_NODE) + { + /* Generate names and store them */ + span_name = plan_to_span_name(plan); + operation_name = plan_to_operation(planstateTraceContext, planstate, span_name); + + span->name_offset = add_str_to_trace_buffer(span_name, strlen(span_name)); + span->operation_name_offset = add_str_to_trace_buffer(operation_name, strlen(operation_name)); + } + else if (subplan_name != NULL) + { + span->operation_name_offset = add_str_to_trace_buffer(subplan_name, strlen(subplan_name)); + } + + span->node_counters.rows = (int64) planstate->instrument->ntuples / planstate->instrument->nloops; + span->node_counters.nloops = (int64) planstate->instrument->nloops; + span->node_counters.buffer_usage = planstate->instrument->bufusage; + span->node_counters.wal_usage = planstate->instrument->walusage; + + span->plan_counters.startup_cost = plan->startup_cost; + span->plan_counters.total_cost = plan->total_cost; + span->plan_counters.plan_rows = plan->plan_rows; + span->plan_counters.plan_width = plan->plan_width; + + if (!planstate->state->es_finished) + { + /* + * If the query is in an unfinished state, it means that we're in an + * error handler. Stop the node instrumentation to get the latest + * known state. 
+ */ + InstrStopNode(planstate->instrument, planstate->state->es_processed); + InstrEndLoop(planstate->instrument); + span->sql_error_code = planstateTraceContext->sql_error_code; + } + span_end_ns = INSTR_TIME_GET_NANOSEC(planstate->instrument->firsttime); + Assert(planstate->instrument->total > 0); + span_end_ns += planstate->instrument->total * NS_PER_S; + end_span(span, &span_end_ns); + + /* + * If we have a Result node, make it the span parent of the next query + * span if we have any + * + * TODO: Do we have other node that could be the base for nested queries? + */ + if (nodeTag(plan) == T_Result || nodeTag(plan) == T_ProjectSet) + attach_top_span_to_new_parent(span, span_end_ns); + return span->span_id; +} + +/* + * Walk through the planstate tree generating a node span for each node. + * We pass possible error code to tag unfinished node with it + */ +static void +generate_span_from_planstate(PlanState *planstate, planstateTraceContext * planstateTraceContext, uint64 parent_id) +{ + ListCell *l; + Plan const *plan = planstate->plan; + uint64 span_id; + + /* The node was never executed, skip it */ + if (INSTR_TIME_IS_ZERO(planstate->instrument->firsttime)) + return; + + span_id = create_span_node(planstate, planstateTraceContext, parent_id, SPAN_NODE, NULL); + planstateTraceContext->ancestors = lcons(planstate->plan, planstateTraceContext->ancestors); + + /* Walk the lefttree */ + if (outerPlanState(planstate)) + generate_span_from_planstate(outerPlanState(planstate), planstateTraceContext, span_id); + /* Walk the righttree */ + if (innerPlanState(planstate)) + generate_span_from_planstate(innerPlanState(planstate), planstateTraceContext, span_id); + + /* Handle init plans */ + foreach(l, planstate->initPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(l); + PlanState *splan = sstate->planstate; + uint64_t init_span_id; + + if (INSTR_TIME_IS_ZERO(splan->instrument->firsttime)) + continue; + init_span_id = create_span_node(splan, 
planstateTraceContext, parent_id, SPAN_NODE_INIT_PLAN, sstate->subplan->plan_name); + generate_span_from_planstate(splan, planstateTraceContext, init_span_id); + } + + /* Handle sub plans */ + foreach(l, planstate->subPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(l); + PlanState *splan = sstate->planstate; + uint64_t subplan_span_id; + + if (INSTR_TIME_IS_ZERO(splan->instrument->firsttime)) + continue; + subplan_span_id = create_span_node(splan, planstateTraceContext, parent_id, SPAN_NODE_SUBPLAN, sstate->subplan->plan_name); + generate_span_from_planstate(splan, planstateTraceContext, subplan_span_id); + } + + /* Handle special nodes with children nodes */ + switch (nodeTag(plan)) + { + case T_Append: + generate_member_nodes(((AppendState *) planstate)->appendplans, + ((AppendState *) planstate)->as_nplans, planstateTraceContext, span_id); + break; + case T_MergeAppend: + generate_member_nodes(((MergeAppendState *) planstate)->mergeplans, + ((MergeAppendState *) planstate)->ms_nplans, planstateTraceContext, span_id); + break; + case T_BitmapAnd: + generate_member_nodes(((BitmapAndState *) planstate)->bitmapplans, + ((BitmapAndState *) planstate)->nplans, planstateTraceContext, span_id); + break; + case T_BitmapOr: + generate_member_nodes(((BitmapOrState *) planstate)->bitmapplans, + ((BitmapOrState *) planstate)->nplans, planstateTraceContext, span_id); + break; + case T_SubqueryScan: + generate_span_from_planstate(((SubqueryScanState *) planstate)->subplan, planstateTraceContext, span_id); + break; + case T_CustomScan: + generate_span_from_custom_scan((CustomScanState *) planstate, planstateTraceContext, span_id); + break; + default: + break; + } + planstateTraceContext->ancestors = list_delete_first(planstateTraceContext->ancestors); +} + +/* + * Add plan counters to the Datum output + */ +static int +add_plan_counters(const PlanCounters * plan_counters, int i, Datum *values) +{ + values[i++] = Float8GetDatumFast(plan_counters->startup_cost); + 
values[i++] = Float8GetDatumFast(plan_counters->total_cost); + values[i++] = Float8GetDatumFast(plan_counters->plan_rows); + values[i++] = Int32GetDatum(plan_counters->plan_width); + return i; +} + +/* + * Add node counters to the Datum output + */ +static int +add_node_counters(const NodeCounters * node_counters, int i, Datum *values) +{ + Datum wal_bytes; + char buf[256]; + double blk_read_time, + blk_write_time, + temp_blk_read_time, + temp_blk_write_time; + double generation_counter, + inlining_counter, + optimization_counter, + emission_counter; + int64 jit_created_functions; + + values[i++] = Int64GetDatumFast(node_counters->rows); + values[i++] = Int64GetDatumFast(node_counters->nloops); + + /* Buffer usage */ + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_hit); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_read); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_dirtied); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.shared_blks_written); + + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_hit); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_read); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_dirtied); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.local_blks_written); + + blk_read_time = INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.blk_read_time); + blk_write_time = INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.blk_write_time); + temp_blk_read_time = INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.temp_blk_read_time); + temp_blk_write_time = INSTR_TIME_GET_MILLISEC(node_counters->buffer_usage.temp_blk_write_time); + + values[i++] = Float8GetDatumFast(blk_read_time); + values[i++] = Float8GetDatumFast(blk_write_time); + values[i++] = Float8GetDatumFast(temp_blk_read_time); + values[i++] = Float8GetDatumFast(temp_blk_write_time); + + values[i++] = 
Int64GetDatumFast(node_counters->buffer_usage.temp_blks_read); + values[i++] = Int64GetDatumFast(node_counters->buffer_usage.temp_blks_written); + + /* WAL usage */ + values[i++] = Int64GetDatumFast(node_counters->wal_usage.wal_records); + values[i++] = Int64GetDatumFast(node_counters->wal_usage.wal_fpi); + snprintf(buf, sizeof buf, UINT64_FORMAT, node_counters->wal_usage.wal_bytes); + + /* Convert to numeric. */ + wal_bytes = DirectFunctionCall3(numeric_in, + CStringGetDatum(buf), + ObjectIdGetDatum(0), + Int32GetDatum(-1)); + values[i++] = wal_bytes; + + /* JIT usage */ + generation_counter = INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.generation_counter); + inlining_counter = INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.inlining_counter); + optimization_counter = INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.optimization_counter); + emission_counter = INSTR_TIME_GET_MILLISEC(node_counters->jit_usage.emission_counter); + jit_created_functions = node_counters->jit_usage.created_functions; + + values[i++] = Int64GetDatumFast(jit_created_functions); + values[i++] = Float8GetDatumFast(generation_counter); + values[i++] = Float8GetDatumFast(inlining_counter); + values[i++] = Float8GetDatumFast(optimization_counter); + values[i++] = Float8GetDatumFast(emission_counter); + + return i; +} + +/* + * Build the tuple for a Span and add it to the output + */ +static void +add_result_span(ReturnSetInfo *rsinfo, Span * span, + const char *qbuffer, Size qbuffer_size) +{ + Datum values[PG_TRACING_TRACES_COLS] = {0}; + bool nulls[PG_TRACING_TRACES_COLS] = {0}; + const char *span_name; + const char *operation_name; + const char *sql_error_code; + int i = 0; + + span_name = get_span_name(span, qbuffer, qbuffer_size); + operation_name = get_operation_name(span, qbuffer, qbuffer_size); + sql_error_code = unpack_sql_state(span->sql_error_code); + + Assert(span_name != NULL); + Assert(operation_name != NULL); + Assert(sql_error_code != NULL); + + values[i++] = 
UInt64GetDatum(span->trace_id); + values[i++] = UInt64GetDatum(span->parent_id); + values[i++] = UInt64GetDatum(span->span_id); + values[i++] = UInt64GetDatum(span->query_id); + values[i++] = CStringGetTextDatum(span_name); + values[i++] = CStringGetTextDatum(operation_name); + values[i++] = Int64GetDatumFast(span->start); + values[i++] = Int16GetDatum(span->start_ns); + + values[i++] = Int64GetDatumFast(span->duration_ns); + values[i++] = CStringGetTextDatum(sql_error_code); + values[i++] = UInt32GetDatum(span->be_pid); + values[i++] = UInt8GetDatum(span->nested_level); + values[i++] = UInt8GetDatum(span->subxact_count); + values[i++] = BoolGetDatum(span->is_top_span); + + if ((span->type >= SPAN_NODE && span->type <= SPAN_NODE_UNKNOWN) + || span->type == SPAN_PLANNER) + { + i = add_plan_counters(&span->plan_counters, i, values); + i = add_node_counters(&span->node_counters, i, values); + + values[i++] = Int64GetDatumFast(span->startup); + if (span->parameter_offset != -1 && qbuffer_size > 0 && qbuffer_size > span->parameter_offset) + values[i++] = CStringGetTextDatum(qbuffer + span->parameter_offset); + } + + for (int j = i; j < PG_TRACING_TRACES_COLS; j++) + nulls[j] = 1; + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); +} + +/* + * Return spans as a result set. + * + * Accept a consume parameter. When consume is set, + * we empty the shared buffer and truncate query text. 
+ */ +Datum +pg_tracing_spans(PG_FUNCTION_ARGS) +{ + bool consume; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Span *span; + const char *qbuffer; + Size qbuffer_size = 0; + + consume = PG_GETARG_BOOL(0); + if (!pg_tracing) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("pg_tracing must be loaded via shared_preload_libraries"))); + InitMaterializedSRF(fcinfo, 0); + + /* + * If this query was sampled and we're consuming tracing_spans buffer, the + * spans will target a query string that doesn't exist anymore in the + * query file. Better abort the sampling, clean ongoing traces and remove + * any possible parallel context pushed Since this will be called within + * an ExecutorRun, we will need to check for current_trace_spans at the + * end of the ExecutorRun hook. + */ + cleanup_tracing(); + + qbuffer = qtext_load_file(&qbuffer_size); + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + for (int i = 0; i < shared_spans->end; i++) + { + span = shared_spans->spans + i; + add_result_span(rsinfo, span, qbuffer, qbuffer_size); + } + + /* Consume is set, remove spans from the shared buffer */ + if (consume) + { + shared_spans->end = 0; + /* We only truncate query file if there's no active writers */ + if (s->n_writers == 0) + { + s->extent = 0; + pg_truncate(PG_TRACING_TEXT_FILE, 0); + } + else + s->stats.failed_truncates++; + } + s->stats.last_consume = GetCurrentTimestamp(); + SpinLockRelease(&s->mutex); + } + return (Datum) 0; +} + +/* + * Return statistics of pg_tracing. 
+ */ +Datum +pg_tracing_info(PG_FUNCTION_ARGS) +{ + pgTracingStats stats; + TupleDesc tupdesc; + Datum values[PG_TRACING_INFO_COLS] = {0}; + bool nulls[PG_TRACING_INFO_COLS] = {0}; + int i = 0; + + if (!pg_tracing) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("pg_tracing must be loaded via shared_preload_libraries"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + /* Get a copy of the pg_tracing stats */ + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + stats = s->stats; + SpinLockRelease(&s->mutex); + } + + values[i++] = Int64GetDatum(stats.traces); + values[i++] = Int64GetDatum(stats.spans); + values[i++] = Int64GetDatum(stats.dropped_spans); + values[i++] = Int64GetDatum(stats.failed_truncates); + values[i++] = TimestampTzGetDatum(stats.last_consume); + values[i++] = TimestampTzGetDatum(stats.stats_reset); + values[i++] = Float8GetDatum(pg_tracing_sample_rate); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + +/* + * Get an empty pgTracingStats + */ +static pgTracingStats +get_empty_pg_tracing_stats(void) +{ + pgTracingStats stats; + + stats.traces = 0; + stats.spans = 0; + stats.dropped_spans = 0; + stats.failed_truncates = 0; + stats.last_consume = 0; + stats.stats_reset = GetCurrentTimestamp(); + return stats; +} + +/* + * Reset pg_tracing statistics. + */ +Datum +pg_tracing_reset(PG_FUNCTION_ARGS) +{ + /* + * Reset statistics for pg_tracing since all entries are removed. 
+ */ + pgTracingStats empty_stats = get_empty_pg_tracing_stats(); + + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->stats = empty_stats; + SpinLockRelease(&s->mutex); + } + PG_RETURN_VOID(); +} diff --git a/contrib/pg_tracing/pg_tracing.conf b/contrib/pg_tracing/pg_tracing.conf new file mode 100644 index 0000000000..ad6b5e7c1d --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.conf @@ -0,0 +1 @@ +shared_preload_libraries = 'pg_tracing' diff --git a/contrib/pg_tracing/pg_tracing.control b/contrib/pg_tracing/pg_tracing.control new file mode 100644 index 0000000000..efa78a6610 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.control @@ -0,0 +1,5 @@ +# pg_tracing extension +comment = 'Distributed tracing for postgres' +default_version = '1.0' +module_pathname = '$libdir/pg_tracing' +relocatable = true diff --git a/contrib/pg_tracing/pg_tracing.h b/contrib/pg_tracing/pg_tracing.h new file mode 100644 index 0000000000..63675c4b58 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing.h @@ -0,0 +1,43 @@ +/*------------------------------------------------------------------------- + * pg_tracing.h + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing.h + *------------------------------------------------------------------------- + */ +#ifndef _PG_TRACING_H_ +#define _PG_TRACING_H_ + +#include "postgres.h" +#include "span.h" + +#include "storage/s_lock.h" + +/* + * Global statistics for pg_tracing + */ +typedef struct pgTracingStats +{ + int64 traces; /* number of traces processed */ + int64 spans; /* number of spans processed */ + int64 dropped_spans; /* number of dropped spans due to full buffer */ + int64 failed_truncates; /* number of failed query file truncates + * due to active writer */ + TimestampTz last_consume; /* Last time the shared spans buffer was + * consumed */ + TimestampTz stats_reset; /* Last time stats were reset */ +} pgTracingStats; + +/* + * Global shared state + */ +typedef 
struct pgTracingSharedState +{ + slock_t mutex; /* protects shared stats fields and + * shared_spans buffer */ + Size extent; /* current extent of query file */ + int n_writers; /* number of active writers to query file */ + pgTracingStats stats; /* global statistics for pg_tracing */ +} pgTracingSharedState; + +#endif diff --git a/contrib/pg_tracing/pg_tracing_explain.c b/contrib/pg_tracing/pg_tracing_explain.c new file mode 100644 index 0000000000..7f7155d2d3 --- /dev/null +++ b/contrib/pg_tracing/pg_tracing_explain.c @@ -0,0 +1,527 @@ +/*------------------------------------------------------------------------- + * + * pg_tracing_explain.c + * + * + * Copyright (c) 2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/pg_tracing/pg_tracing_explain.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pg_tracing_explain.h" + +#include "nodes/makefuncs.h" +#include "utils/builtins.h" +#include "parser/parsetree.h" +#include "nodes/extensible.h" +#include "utils/lsyscache.h" +#include "utils/ruleutils.h" + + +static void +ExplainTargetRel(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, Index rti, StringInfo str) +{ + char *objectname = NULL; + char *namespace = NULL; + RangeTblEntry *rte; + char *refname; + + List *rtable; + + rtable = planstate->state->es_range_table; + rte = rt_fetch(rti, rtable); + + refname = (char *) list_nth(planstateTraceContext->rtable_names, rti - 1); + if (refname == NULL) + refname = rte->eref->aliasname; + + switch (nodeTag(planstate->plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_IndexScan: + case T_IndexOnlyScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_ForeignScan: + case T_CustomScan: + case T_ModifyTable: + /* Assert it's on a real relation */ + Assert(rte->rtekind == RTE_RELATION); + objectname = get_rel_name(rte->relid); + break; + case T_FunctionScan: + { + 
FunctionScan *fscan = (FunctionScan *) planstate->plan; + + /* Assert it's on a RangeFunction */ + Assert(rte->rtekind == RTE_FUNCTION); + + /* + * If the expression is still a function call of a single + * function, we can get the real name of the function. + * Otherwise, punt. (Even if it was a single function call + * originally, the optimizer could have simplified it away.) + */ + if (list_length(fscan->functions) == 1) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) linitial(fscan->functions); + + if (IsA(rtfunc->funcexpr, FuncExpr)) + { + FuncExpr *funcexpr = (FuncExpr *) rtfunc->funcexpr; + Oid funcid = funcexpr->funcid; + + objectname = get_func_name(funcid); + } + } + } + break; + case T_TableFuncScan: + Assert(rte->rtekind == RTE_TABLEFUNC); + objectname = "xmltable"; + break; + case T_ValuesScan: + Assert(rte->rtekind == RTE_VALUES); + break; + case T_CteScan: + /* Assert it's on a non-self-reference CTE */ + Assert(rte->rtekind == RTE_CTE); + Assert(!rte->self_reference); + objectname = rte->ctename; + break; + case T_NamedTuplestoreScan: + Assert(rte->rtekind == RTE_NAMEDTUPLESTORE); + objectname = rte->enrname; + break; + case T_WorkTableScan: + /* Assert it's on a self-reference CTE */ + Assert(rte->rtekind == RTE_CTE); + Assert(rte->self_reference); + objectname = rte->ctename; + break; + default: + break; + } + + appendStringInfoString(str, " on"); + if (namespace != NULL) + appendStringInfo(str, " %s.%s", quote_identifier(namespace), + quote_identifier(objectname)); + else if (objectname != NULL) + appendStringInfo(str, " %s", quote_identifier(objectname)); + if (objectname == NULL || strcmp(refname, objectname) != 0) + appendStringInfo(str, " %s", quote_identifier(refname)); +} + + +static void +ExplainScanTarget(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const Scan *plan, StringInfo str) +{ + ExplainTargetRel(planstateTraceContext, planstate, plan->scanrelid, str); +} + +static void 
+ExplainModifyTarget(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const ModifyTable *plan, StringInfo str) +{ + ExplainTargetRel(planstateTraceContext, planstate, plan->nominalRelation, str); +} + +static void +ExplainIndexScanDetails(Oid indexId, ScanDirection indexorderdir, + StringInfo str) +{ + const char *indexname = get_rel_name(indexId); + + if (ScanDirectionIsBackward(indexorderdir)) + appendStringInfoString(str, " Backward"); + appendStringInfo(str, " using %s", quote_identifier(indexname)); +} + +static void +plan_to_rel_name(const PlanState *planstate, const planstateTraceContext * planstateTraceContext, StringInfo str) +{ + const Plan *plan = planstate->plan; + + switch (nodeTag(plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_SubqueryScan: + case T_FunctionScan: + case T_TableFuncScan: + case T_ValuesScan: + case T_CteScan: + case T_WorkTableScan: + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) plan, str); + break; + case T_ForeignScan: + case T_CustomScan: + if (((Scan *) plan)->scanrelid > 0) + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) plan, str); + break; + case T_IndexScan: + { + IndexScan *indexscan = (IndexScan *) plan; + + ExplainIndexScanDetails(indexscan->indexid, + indexscan->indexorderdir, + str); + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) indexscan, str); + } + break; + case T_IndexOnlyScan: + { + IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; + + ExplainIndexScanDetails(indexonlyscan->indexid, + indexonlyscan->indexorderdir, + str); + ExplainScanTarget(planstateTraceContext, planstate, (Scan *) indexonlyscan, str); + } + break; + case T_BitmapIndexScan: + { + BitmapIndexScan *bitmapindexscan = (BitmapIndexScan *) plan; + const char *indexname = get_rel_name(bitmapindexscan->indexid); + + appendStringInfo(str, " on %s", + quote_identifier(indexname)); + } + break; + 
case T_ModifyTable: + ExplainModifyTarget(planstateTraceContext, planstate, (ModifyTable *) plan, str); + break; + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + { + const char *jointype; + + switch (((Join *) plan)->jointype) + { + case JOIN_INNER: + jointype = "Inner"; + break; + case JOIN_LEFT: + jointype = "Left"; + break; + case JOIN_FULL: + jointype = "Full"; + break; + case JOIN_RIGHT: + jointype = "Right"; + break; + case JOIN_SEMI: + jointype = "Semi"; + break; + case JOIN_ANTI: + jointype = "Anti"; + break; + case JOIN_RIGHT_ANTI: + jointype = "Right Anti"; + break; + default: + jointype = "???"; + break; + } + if (((Join *) plan)->jointype != JOIN_INNER) + appendStringInfo(str, " %s Join", jointype); + else if (!IsA(plan, NestLoop)) + appendStringInfoString(str, " Join"); + } + break; + case T_SetOp: + { + const char *setopcmd; + + switch (((SetOp *) plan)->cmd) + { + case SETOPCMD_INTERSECT: + setopcmd = "Intersect"; + break; + case SETOPCMD_INTERSECT_ALL: + setopcmd = "Intersect All"; + break; + case SETOPCMD_EXCEPT: + setopcmd = "Except"; + break; + case SETOPCMD_EXCEPT_ALL: + setopcmd = "Except All"; + break; + default: + setopcmd = "???"; + break; + } + appendStringInfo(str, " %s", setopcmd); + } + break; + default: + break; + } +} + +static void +add_scan_qual(StringInfo str, const PlanState *planstate, List *qual, + const char *qlabel, List *ancestors, List *deparse_ctx, + bool useprefix) +{ + List *context; + char *exprstr; + Node *node; + + if (qual == NIL) + return; + + node = (Node *) make_ands_explicit(qual); + + /* Set up deparsing context */ + context = set_deparse_context_plan(deparse_ctx, + planstate->plan, + ancestors); + + /* Deparse the expression */ + appendStringInfoString(str, "|"); + exprstr = deparse_expression(node, context, useprefix, false); + appendStringInfoString(str, qlabel); + appendStringInfoString(str, exprstr); +} + +char const * +plan_to_operation(const planstateTraceContext * planstateTraceContext, const 
PlanState *planstate, const char *spanName) +{ + StringInfo operation_name = makeStringInfo(); + Plan const *plan = planstate->plan; + List *ancestors = planstateTraceContext->ancestors; + List *deparse_ctx = planstateTraceContext->deparse_ctx; + + if (plan->parallel_aware) + appendStringInfoString(operation_name, "Parallel "); + if (plan->async_capable) + appendStringInfoString(operation_name, "Async "); + appendStringInfoString(operation_name, spanName); + + plan_to_rel_name(planstate, planstateTraceContext, operation_name); + + switch (nodeTag(plan)) + { + case T_IndexScan: + add_scan_qual(operation_name, planstate, ((IndexScan *) plan)->indexqualorig, "Index Cond: ", + ancestors, deparse_ctx, false); + break; + case T_IndexOnlyScan: + add_scan_qual(operation_name, planstate, ((IndexOnlyScan *) plan)->indexqual, "Index Cond: ", + ancestors, deparse_ctx, false); + break; + case T_BitmapIndexScan: + add_scan_qual(operation_name, planstate, ((BitmapIndexScan *) plan)->indexqualorig, "Index Cond: ", + ancestors, deparse_ctx, false); + break; + case T_BitmapHeapScan: + add_scan_qual(operation_name, planstate, ((BitmapHeapScan *) plan)->bitmapqualorig, "Recheck Cond: ", + ancestors, deparse_ctx, false); + break; + case T_SeqScan: + case T_ValuesScan: + case T_CteScan: + case T_NamedTuplestoreScan: + case T_WorkTableScan: + case T_SubqueryScan: + case T_Gather: + case T_GatherMerge: + case T_FunctionScan: + case T_TableFuncScan: + case T_TidRangeScan: + case T_ForeignScan: + case T_CustomScan: + case T_Agg: + case T_WindowAgg: + case T_Group: + case T_Result: + add_scan_qual(operation_name, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + case T_NestLoop: + add_scan_qual(operation_name, planstate, ((NestLoop *) plan)->join.joinqual, "Join Filter : ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + case T_MergeJoin: + 
add_scan_qual(operation_name, planstate, ((MergeJoin *) plan)->mergeclauses, "Merge Cond: ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, ((MergeJoin *) plan)->join.joinqual, "Join Filter: ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + case T_HashJoin: + add_scan_qual(operation_name, planstate, ((HashJoin *) plan)->hashclauses, "Hash Cond: ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, ((HashJoin *) plan)->join.joinqual, "Join Filter: ", + ancestors, deparse_ctx, false); + add_scan_qual(operation_name, planstate, plan->qual, "Filter : ", + ancestors, deparse_ctx, false); + break; + default: + break; + } + + return operation_name->data; +} + +const char * +plan_to_span_name(const Plan *plan) +{ + const char *custom_name; + + switch (nodeTag(plan)) + { + case T_Result: + return "Result"; + case T_ProjectSet: + return "ProjectSet"; + case T_ModifyTable: + switch (((ModifyTable *) plan)->operation) + { + case CMD_INSERT: + return "Insert"; + case CMD_UPDATE: + return "Update"; + case CMD_DELETE: + return "Delete"; + case CMD_MERGE: + return "Merge"; + default: + return "???"; + } + case T_Append: + return "Append"; + case T_MergeAppend: + return "MergeAppend"; + case T_RecursiveUnion: + return "RecursiveUnion"; + case T_BitmapAnd: + return "BitmapAnd"; + case T_BitmapOr: + return "BitmapOr"; + case T_NestLoop: + return "NestedLoop"; + case T_MergeJoin: + return "Merge"; /* "Join" gets added by jointype switch */ + case T_HashJoin: + return "Hash"; /* "Join" gets added by jointype switch */ + case T_SeqScan: + return "SeqScan"; + case T_SampleScan: + return "SampleScan"; + case T_Gather: + return "Gather"; + case T_GatherMerge: + return "GatherMerge"; + case T_IndexScan: + return "IndexScan"; + case T_IndexOnlyScan: + return "IndexOnlyScan"; + case T_BitmapIndexScan: + return "BitmapIndexScan"; + case
T_BitmapHeapScan: + return "BitmapHeapScan"; + case T_TidScan: + return "TidScan"; + case T_TidRangeScan: + return "TidRangeScan"; + case T_SubqueryScan: + return "SubqueryScan"; + case T_FunctionScan: + return "FunctionScan"; + case T_TableFuncScan: + return "TableFunctionScan"; + case T_ValuesScan: + return "ValuesScan"; + case T_CteScan: + return "CTEScan"; + case T_NamedTuplestoreScan: + return "NamedTuplestoreScan"; + case T_WorkTableScan: + return "WorkTableScan"; + case T_ForeignScan: + switch (((ForeignScan *) plan)->operation) + { + case CMD_SELECT: + return "ForeignScan"; + case CMD_INSERT: + return "ForeignInsert"; + case CMD_UPDATE: + return "ForeignUpdate"; + case CMD_DELETE: + return "ForeignDelete"; + default: + return "???"; + } + case T_CustomScan: + custom_name = ((CustomScan *) plan)->methods->CustomName; + if (custom_name) + return psprintf("CustomScan (%s)", custom_name); + else + return "CustomScan"; + case T_Material: + return "Materialize"; + case T_Memoize: + return "Memoize"; + case T_Sort: + return "Sort"; + case T_IncrementalSort: + return "IncrementalSort"; + case T_Group: + return "Group"; + case T_Agg: + { + Agg *agg = (Agg *) plan; + + switch (agg->aggstrategy) + { + case AGG_PLAIN: + return "Aggregate"; + case AGG_SORTED: + return "GroupAggregate"; + case AGG_HASHED: + return "HashAggregate"; + case AGG_MIXED: + return "MixedAggregate"; + default: + return "Aggregate ???"; + } + } + case T_WindowAgg: + return "WindowAgg"; + case T_Unique: + return "Unique"; + case T_SetOp: + switch (((SetOp *) plan)->strategy) + { + case SETOP_SORTED: + return "SetOp"; + case SETOP_HASHED: + return "HashSetOp"; + default: + return "SetOp ???"; + } + case T_LockRows: + return "LockRows"; + case T_Limit: + return "Limit"; + case T_Hash: + return "Hash"; + default: + return "???"; + } +} diff --git a/contrib/pg_tracing/pg_tracing_explain.h b/contrib/pg_tracing/pg_tracing_explain.h new file mode 100644 index 0000000000..6551e15d61 --- /dev/null +++ 
b/contrib/pg_tracing/pg_tracing_explain.h @@ -0,0 +1,20 @@ +#ifndef _PG_TRACING_EXPLAIN_H_ +#define _PG_TRACING_EXPLAIN_H_ + +#include "span.h" +#include "executor/execdesc.h" + +/* Context needed when generating spans from planstate */ +typedef struct planstateTraceContext +{ + pgTracingTrace *trace; + int sql_error_code; + List *ancestors; + List *deparse_ctx; + List *rtable_names; +} planstateTraceContext; + +extern const char *plan_to_span_name(const Plan *plan); +extern const char *plan_to_operation(const planstateTraceContext * planstateTraceContext, const PlanState *planstate, const char *spanName); + +#endif diff --git a/contrib/pg_tracing/query_process.c b/contrib/pg_tracing/query_process.c new file mode 100644 index 0000000000..0d6e93bbe5 --- /dev/null +++ b/contrib/pg_tracing/query_process.c @@ -0,0 +1,473 @@ +#include "query_process.h" +#include "pg_tracing.h" + +#include +#include +#include "parser/scanner.h" +#include "nodes/extensible.h" + +/* + * Initialize traceparent parameter fields + */ +void +initialize_traceparent(pgTracingTrace * traceparent_parameter) +{ + traceparent_parameter->sampled = 0; + traceparent_parameter->trace_id = 0; + traceparent_parameter->parent_id = 0; +} + +/* + * Extract traceparent parameters from SQLCommenter + * + * We're expecting the query to start with a SQLComment containing the + * traceparent parameter + * "/\*traceparent='00-00000000000000000000000000000009-0000000000000005-01'*\/ + * SELECT 1;" + * Traceparent has the following format: version-traceid-parentid-sampled + * We also accept SQLCommenter as a parameter. In this case, we don't have + * the start and end of comments. + * + * if is_parameter is true, the SQLComment is passed as a query parameter and + * won't have the comment start and end. 
+ */ +void +extract_traceparent(pgTracingTrace * pgTracingTrace, const char *sqlcomment_str, bool is_parameter) +{ + const char *expected_start = "/*"; + const char *traceparent; + const char *end_sqlcomment; + char *endptr; + + if (!is_parameter) + { + /* + * Look for the start of a comment We're expecting the comment to be + * at the very start of the query + */ + for (size_t i = 0; i < strlen(expected_start); i++) + { + if (sqlcomment_str[i] != expected_start[i]) + return; + } + + /* + * Check that we have an end + */ + end_sqlcomment = strstr(sqlcomment_str, "*/"); + if (end_sqlcomment == NULL) + return; + } + else + end_sqlcomment = sqlcomment_str + strlen(sqlcomment_str); + + /* + * Locate traceparent parameter and make sure it has the expected size + * "traceparent" + "=" + '' -> 13 characters + * 00-00000000000000000000000000000009-0000000000000005-01 -> 55 + * characters + */ + traceparent = strstr(sqlcomment_str, "traceparent='"); + if (traceparent == NULL || traceparent > end_sqlcomment || end_sqlcomment - traceparent < 55 + 13) + return; + + /* + * Move to the start of the traceparent values + */ + traceparent = traceparent + 13; + + /* + * Check that '-' are at the expected places + */ + if (traceparent[2] != '-' || traceparent[35] != '-' || traceparent[52] != '-') + return; + + /* + * Parse traceparent parameters + */ + errno = 0; + pgTracingTrace->trace_id = strtol(&traceparent[3], &endptr, 16); + if (endptr != traceparent + 35 || errno) + return; + pgTracingTrace->parent_id = strtol(&traceparent[36], &endptr, 16); + if (endptr != traceparent + 52 || errno) + return; + pgTracingTrace->sampled = strtol(&traceparent[53], &endptr, 16); + if (endptr != traceparent + 55 || errno) + + /* + * Just to be sure, reset sampled on error + */ + pgTracingTrace->sampled = 0; +} + + +/* + * comp_location: comparator for qsorting LocationLen structs by location + */ +static int +comp_location(const void *a, const void *b) +{ + int l = ((const LocationLen *) 
a)->location; + int r = ((const LocationLen *) b)->location; + + if (l < r) + return -1; + else if (l > r) + return +1; + else + return 0; +} + + +/* + * Normalise query and fill paramStr. + * Normalised query will separate tokens with a single space and + * parameters are replaced by $1, $2... + * Parameters are put in the paramStr which will contain all parameters values + * using the format: "$1 = 0, $2 = 'v'" + */ +const char * +normalise_query_parameters(const JumbleState *jstate, const char *query, + int query_loc, int *query_len_p, char **paramStr) +{ + char *norm_query; + int query_len = *query_len_p; + int norm_query_buflen, /* Space allowed for norm_query */ + n_quer_loc = 0; + LocationLen *locs; + core_yyscan_t yyscanner; + core_yy_extra_type yyextra; + core_YYSTYPE yylval; + YYLTYPE yylloc; + int current_loc = 0; + StringInfoData buf; + + initStringInfo(&buf); + + norm_query_buflen = query_len + jstate->clocations_count * 10; + Assert(norm_query_buflen > 0); + locs = jstate->clocations; + + /* Allocate result buffer */ + norm_query = palloc(norm_query_buflen + 1); + + if (jstate->clocations_count > 1) + qsort(jstate->clocations, jstate->clocations_count, + sizeof(LocationLen), comp_location); + + /* initialize the flex scanner --- should match raw_parser() */ + yyscanner = scanner_init(query + query_loc, + &yyextra, + &ScanKeywords, + ScanKeywordTokens); + + for (;;) + { + /* + * Don't read past the end of locs: once every parameter has been + * consumed, current_loc == clocations_count while tokens may remain + * (e.g. a trailing ORDER BY clause). + */ + int loc = (current_loc < jstate->clocations_count) ? locs[current_loc].location : -1; + int tok; + + loc -= query_loc; + + tok = core_yylex(&yylval, &yylloc, yyscanner); + + /* + * We should not hit end-of-string, but if we do, behave sanely + */ + if (tok == 0) + break; /* out of inner for-loop */ + if (yylloc > query_len) + break; + + /* + * We should find the token position exactly, but if we somehow run + * past it, work with that. + */ + if (current_loc < jstate->clocations_count && yylloc >= loc) + { + appendStringInfo(&buf, + "%s$%d = ", + current_loc > 0 ? 
", " : "", + current_loc + 1); + if (query[loc] == '-') + { + /* + * It's a negative value - this is the one and only case where + * we replace more than a single token. + * + * Do not compensate for the core system's special-case + * adjustment of location to that of the leading '-' operator + * in the event of a negative constant. It is also useful for + * our purposes to start from the minus symbol. In this way, + * queries like "select * from foo where bar = 1" and "select * + * from foo where bar = -2" will have identical normalized + * query strings. + */ + appendStringInfoChar(&buf, '-'); + tok = core_yylex(&yylval, &yylloc, yyscanner); + if (tok == 0) + break; /* out of inner for-loop */ + } + if (yylloc > 0 && yyextra.scanbuf[yylloc - 1] == ' ' && n_quer_loc > 0) + { + norm_query[n_quer_loc++] = ' '; + } + + /* + * Append the current parameter $x in the normalised query + */ + n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d", + current_loc + 1 + jstate->highest_extern_param_id); + + appendStringInfoString(&buf, yyextra.scanbuf + yylloc); + + current_loc++; + } + else + { + int to_copy; + + if (yylloc > 0 && yyextra.scanbuf[yylloc - 1] == ' ' && n_quer_loc > 0) + { + norm_query[n_quer_loc++] = ' '; + } + to_copy = strlen(yyextra.scanbuf + yylloc); + Assert(n_quer_loc + to_copy < norm_query_buflen + 1); + memcpy(norm_query + n_quer_loc, yyextra.scanbuf + yylloc, to_copy); + n_quer_loc += to_copy; + } + } + scanner_finish(yyscanner); + + *query_len_p = n_quer_loc; + norm_query[n_quer_loc] = '\0'; + *paramStr = buf.data; + return norm_query; +} + +/* + * Normalise query: tokens will be separated by a single space + */ +const char * +normalise_query(const char *query, int query_loc, int *query_len_p) +{ + char *norm_query; + int query_len = *query_len_p; + int norm_query_buflen = query_len; + int n_quer_loc = 0; + core_yyscan_t yyscanner; + core_yy_extra_type yyextra; + core_YYSTYPE yylval; + YYLTYPE yylloc; + + /* Allocate result buffer */ + norm_query = 
palloc(norm_query_buflen + 2); + + /* initialize the flex scanner --- should match raw_parser() */ + yyscanner = scanner_init(query + query_loc, &yyextra, &ScanKeywords, ScanKeywordTokens); + for (;;) + { + int tok; + int to_copy; + + tok = core_yylex(&yylval, &yylloc, yyscanner); + + if (tok == 0) + break; /* out of inner for-loop */ + if (yylloc > query_len) + break; + + if (yylloc > 0 && yyextra.scanbuf[yylloc - 1] == ' ' && n_quer_loc > 0) + { + norm_query[n_quer_loc++] = ' '; + } + to_copy = strlen(yyextra.scanbuf + yylloc); + Assert(n_quer_loc + to_copy < norm_query_buflen + 2); + memcpy(norm_query + n_quer_loc, yyextra.scanbuf + yylloc, to_copy); + n_quer_loc += to_copy; + } + scanner_finish(yyscanner); + + *query_len_p = n_quer_loc; + norm_query[n_quer_loc] = '\0'; + return norm_query; +} + +/* + * Store text in the pg_tracing stat file + */ +bool +text_store_file(pgTracingSharedState * pg_tracing, const char *text, int text_len, + Size *query_offset) +{ + Size off; + int fd; + + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + off = s->extent; + s->extent += text_len + 1; + s->n_writers++; + SpinLockRelease(&s->mutex); + } + + /* + * Don't allow the file to grow larger than what qtext_load_file can + * (theoretically) handle. This has been seen to be reachable on 32-bit + * platforms. 
+ */ + if (unlikely(text_len >= MaxAllocHugeSize - off)) + { + errno = EFBIG; /* not quite right, but it'll do */ + fd = -1; + goto error; + } + + /* Now write the data into the successfully-reserved part of the file */ + fd = OpenTransientFile(PG_TRACING_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) + goto error; + + if (pg_pwrite(fd, text, text_len, off) != text_len) + goto error; + if (pg_pwrite(fd, "\0", 1, off + text_len) != 1) + goto error; + + CloseTransientFile(fd); + + /* Mark our write complete */ + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->n_writers--; + SpinLockRelease(&s->mutex); + } + + /* + * Set offset once write was succesful + */ + *query_offset = off; + + return true; + +error: + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + + if (fd >= 0) + CloseTransientFile(fd); + + /* Mark our write complete */ + { + volatile pgTracingSharedState *s = (volatile pgTracingSharedState *) pg_tracing; + + SpinLockAcquire(&s->mutex); + s->n_writers--; + SpinLockRelease(&s->mutex); + } + + return false; +} + +/* + * Read the external query text file into a malloc'd buffer. + * + * Returns NULL (without throwing an error) if unable to read, eg file not + * there or insufficient memory. + * + * On success, the buffer size is also returned into *buffer_size. + * + * This can be called without any lock on pgss->lock, but in that case the + * caller is responsible for verifying that the result is sane. 
+ */ +const char * +qtext_load_file(Size *buffer_size) +{ + char *buf; + int fd; + struct stat stat; + Size nread; + + fd = OpenTransientFile(PG_TRACING_TEXT_FILE, O_RDONLY | PG_BINARY); + if (fd < 0) + { + if (errno != ENOENT) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + return NULL; + } + + /* Get file length */ + if (fstat(fd, &stat)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + CloseTransientFile(fd); + return NULL; + } + + /* Allocate buffer; beware that off_t might be wider than size_t */ + if (stat.st_size <= MaxAllocHugeSize) + buf = (char *) malloc(stat.st_size); + else + buf = NULL; + if (buf == NULL) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Could not allocate enough memory to read file \"%s\".", + PG_TRACING_TEXT_FILE))); + CloseTransientFile(fd); + return NULL; + } + + /* + * OK, slurp in the file. Windows fails if we try to read more than + * INT_MAX bytes at once, and other platforms might not like that either, + * so read a very large file in 1GB segments. + */ + nread = 0; + while (nread < stat.st_size) + { + int toread = Min(1024 * 1024 * 1024, stat.st_size - nread); + + /* + * If we get a short read and errno doesn't get set, the reason is + * probably that garbage collection truncated the file since we did + * the fstat(), so we don't log a complaint --- but we don't return + * the data, either, since it's most likely corrupt due to concurrent + * writes from garbage collection. 
+ */ + errno = 0; + if (read(fd, buf + nread, toread) != toread) + { + if (errno) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + PG_TRACING_TEXT_FILE))); + free(buf); + CloseTransientFile(fd); + return NULL; + } + nread += toread; + } + + if (CloseTransientFile(fd) != 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", PG_TRACING_TEXT_FILE))); + + *buffer_size = nread; + return buf; +} diff --git a/contrib/pg_tracing/query_process.h b/contrib/pg_tracing/query_process.h new file mode 100644 index 0000000000..04328b8cf2 --- /dev/null +++ b/contrib/pg_tracing/query_process.h @@ -0,0 +1,29 @@ +#ifndef _QUERY_PROCESS_H_ +#define _QUERY_PROCESS_H_ + +#include "pg_tracing.h" +#include "nodes/queryjumble.h" +#include "parser/parse_node.h" + +/* + * Normalise query: - Comments are removed - Constants are replaced by $x - + * All tokens are separated by a single space + */ +extern const char *normalise_query_parameters(const JumbleState *jstate, const char *query, + int query_loc, int *query_len_p, char **paramStr); + +extern void extract_traceparent(pgTracingTrace * pgTracingTrace, const char *query_str, bool is_parameter); + +/* + * Normalise simple query + */ +extern const char *normalise_query(const char *query, int query_loc, int *query_len_p); +extern bool text_store_file(pgTracingSharedState * pg_tracing, const char *query, + int query_len, Size *query_offset); +extern const char *qtext_load_file(Size *buffer_size); +extern void initialize_traceparent(pgTracingTrace * traceparent_parameter); + +/* Location of external text file */ +#define PG_TRACING_TEXT_FILE PG_STAT_TMP_DIR "/pg_tracing.stat" + +#endif diff --git a/contrib/pg_tracing/span.c b/contrib/pg_tracing/span.c new file mode 100644 index 0000000000..b44b38ce8c --- /dev/null +++ b/contrib/pg_tracing/span.c @@ -0,0 +1,295 @@ +#include "span.h" + +#include "nodes/extensible.h" 
+#include "common/pg_prng.h" + + +/* + * Convert a node CmdType to the matching SpanType + */ +SpanType +command_type_to_span_type(CmdType cmd_type) +{ + switch (cmd_type) + { + case CMD_SELECT: + return SPAN_NODE_SELECT; + case CMD_INSERT: + return SPAN_NODE_INSERT; + case CMD_UPDATE: + return SPAN_NODE_UPDATE; + case CMD_DELETE: + return SPAN_NODE_DELETE; + case CMD_MERGE: + return SPAN_NODE_MERGE; + case CMD_UTILITY: + return SPAN_NODE_UTILITY; + case CMD_NOTHING: + return SPAN_NODE_NOTHING; + case CMD_UNKNOWN: + return SPAN_NODE_UNKNOWN; + } + return SPAN_NODE_UNKNOWN; +} + +/* + * Initialize span fields + */ +static void +initialize_span_fields(Span * span, SpanType type, const pgTracingTrace * trace, uint64 parent_id, uint64 query_id, int nested_level) +{ + span->trace_id = trace->trace_id; + span->type = type; + + /* + * If parent id is unset, it means that there's no propagated trace + * informations from the caller. In this case, this is the top span, + * span_id == parent_id and we reuse the generated trace id for the span + * id. + */ + if (parent_id == 0) + { + span->parent_id = trace->trace_id; + span->span_id = trace->trace_id; + } + else + { + span->parent_id = parent_id; + span->span_id = pg_prng_uint64(&pg_global_prng_state); + } + span->name_offset = -1; + span->operation_name_offset = -1; + span->parameter_offset = -1; + span->sql_error_code = 0; + span->startup = 0; + span->be_pid = MyProcPid; + span->subxact_count = MyProc->subxidStatus.count; + span->start_ns_time = 0; + span->duration_ns = 0; + span->ended = false; + span->is_top_span = false; + span->query_id = query_id; + span->nested_level = nested_level; + memset(&span->node_counters, 0, sizeof(NodeCounters)); + memset(&span->plan_counters, 0, sizeof(PlanCounters)); + + /* + * Store the starting buffer for planner and process utility spans + * + * TODO: Do we need wal buffer for planner? 
+ */ + if (type == SPAN_PLANNER || span->type == SPAN_PROCESS_UTILITY) + { + /* TODO Do we need to track wal for planner? */ + span->node_counters.buffer_usage = pgBufferUsage; + span->node_counters.wal_usage = pgWalUsage; + } +} + +/* + * Initialize span and set span starting time. + * + * A span needs a start timestamp and a duration. + * Given that some spans can have a very short duration (less than 1us), we + * need to rely on monotonic clock as much as possible to have the best precision. + * + * For that, after we've established that a query was sampled, we get both the wall clock + * and the monotonic clock at the start of a trace. They are both stored in pgTracingTrace. + * All subsequent times will use the monotonic clock. + * When we need to get the start of a span, we use the starting wall clock as a starting point + * and add the time between the two monotonic clocks. + * + * start_span is an optional nanosecond time to set for. If nothing is provided, we use the current monotonic clock. + * This parameter is mostly used when generating spans from planstate as we need to rely on the query instrumentation to find the node start. 
+ */ +void +begin_span(Span * span, SpanType type, const pgTracingTrace * trace, uint64 parent_id, uint64 query_id, const int64 *start_span, int nested_level) +{ + int64 start_ns_time; + int64 ns_since_trace_start; + + initialize_span_fields(span, type, trace, parent_id, query_id, nested_level); + + /* If no start span is provided, get the current one */ + if (start_span == NULL) + start_ns_time = get_current_ns(); + else + start_ns_time = *start_span; + + /* We use duration to temporarily store ns at span start ns */ + span->start_ns_time = start_ns_time; + + /* Get the ns between trace start and span start */ + ns_since_trace_start = (start_ns_time - trace->start_trace.ns); + + /* Fill span starts */ + span->start = trace->start_trace.ts + ns_since_trace_start / NS_PER_US; + span->start_ns = ns_since_trace_start % NS_PER_US; +} + +/* + * Set span duration and accumulated buffers + * end_span_input is optional, if NULL is passed, we use + * the current time + */ +void +end_span(Span * span, const int64 *end_span_input) +{ + BufferUsage buffer_usage; + WalUsage wal_usage; + int64 end_span_ns; + + Assert(span->start_ns_time > 0); + Assert(span->trace_id > 0); + + /* Span should be ended only once */ + Assert(span->ended == false); + span->ended = true; + + /* Set span duration with the end time before substrating the start */ + if (end_span_input == NULL) + end_span_ns = get_current_ns(); + else + end_span_ns = *end_span_input; + span->duration_ns = end_span_ns - span->start_ns_time; + + if (span->type == SPAN_PLANNER || span->type == SPAN_PROCESS_UTILITY) + { + /* calc differences of buffer counters. */ + memset(&buffer_usage, 0, sizeof(BufferUsage)); + BufferUsageAccumDiff(&buffer_usage, &pgBufferUsage, + &span->node_counters.buffer_usage); + span->node_counters.buffer_usage = buffer_usage; + + /* calc differences of WAL counters. 
*/ + memset(&wal_usage, 0, sizeof(wal_usage)); + WalUsageAccumDiff(&wal_usage, &pgWalUsage, + &span->node_counters.wal_usage); + span->node_counters.wal_usage = wal_usage; + } +} + +/* + * Get the name of a span. + * If it is a node span, the name may be pulled from the stat file. + */ +const char * +get_span_name(const Span * span, const char *qbuffer, Size qbuffer_size) +{ + /* The offset must lie strictly inside the buffer: offset == qbuffer_size would read one byte past the end */ + if (span->name_offset != -1 && qbuffer_size > 0 && span->name_offset < qbuffer_size) + return qbuffer + span->name_offset; + + switch (span->type) + { + case SPAN_POST_PARSE: + return "Post Parse"; + case SPAN_PARSE: + return "Parse"; + case SPAN_PLANNER: + return "Planner"; + case SPAN_FUNCTION: + return "Function"; + case SPAN_PROCESS_UTILITY: + return "ProcessUtility"; + case SPAN_EXECUTOR_START: + return "Executor"; + case SPAN_EXECUTOR_RUN: + return "Executor"; + case SPAN_EXECUTOR_END: + return "Executor"; + case SPAN_EXECUTOR_FINISH: + return "Executor"; + + case SPAN_NODE_INIT_PLAN: + return "InitPlan"; + case SPAN_NODE_SUBPLAN: + return "SubPlan"; + + case SPAN_NODE_SELECT: + return "Select"; + case SPAN_NODE_INSERT: + return "Insert"; + case SPAN_NODE_UPDATE: + return "Update"; + case SPAN_NODE_DELETE: + return "Delete"; + case SPAN_NODE_MERGE: + return "Merge"; + case SPAN_NODE_UTILITY: + return "Utility"; + case SPAN_NODE_NOTHING: + return "Nothing"; + case SPAN_NODE_UNKNOWN: + return "Unknown"; + case SPAN_NODE: + return "Node"; + } + return "???"; +} + +/* + * Get the operation of a span. + * For node span, the name may be pulled from the stat file. 
+ */ +const char * +get_operation_name(const Span * span, const char *qbuffer, Size qbuffer_size) +{ + if (span->operation_name_offset != -1 && qbuffer_size > 0 && span->operation_name_offset <= qbuffer_size) + return qbuffer + span->operation_name_offset; + + switch (span->type) + { + case SPAN_POST_PARSE: + return "Post Parse"; + case SPAN_PARSE: + return "Parse"; + case SPAN_PLANNER: + return "Planner"; + case SPAN_FUNCTION: + return "Function"; + case SPAN_PROCESS_UTILITY: + return "ProcessUtility"; + case SPAN_EXECUTOR_START: + return "Start"; + case SPAN_EXECUTOR_RUN: + return "Run"; + case SPAN_EXECUTOR_END: + return "End"; + case SPAN_EXECUTOR_FINISH: + return "Finish"; + case SPAN_NODE_SELECT: + case SPAN_NODE_INSERT: + case SPAN_NODE_UPDATE: + case SPAN_NODE_DELETE: + case SPAN_NODE_MERGE: + case SPAN_NODE_UTILITY: + case SPAN_NODE_NOTHING: + case SPAN_NODE_INIT_PLAN: + case SPAN_NODE_SUBPLAN: + case SPAN_NODE_UNKNOWN: + case SPAN_NODE: + return "Node"; + } + return "Unknown type"; +} + +/* +* Get the current nanoseconds value +*/ +int64 +get_current_ns(void) +{ + instr_time t; + + INSTR_TIME_SET_CURRENT(t); + return INSTR_TIME_GET_NANOSEC(t); +} + +/* +* Get the end of a span in nanoseconds +*/ +int64 +get_span_end_ns(const Span * span) +{ + return span->start_ns_time + span->duration_ns; +} diff --git a/contrib/pg_tracing/span.h b/contrib/pg_tracing/span.h new file mode 100644 index 0000000000..e9df64862f --- /dev/null +++ b/contrib/pg_tracing/span.h @@ -0,0 +1,159 @@ +/*------------------------------------------------------------------------- + * contrib/pg_tracing/span.h + * + * Header for span. 
+ * + * IDENTIFICATION + * contrib/pg_tracing/span.h + * + *------------------------------------------------------------------------- + */ +#ifndef _SPAN_H_ +#define _SPAN_H_ + +#include "postgres.h" + +#include "jit/jit.h" +#include "pgstat.h" +#include "access/transam.h" + +/* + * SpanType: Type of the span + */ +typedef enum +{ + SPAN_PARSE, /* Wraps query parsing */ + SPAN_POST_PARSE, /* Wraps post parsing */ + SPAN_PLANNER, /* Wraps planner execution in planner hook */ + SPAN_FUNCTION, /* Wraps function in fmgr hook */ + SPAN_PROCESS_UTILITY, /* Wraps ProcessUtility execution */ + + SPAN_EXECUTOR_START, /* Executor Spans wrapping the matching hooks */ + SPAN_EXECUTOR_RUN, + SPAN_EXECUTOR_END, + SPAN_EXECUTOR_FINISH, + + SPAN_NODE_INIT_PLAN, + SPAN_NODE_SUBPLAN, + + SPAN_NODE, /* Represents a node execution, generated from + * planstate */ + + SPAN_NODE_SELECT, /* Query Span types. They are created from the + * query cmdType */ + SPAN_NODE_INSERT, + SPAN_NODE_UPDATE, + SPAN_NODE_DELETE, + SPAN_NODE_MERGE, + SPAN_NODE_UTILITY, + SPAN_NODE_NOTHING, + SPAN_NODE_UNKNOWN, +} SpanType; + + +/* + * Counters extracted from query instrumentation + */ +typedef struct NodeCounters +{ + int64 rows; /* # of tuples processed */ + int64 nloops; /* # of cycles for this node */ + + BufferUsage buffer_usage; /* total buffer usage for this node */ + WalUsage wal_usage; /* total WAL usage for this node */ + JitInstrumentation jit_usage; /* total JIT usage for this node */ +} NodeCounters; + +/* + * Counters extracted from query's plan + */ +typedef struct PlanCounters +{ + /* + * estimated execution costs for plan (see costsize.c for more info) + */ + Cost startup_cost; /* cost expended before fetching any tuples */ + Cost total_cost; /* total cost (assuming all tuples fetched) */ + + /* + * planner's estimate of result size of this plan step + */ + Cardinality plan_rows; /* number of rows plan is expected to emit */ + int plan_width; /* average row width in bytes */ +} 
PlanCounters; + +/* + * Start time of a trace. We store both wall clock and monotonic clock. + */ +typedef struct +{ + TimestampTz ts; /* Wall clock at the start of the trace */ + int64 ns; /* Monotonic clock at the start of the trace */ +} pgTracingStartTime; + +typedef struct pgTracingTrace +{ + uint64 trace_id; /* Id of the trace */ + uint64 parent_id; /* Span id of the parent */ + int sampled; /* Is current statement sampled? */ + uint64 query_id; /* Query id of the current statement */ + pgTracingStartTime start_trace; +} pgTracingTrace; + +/* + * The Span data structure represents an operation with a start, a duration + * and metadatas. + */ +typedef struct Span +{ + uint64 trace_id; /* Trace id extracted from the SQLCommenter's + * traceparent */ + uint64 span_id; /* Span Identifier generated from a random + * uint64 */ + uint64 parent_id; /* Span's parent id. For the top span, it's + * extracted from SQLCommenter's traceparent. + * For other spans, we pass the parent's span. */ + uint64 query_id; /* QueryId of the trace query if available */ + + TimestampTz start; /* Start of the span. */ + uint16 start_ns; /* Leftover nanoseconds of span start */ + int64 start_ns_time; /* Nanoseconds at start of the span. */ + + int64 duration_ns; /* Duration of the span in nanoseconds. */ + SpanType type; /* Type of the span. Used to generate the + * span's name for all spans except SPAN_NODE. */ + int be_pid; /* Pid of the backend process */ + bool is_top_span; + bool ended; /* Track if the span was already ended */ + uint8 nested_level; /* Only used for debugging */ + uint8 subxact_count; /* Active count of backend's subtransaction */ + + /* + * We store variable size metadata in an external file. Those represent + * the position of NULL terminated strings in the file. Set to -1 if + * unused. 
+ */ + Size name_offset; /* span name offset in external file */ + Size operation_name_offset; /* operation name offset in external + * file */ + Size parameter_offset; /* parameters offset in external file */ + + PlanCounters plan_counters; /* Counters with plan costs */ + NodeCounters node_counters; /* Counters with node costs (jit, wal, + * buffers) */ + int64 startup; /* Time to the first tuple */ + int sql_error_code; /* query error code extracted from ErrorData, + * 0 if query was successful */ +} Span; + +extern void begin_span(Span * span, SpanType type, const pgTracingTrace * trace, uint64 parent_id, uint64 query_id, const int64 *start_span, int nested_level); +extern void end_span(Span * span, const int64 *end_time); + +extern SpanType command_type_to_span_type(CmdType cmd_type); +extern const char *get_span_name(const Span * span, const char *qbuffer, Size qbuffer_size); +extern const char *get_operation_name(const Span * span, const char *qbuffer, Size qbuffer_size); + +extern int64 get_current_ns(void); +extern int64 get_span_end_ns(const Span * span); + +#endif diff --git a/contrib/pg_tracing/sql/cleanup.sql b/contrib/pg_tracing/sql/cleanup.sql new file mode 100644 index 0000000000..beb88da53f --- /dev/null +++ b/contrib/pg_tracing/sql/cleanup.sql @@ -0,0 +1,4 @@ +-- Drop test table and test functions +DROP TABLE pg_tracing_test; +DROP function test_function; +DROP EXTENSION pg_tracing; diff --git a/contrib/pg_tracing/sql/nested.sql b/contrib/pg_tracing/sql/nested.sql new file mode 100644 index 0000000000..bd922b44c4 --- /dev/null +++ b/contrib/pg_tracing/sql/nested.sql @@ -0,0 +1,102 @@ +-- Create test function to sample +CREATE OR REPLACE FUNCTION test_function(a int) RETURNS SETOF oid AS +$BODY$ +BEGIN + RETURN QUERY SELECT oid from pg_class where oid = a; +END; +$BODY$ +LANGUAGE plpgsql; + +-- Trace a statement with a function call +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000051-0000000000000051-01'*/ select 
test_function(1); + +-- The test function call will generate the following spans (non exhaustive list): +-- +-----------------------------------------------------------------------------------------------+ +-- | A: Select test_function(1); | +-- +-+--------++----------++----------------++---------------------------------------------------+-+ +-- |B: Parse||C: Planner||D: ExecutorStart||E: ExecutorRun | +-- +--------++----------++----------------+++-------------------------------------------------++ +-- |F: ProjectSet | +-- ++---------+------+------------------------------++ +-- |G: Result| | H: Select a from b where... | +-- +---------+ +----+--------------+----------+ +-- |I: ExecutorRun| +-- +--------------+ + +-- Gather span_id, span start and span end of function call statement +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=81 and name!='Parse' \gset +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_a_id and name='Executor' and resource='Run' \gset +SELECT span_id AS span_f_id, + get_span_start_ns(span_start, span_start_ns) as span_f_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_f_end + from pg_tracing_peek_spans where parent_id=:span_e_id and name='ProjectSet' \gset +SELECT span_id AS span_g_id, + get_span_start_ns(span_start, span_start_ns) as span_g_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_g_end + from pg_tracing_peek_spans where parent_id=:span_f_id and name='Result' \gset +SELECT span_id AS span_h_id, + get_span_start_ns(span_start, span_start_ns) as span_h_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_h_end + from pg_tracing_peek_spans where 
parent_id=:span_f_id and name='Select' \gset +SELECT span_id AS span_i_id, + get_span_start_ns(span_start, span_start_ns) as span_i_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_i_end + from pg_tracing_peek_spans where parent_id=:span_h_id and resource='Run' \gset + +-- Check that spans' start and end are within expection +SELECT :span_a_start < :span_e_start AS top_query_before_run, + :span_a_end >= :span_e_end AS top_end_after_run_end, + + :span_e_start <= :span_f_start AS top_run_start_before_project, + + :span_e_end >= :span_f_end AS top_run_end_after_project_end, + :span_e_end >= :span_h_end AS top_run_end_before_select_end, + :span_e_end >= :span_i_end AS top_run_end_after_nested_run_end, + + :span_i_end <= :span_h_end AS run_end_after_select_end; + +-- Check that the root span is the longest one +WITH max_duration AS (select max(duration) from pg_tracing_peek_spans) +SELECT duration = max_duration.max from pg_tracing_peek_spans, max_duration + where span_id = :span_a_id; + +-- Check that ExecutorRun is attached to the nested top span +SELECT resource from pg_tracing_peek_spans where parent_id=:span_i_id order by resource; + +-- Check tracking with top tracking +set pg_tracing.track = 'top'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000052-0000000000000052-01'*/ select test_function(1); +SELECT count(*) from pg_tracing_consume_spans where trace_id=82; + +-- Check tracking with no tracking +set pg_tracing.track = 'none'; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000053-0000000000000053-01'*/ select test_function(1); +SELECT count(*) from pg_tracing_consume_spans where trace_id=83; + +-- Reset tracking setting +set pg_tracing.track = 'all'; + +-- Create test procedure +CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ +DECLARE + r int; +BEGIN + SELECT (i + i)::int INTO r; +END; $$ LANGUAGE plpgsql; + +-- Test tracking of procedure with utility tracking enabled +set pg_tracing.track_utility=on; 
+/*traceparent='00-00000000000000000000000000000054-0000000000000054-01'*/ CALL sum_one(3); +SELECT resource from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + + +-- Test again with utility tracking disabled +set pg_tracing.track_utility=off; +/*traceparent='00-00000000000000000000000000000055-0000000000000055-01'*/ CALL sum_one(10); +SELECT resource from pg_tracing_consume_spans order by span_start, span_start_ns, resource; diff --git a/contrib/pg_tracing/sql/parallel.sql b/contrib/pg_tracing/sql/parallel.sql new file mode 100644 index 0000000000..0390a20b33 --- /dev/null +++ b/contrib/pg_tracing/sql/parallel.sql @@ -0,0 +1,16 @@ +begin; +-- encourage use of parallel plans +set parallel_setup_cost=0; +set parallel_tuple_cost=0; +set min_parallel_table_scan_size=0; +set max_parallel_workers_per_gather=2; + +-- Trace parallel queries +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ select 1 from pg_class limit 1; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ select 2 from pg_class limit 2; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-00'*/ select 3 from pg_class limit 3; +commit; + +-- Check order and number of spans generated by parallel queries +-- We can't order by span start due to the parallelisation introducing randomness +SELECT name, resource from pg_tracing_consume_spans order by resource; diff --git a/contrib/pg_tracing/sql/reset.sql b/contrib/pg_tracing/sql/reset.sql new file mode 100644 index 0000000000..531db2172d --- /dev/null +++ b/contrib/pg_tracing/sql/reset.sql @@ -0,0 +1,3 @@ +-- Check reset is working +SELECT pg_tracing_reset(); +SELECT traces from pg_tracing_info; diff --git a/contrib/pg_tracing/sql/sample.sql b/contrib/pg_tracing/sql/sample.sql new file mode 100644 index 0000000000..bdfb59ad70 --- /dev/null +++ b/contrib/pg_tracing/sql/sample.sql @@ -0,0 +1,41 @@ +-- Trace nothing 
+SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 0.0; + +-- Query with sampling flag +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000021-0000000000000021-01'*/ SELECT 1; +-- Query without trace context +SELECT 1; + +-- No spans should have been generated +select count(distinct(trace_id)) from pg_tracing_peek_spans; + +-- Enable full sampling +SET pg_tracing.sample_rate = 1.0; + +-- Generate queries with sampling flag on, off and no trace context +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000022-0000000000000022-01'*/ SELECT 1; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000023-0000000000000023-00'*/ SELECT 2; +SELECT 3; +SELECT 4; + +-- Check number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; +-- Check span order +select resource, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; +-- Top spans should reuse generated ids and have trace_id = parent_id = span_id +select resource, parameters from pg_tracing_consume_spans where trace_id = parent_id and parent_id = span_id order by span_start, span_start_ns; + +-- Only trace query with sampled flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; + +-- Generate queries with sampling flag on, off and no trace context +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000024-0000000000000024-01'*/ SELECT 1; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000025-0000000000000025-00'*/ SELECT 2; +SELECT 1; + +-- Check number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; +-- Check span ordering +select resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns; diff --git a/contrib/pg_tracing/sql/select.sql b/contrib/pg_tracing/sql/select.sql new file mode 100644 index 0000000000..9b8d6b5fb0 --- /dev/null +++ b/contrib/pg_tracing/sql/select.sql @@ -0,0 +1,81 @@ 
+-- Only trace queries with sample flag +SET pg_tracing.sample_rate = 0.0; +SET pg_tracing.caller_sample_rate = 1.0; + +-- Run a simple query +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ SELECT 1; + +-- Get top span id +SELECT span_id AS top_span_id from pg_tracing_peek_spans where parent_id=1 and name!='Parse' \gset + +-- Check parameters +SELECT parameters from pg_tracing_peek_spans where span_id=:top_span_id; + +-- Check the number of children +SELECT count(*) from pg_tracing_peek_spans where parent_id=:'top_span_id'; + +-- Check resource and query id +SELECT name, resource, query_id from pg_tracing_peek_spans where trace_id=1 order by span_start, span_start_ns, resource; + +-- Check reported number of trace +SELECT traces from pg_tracing_info; + +-- Trace a statement with function call +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ SELECT count(*) from current_database(); + +-- Check the generated span name, resource and order of function call +SELECT name, resource from pg_tracing_consume_spans where trace_id=3 order by resource; + +-- Trace a more complex query with multiple function calls +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000004-0000000000000004-01'*/ SELECT s.relation_size + s.index_size +FROM (SELECT + pg_relation_size(C.oid) as relation_size, + pg_indexes_size(C.oid) as index_size + FROM pg_class C) as s limit 1; + +-- Check the generated span name, resource and order of query with multiple function calls +SELECT name, resource from pg_tracing_consume_spans where trace_id=4 order by resource; + +-- Check that we're in a correct state after a timeout +set statement_timeout=200; + +-- Trace query triggering a statement timeout +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000007-0000000000000007-01'*/ select * from pg_sleep(10); +-- Trace a working query after the timeout to check we're in a consistent state 
+/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000008-0000000000000008-01'*/ select 1; + +-- Check the spans order and error code +SELECT name, resource, sql_error_code from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + +-- Cleanup statement setting +set statement_timeout=0; + +-- Create a prepare statement with comment passed as first parameter +PREPARE test_prepared (text, integer) AS /*$1*/ SELECT $2; +-- Execute prepare statement with trace context passed as a parameter +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000009-0000000000000009-01''', 1); + +-- Check generated spans and order +SELECT trace_id, name, resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +-- Execute prepare statement with trace context passed as a parameter and generic plan +EXECUTE test_prepared('dddbs=''postgres.db'',traceparent=''00-00000000000000000000000000000010-0000000000000010-01''', 10); + +-- Check spans are generated even through generic plan +SELECT trace_id, resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + +-- Run a statement with node not executed +/*dddbs='postgres.db',traceparent='00-0000000000000000000000000000000a-000000000000000a-01'*/ select 1 limit 0; +-- Not executed node should not generate any spans +SELECT trace_id, resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns, resource; + +-- Test multiple statements in a single query +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000011-0000000000000012-01'*/ select 1; select 2; +-- Check spans generated for each of the multiple statements +select resource, parameters from pg_tracing_consume_spans order by span_start, span_start_ns; + +-- Cleanup +SET plan_cache_mode='auto'; +DEALLOCATE test_prepared; diff
--git a/contrib/pg_tracing/sql/subxact.sql b/contrib/pg_tracing/sql/subxact.sql new file mode 100644 index 0000000000..89082f523b --- /dev/null +++ b/contrib/pg_tracing/sql/subxact.sql @@ -0,0 +1,18 @@ +-- Create test table +CREATE TABLE IF NOT EXISTS pg_tracing_test (a int, b char(20)); + +-- Enable full sampling +SET pg_tracing.sample_rate = 1.0; + +-- Start a transaction with subxaction +BEGIN; +SAVEPOINT s1; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s2; +INSERT INTO pg_tracing_test VALUES(generate_series(1, 2), 'aaa'); +SAVEPOINT s3; +SELECT 1; +COMMIT; + +-- Check that subxact_count is correctly reported +select resource, parameters, subxact_count from pg_tracing_consume_spans order by span_start, span_start_ns; diff --git a/contrib/pg_tracing/sql/utility.sql b/contrib/pg_tracing/sql/utility.sql new file mode 100644 index 0000000000..7415232e52 --- /dev/null +++ b/contrib/pg_tracing/sql/utility.sql @@ -0,0 +1,128 @@ +-- Some utility functions +CREATE OR REPLACE FUNCTION get_span_start_ns(time_start timestamptz, start_ns smallint) RETURNS bigint AS +$BODY$ +BEGIN + RETURN extract(epoch from time_start) * 1000000000 + start_ns; +END; +$BODY$ +LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION get_span_end_ns(time_start timestamptz, start_ns smallint, duration bigint) RETURNS bigint AS +$BODY$ +BEGIN + RETURN get_span_start_ns(time_start, start_ns) + duration; +END; +$BODY$ +LANGUAGE plpgsql; + +-- Create pg_tracing extension with sampling on +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; + +-- This will create the following spans (non exhaustive list): +-- +-- +-------------------------------------------------------------------------------------+ +-- | A: Utility: Create Extension | +-- +-+-----------------------------------------------------------------------------------+ +-- 
+----------------------------------------------------------------------------------+ +-- |B: ProcessUtility: Create Extension | +-- +---+-----------------------------------+---+--------------------------------------+ +-- +-----------------------------------+ +-------------------------------------+ +-- |C: Utility: Create Function1 | |E: Utility: Create Function2 | +-- ++----------------------------------+ ++-----------------------------------++ +-- +----------------------------------+ +-----------------------------------+ +-- |D: ProcessUtility: Create Function| |F: ProcessUtility: Create Function2| +-- +----------------------------------+ +-----------------------------------+ + +-- Extract span_ids, start and end of those spans +SELECT span_id AS span_a_id, + get_span_start_ns(span_start, span_start_ns) as span_a_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_a_end + from pg_tracing_peek_spans where parent_id=1 and name='Utility' \gset + +SELECT span_id AS span_b_id, + get_span_start_ns(span_start, span_start_ns) as span_b_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_b_end + from pg_tracing_peek_spans where parent_id=:span_a_id and name='ProcessUtility' \gset + +SELECT span_id AS span_c_id, + get_span_start_ns(span_start, span_start_ns) as span_c_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_c_end + from pg_tracing_peek_spans where parent_id=:span_b_id and name='Utility' limit 1 \gset + +SELECT span_id AS span_d_id, + get_span_start_ns(span_start, span_start_ns) as span_d_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_d_end + from pg_tracing_peek_spans where parent_id=:span_c_id and name='ProcessUtility' \gset + +SELECT span_id AS span_e_id, + get_span_start_ns(span_start, span_start_ns) as span_e_start, + get_span_end_ns(span_start, span_start_ns, duration) as span_e_end + from pg_tracing_peek_spans where parent_id=:span_b_id and name='Utility' limit 1 offset 1 \gset + 
+-- Check that the start and end of those spans are within expectation +SELECT :span_a_start < :span_b_start AS span_a_start_first, + :span_a_end >= :span_b_end AS span_a_end_last, + + :span_d_end <= :span_c_end AS nested_span_ends_before_parent, + :span_c_end <= :span_e_start AS next_utility_start_after; + +-- Clean current spans +select count(*) from pg_tracing_consume_spans; + +-- +-- Test that no utility is captured with track_utility off +-- + +-- Set utility off +SET pg_tracing.track_utility = off; + +-- Prepare and execute a prepared statement +PREPARE test_prepared_one_param (integer) AS SELECT $1; +EXECUTE test_prepared_one_param(100); + +-- Nothing should be generated +select count(*) from pg_tracing_consume_spans; + +-- Force a query to start from ExecutorRun +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + +-- Again, nothing should be generated +select count(*) from pg_tracing_consume_spans; + +-- +-- Test that utility is captured with track_utility on +-- + +-- Enable utility tracking and track everything +SET pg_tracing.track_utility = on; +SET pg_tracing.sample_rate = 1.0; + +-- Prepare and execute a prepared statement +PREPARE test_prepared_one_param_2 (integer) AS SELECT $1; +EXECUTE test_prepared_one_param_2(100); + +-- Check the number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; +-- Check that the spans are in correct order +select resource, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; +-- Check the top span (standalone top span has trace_id=parent_id=span_id) +select resource, parameters from pg_tracing_consume_spans where trace_id = parent_id and parent_id = span_id order by span_start, span_start_ns; + +-- Test prepared statement with generic plan +SET plan_cache_mode='force_generic_plan'; +EXECUTE test_prepared_one_param(200); + +-- Check the number of generated spans +select count(distinct(trace_id)) from pg_tracing_peek_spans; +-- Check
that the spans are in correct order +select resource, parameters from pg_tracing_peek_spans order by span_start, span_start_ns; +-- Check the top span (standalone top span has trace_id=parent_id=span_id) +select resource, parameters from pg_tracing_consume_spans where trace_id = parent_id and parent_id = span_id order by span_start, span_start_ns; + +-- Second create extension should generate an error that is captured by span +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ CREATE EXTENSION pg_tracing; +select resource, parameters, sql_error_code from pg_tracing_consume_spans order by span_start, span_start_ns; + +-- Cleanup +SET plan_cache_mode='auto'; diff --git a/contrib/pg_tracing/sql/wal.sql b/contrib/pg_tracing/sql/wal.sql new file mode 100644 index 0000000000..c83b287ee2 --- /dev/null +++ b/contrib/pg_tracing/sql/wal.sql @@ -0,0 +1,32 @@ +-- Create test table +CREATE TABLE IF NOT EXISTS pg_tracing_test (a int, b char(20)); + +-- Enable wal instrumentation +set pg_tracing.instrument_wal = true; + +-- Generate queries with wal write +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; + +-- Check WAL is generated for the above statements +SELECT trace_id, name, resource, + wal_records > 0 as wal_records, + wal_bytes > 0 as wal_bytes +FROM pg_tracing_consume_spans order by span_start, span_start_ns, resource; + +-- Redo the same but without wal instrumentation +set pg_tracing.instrument_wal = false; + +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000001-0000000000000001-01'*/ INSERT INTO pg_tracing_test 
VALUES(generate_series(1, 10), 'aaa'); +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000002-0000000000000002-01'*/ UPDATE pg_tracing_test SET b = 'bbb' WHERE a > 7; +/*dddbs='postgres.db',traceparent='00-00000000000000000000000000000003-0000000000000003-01'*/ DELETE FROM pg_tracing_test WHERE a > 9; + +-- With wal instrumentation disabled, wal counters should be set to 0 +SELECT trace_id, name, resource, + wal_records = 0 as wal_records, + wal_bytes = 0 as wal_bytes +FROM pg_tracing_consume_spans order by span_start, span_start_ns, resource; + +-- Cleanup +set pg_tracing.instrument_wal = true; diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index ee78a5749d..1174e2450f 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -70,6 +70,7 @@ InstrStartNode(Instrumentation *instr) if (instr->need_timer && !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); + INSTR_TIME_SET_CURRENT_LAZY(instr->firsttime); /* save buffer usage totals at node entry, if needed */ if (instr->need_bufusage) diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 87e5e2183b..13840be1d7 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -72,6 +72,7 @@ typedef struct Instrumentation bool async_mode; /* true if node is in async mode */ /* Info about current plan cycle: */ bool running; /* true if we've completed first tuple */ + instr_time firsttime; /* start time of this node */ instr_time starttime; /* start time of current iteration of node */ instr_time counter; /* accumulated runtime for this node */ double firsttuple; /* time for first tuple of this cycle */ -- 2.39.3 (Apple Git-145)