From ab4110fe6bcff362e492ddafccbf495cad14165c Mon Sep 17 00:00:00 2001 From: Andrey Borodin Date: Wed, 6 May 2026 11:17:27 +0500 Subject: [PATCH vAB2 2/3] Add regression test for ProcKill lock-group procLatch recycle race Two backends form a lock group via prockill_become_lock_group_leader / prockill_become_lock_group_member (a new test module), then are terminated concurrently. Injection points placed in ProcKill() right after LWLockRelease(leader_lwlock) pause both victims there, letting the test verify that a freshly forked backend can claim the recycled PGPROC slot without hitting "latch already owned by PID ..." PANIC. Standard observability (pg_stat_activity, BackendPidGetProc) is unavailable at that point in ProcKill: pgstat_beshutdown_hook and RemoveProcFromArray, both registered as on_shmem_exit callbacks, run before ProcKill. The new prockill_backend_in_injection() helper reads PGPROC->wait_event_info from ProcGlobal->allProcs directly, which remains valid until ProcKill zeroes proc->pid near its end. Injection waits must be attached from a controller session rather than from the victims themselves. injection_points_attach() from the victim would register a before_shmem_exit cleanup that detaches the point before ProcKill runs. The new prockill_attach_injection_wait() calls InjectionPointAttach() directly, leaving no such hook on the victims. The test expects a fixed server and must land together with the matching ProcKill fix. Requires --enable-injection-points. Discussion: https://www.postgresql.org/message-id/d2983796-2603-41b7-a66e-fc8489ddb954@gmail.com Author: Vlad Lesin Reviewed-by: Andrey Borodin --- src/backend/storage/lmgr/proc.c | 7 ++ src/test/modules/Makefile | 2 +- src/test/modules/meson.build | 1 + src/test/modules/prockill_race/Makefile | 29 +++++ src/test/modules/prockill_race/meson.build | 36 ++++++ .../prockill_race/prockill_race--1.0.sql | 27 ++++ .../modules/prockill_race/prockill_race.c | 118 ++++++++++++++++++ .../prockill_race/prockill_race.control | 7 ++ .../t/001_prockill_lockgroup_injection.pl | 101 +++++++++++++++ 9 files changed, 327 insertions(+), 1 deletion(-) create mode 100644 src/test/modules/prockill_race/Makefile create mode 100644 src/test/modules/prockill_race/meson.build create mode 100644 src/test/modules/prockill_race/prockill_race--1.0.sql create mode 100644 src/test/modules/prockill_race/prockill_race.c create mode 100644 src/test/modules/prockill_race/prockill_race.control create mode 100644 src/test/modules/prockill_race/t/001_prockill_lockgroup_injection.pl diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 1ac25068d62..c6b490af186 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -990,6 +990,13 @@ ProcKill(int code, Datum arg) else if (leader != MyProc) MyProc->lockGroupLeader = NULL; LWLockRelease(leader_lwlock); + /* + * Test hooks for src/test/modules/prockill_race. Synchronize + * concurrent ProcKill paths in a lock group; two names are used so + * a controller can attach a PID-scoped "wait" action per name. + */ + INJECTION_POINT("prockill-after-lockgroup-leader", NULL); + INJECTION_POINT("prockill-after-lockgroup-follower", NULL); } /* diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 0a74ab5c86f..ba80651e009 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -59,7 +59,7 @@ SUBDIRS = \ ifeq ($(enable_injection_points),yes) -SUBDIRS += injection_points gin typcache +SUBDIRS += injection_points gin typcache prockill_race else ALWAYS_SUBDIRS += injection_points gin typcache endif diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 4bca42bb370..9e013fd05af 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -13,6 +13,7 @@ subdir('libpq_pipeline') subdir('nbtree') subdir('oauth_validator') subdir('plsample') +subdir('prockill_race') subdir('spgist_name_ops') subdir('ssl_passphrase_callback') subdir('test_aio') diff --git a/src/test/modules/prockill_race/Makefile b/src/test/modules/prockill_race/Makefile new file mode 100644 index 00000000000..eb9694d5195 --- /dev/null +++ b/src/test/modules/prockill_race/Makefile @@ -0,0 +1,29 @@ +# src/test/modules/prockill_race/Makefile + +MODULE_big = prockill_race +OBJS = \ + $(WIN32RES) \ + prockill_race.o +PGFILEDESC = "prockill_race - ProcKill lock-group race test helpers" + +EXTRA_INSTALL=src/test/modules/injection_points +export enable_injection_points +TAP_TESTS = 1 + +EXTENSION = prockill_race +DATA = prockill_race--1.0.sql + +# Disabled because these tests require a temporary installation with +# injection points enabled. +NO_INSTALLCHECK = 1 + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/prockill_race +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/prockill_race/meson.build b/src/test/modules/prockill_race/meson.build new file mode 100644 index 00000000000..fbdd81a6d53 --- /dev/null +++ b/src/test/modules/prockill_race/meson.build @@ -0,0 +1,36 @@ +# Copyright (c) 2025-2026, PostgreSQL Global Development Group + +prockill_race_sources = files( + 'prockill_race.c', +) + +if host_system == 'windows' + prockill_race_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'prockill_race', + '--FILEDESC', 'prockill_race - ProcKill lock-group test helpers',]) +endif + +prockill_race = shared_module('prockill_race', + prockill_race_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += prockill_race + +test_install_data += files( + 'prockill_race.control', + 'prockill_race--1.0.sql', +) + +tests += { + 'name': 'prockill_race', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'env': { + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, + 'tests': [ + 't/001_prockill_lockgroup_injection.pl', + ], + }, +} diff --git a/src/test/modules/prockill_race/prockill_race--1.0.sql b/src/test/modules/prockill_race/prockill_race--1.0.sql new file mode 100644 index 00000000000..b9a43e70c91 --- /dev/null +++ b/src/test/modules/prockill_race/prockill_race--1.0.sql @@ -0,0 +1,27 @@ +/* prockill_race--1.0.sql */ + +\echo Use "CREATE EXTENSION prockill_race" to load this file. \quit + +CREATE FUNCTION prockill_become_lock_group_leader() +RETURNS void +AS 'MODULE_PATHNAME', 'prockill_become_lock_group_leader' +LANGUAGE C STRICT PARALLEL UNSAFE; + +CREATE FUNCTION prockill_become_lock_group_member(leader_pid integer) +RETURNS void +AS 'MODULE_PATHNAME', 'prockill_become_lock_group_member' +LANGUAGE C STRICT PARALLEL UNSAFE; + +CREATE FUNCTION prockill_attach_injection_wait(point_name text, target_pid integer) +RETURNS void +AS 'MODULE_PATHNAME', 'prockill_attach_injection_wait' +LANGUAGE C STRICT PARALLEL UNSAFE; + +-- Test-only probe: is the backend with the given PID currently waiting on the +-- named injection point? Looks directly at ProcGlobal->allProcs so it keeps +-- working while the target is blocked inside ProcKill() (after pgstat and +-- ProcArray teardown have already run). +CREATE FUNCTION prockill_backend_in_injection(target_pid integer, point_name text) +RETURNS boolean +AS 'MODULE_PATHNAME', 'prockill_backend_in_injection' +LANGUAGE C STRICT PARALLEL UNSAFE; diff --git a/src/test/modules/prockill_race/prockill_race.c b/src/test/modules/prockill_race/prockill_race.c new file mode 100644 index 00000000000..338305858f6 --- /dev/null +++ b/src/test/modules/prockill_race/prockill_race.c @@ -0,0 +1,118 @@ +/*------------------------------------------------------------------------- + * + * prockill_race.c + * SQL helpers for the prockill_race TAP test + * + * Exposes lock-group formation without parallel query and helpers that + * observe/control victim backends while they are inside ProcKill(). + * + * Copyright (c) 2025-2026, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/prockill_race/prockill_race.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "miscadmin.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "utils/builtins.h" +#include "utils/injection_point.h" +#include "utils/wait_event.h" + +#include "../injection_points/injection_point_condition.h" + +PG_MODULE_MAGIC; + +/* Read a uint32 field exactly once (matches waitfuncs.c idiom). */ +#define UINT32_ACCESS_ONCE(var) ((uint32) (*((volatile uint32 *) &(var)))) + +PG_FUNCTION_INFO_V1(prockill_become_lock_group_leader); + +Datum +prockill_become_lock_group_leader(PG_FUNCTION_ARGS) +{ + BecomeLockGroupLeader(); + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(prockill_become_lock_group_member); + +Datum +prockill_become_lock_group_member(PG_FUNCTION_ARGS) +{ + int leader_pid = PG_GETARG_INT32(0); + PGPROC *leader; + + leader = BackendPidGetProc(leader_pid); + if (leader == NULL) + elog(ERROR, "backend with PID %d not found", leader_pid); + + if (!BecomeLockGroupMember(leader, leader_pid)) + elog(ERROR, "could not join lock group of backend %d", leader_pid); + + PG_RETURN_VOID(); +} + +/* + * Attach injection_wait to point_name, scoped to target_pid via + * INJ_CONDITION_PID. + * + * Must be called from a controller session that is not one of the victims. + * Using injection_points_attach() from the victim itself would register a + * before_shmem_exit(injection_points_cleanup) hook that fires before ProcKill + * (which is on_shmem_exit) and would detach the point before the victim ever + * reaches it. Calling InjectionPointAttach() directly from a separate + * controller session leaves no such hook on the victims. + */ +PG_FUNCTION_INFO_V1(prockill_attach_injection_wait); + +Datum +prockill_attach_injection_wait(PG_FUNCTION_ARGS) +{ + char *name = text_to_cstring(PG_GETARG_TEXT_PP(0)); + int target_pid = PG_GETARG_INT32(1); + InjectionPointCondition cond = {INJ_CONDITION_PID, target_pid}; + + InjectionPointAttach(name, "injection_points", "injection_wait", + &cond, sizeof(cond)); + PG_RETURN_VOID(); +} + +/* + * Return true if the backend with target_pid is currently waiting at the + * named injection point, by reading PGPROC->wait_event_info directly. + * + * At the injection points in ProcKill(), the standard observability surfaces + * (pg_stat_activity, BackendPidGetProc) are already unavailable: both + * pgstat_beshutdown_hook and RemoveProcFromArray run as on_shmem_exit + * callbacks before ProcKill. The PGPROC slot itself remains intact until + * ProcKill zeroes proc->pid near its end, so a direct allProcs scan works. + */ +PG_FUNCTION_INFO_V1(prockill_backend_in_injection); + +Datum +prockill_backend_in_injection(PG_FUNCTION_ARGS) +{ + int target_pid = PG_GETARG_INT32(0); + char *want_name = text_to_cstring(PG_GETARG_TEXT_PP(1)); + uint32 n = ProcGlobal->allProcCount; + + for (uint32 i = 0; i < n; i++) + { + PGPROC *proc = &ProcGlobal->allProcs[i]; + const char *ev; + + if (proc->pid != target_pid) + continue; + + ev = pgstat_get_wait_event(UINT32_ACCESS_ONCE(proc->wait_event_info)); + PG_RETURN_BOOL(ev != NULL && strcmp(ev, want_name) == 0); + } + + PG_RETURN_BOOL(false); +} diff --git a/src/test/modules/prockill_race/prockill_race.control b/src/test/modules/prockill_race/prockill_race.control new file mode 100644 index 00000000000..3f8cfc8c995 --- /dev/null +++ b/src/test/modules/prockill_race/prockill_race.control @@ -0,0 +1,7 @@ +# prockill_race extension +comment = 'TAP test helpers for ProcKill lock-group / injection point race' +default_version = '1.0' +module_pathname = '$libdir/prockill_race' +relocatable = true +superuser = true +trusted = true diff --git a/src/test/modules/prockill_race/t/001_prockill_lockgroup_injection.pl b/src/test/modules/prockill_race/t/001_prockill_lockgroup_injection.pl new file mode 100644 index 00000000000..bb0779bbb88 --- /dev/null +++ b/src/test/modules/prockill_race/t/001_prockill_lockgroup_injection.pl @@ -0,0 +1,101 @@ +# Copyright (c) 2025-2026, PostgreSQL Global Development Group +# +# Regression test for the ProcKill lock-group vs. procLatch recycle race. +# +# Two backends form a lock group, then are terminated concurrently. The test +# uses injection points placed inside ProcKill() to pause both victims there +# and verify that a freshly forked backend can claim the recycled PGPROC slot +# without hitting "latch already owned by PID ..." PANIC. +# +# Requires --enable-injection-points. + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +if (($ENV{enable_injection_points} // '') ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my $node = PostgreSQL::Test::Cluster->new('prockill_race'); +$node->init; +$node->append_conf('postgresql.conf', + q{shared_preload_libraries = 'injection_points'}); +$node->start; + +$node->safe_psql('postgres', + q{CREATE EXTENSION injection_points; + CREATE EXTENSION prockill_race;}); + +# Form a two-backend lock group. +my $leader = $node->background_psql('postgres'); +my $follower = $node->background_psql('postgres'); + +$leader->query_safe('SELECT prockill_become_lock_group_leader()'); +my $leader_pid = $leader->query_safe('SELECT pg_backend_pid()'); +my $follower_pid = $follower->query_safe('SELECT pg_backend_pid()'); +$leader_pid =~ s/\s+//g; +$follower_pid =~ s/\s+//g; + +$follower->query_safe("SELECT prockill_become_lock_group_member($leader_pid)"); + +# Attach PID-scoped injection waits from a throwaway controller session, not +# from $leader/$follower. injection_points_attach() via the victim would +# register a before_shmem_exit cleanup that fires before ProcKill and detaches +# the point mid-exit. InjectionPointAttach() called directly here leaves no +# such hook on the victims. +$node->safe_psql('postgres', qq{ + SELECT prockill_attach_injection_wait('prockill-after-lockgroup-leader', $leader_pid); + SELECT prockill_attach_injection_wait('prockill-after-lockgroup-follower', $follower_pid); +}); + +# Kill the leader and wait for it to pause inside ProcKill. +# prockill_backend_in_injection() reads PGPROC->wait_event_info directly +# because pg_stat_activity and ProcArray are already torn down before ProcKill. +$node->safe_psql('postgres', "SELECT pg_terminate_backend($leader_pid)"); +my $leader_ok = $node->poll_query_until( + 'postgres', + "SELECT prockill_backend_in_injection($leader_pid, 'prockill-after-lockgroup-leader')" +); + +# Kill the follower and wait for it similarly. Use eval: if the fix has +# regressed, the postmaster may have already PANICed. +eval { + $node->safe_psql('postgres', + "SELECT pg_terminate_backend($follower_pid)"); +}; +my $follower_ok = $node->poll_query_until( + 'postgres', + "SELECT prockill_backend_in_injection($follower_pid, 'prockill-after-lockgroup-follower')" +); + +# Release both victims. +eval { + $node->safe_psql('postgres', + q{SELECT injection_points_wakeup('prockill-after-lockgroup-follower'); + SELECT injection_points_wakeup('prockill-after-lockgroup-leader');}); +}; + +# A new backend must be able to claim the recycled PGPROC slot without PANIC. +my $select_ok = eval { $node->safe_psql('postgres', 'SELECT 1'); 1 }; + +ok($leader_ok && $follower_ok, + 'leader and follower reached ProcKill injection points'); +ok($select_ok, + 'new backend claimed recycled PGPROC without latch-recycle PANIC'); + +eval { + $node->safe_psql('postgres', + q{SELECT injection_points_detach('prockill-after-lockgroup-leader'); + SELECT injection_points_detach('prockill-after-lockgroup-follower');}); +}; + +eval { $leader->quit }; +eval { $follower->quit }; +$node->stop('fast', fail_ok => 1); + +done_testing(); -- 2.50.1 (Apple Git-155)