From 493fbbf060154fb3e50744aa58e260d44dd20fd0 Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Mon, 18 Aug 2025 12:46:30 +0000 Subject: [PATCH v8] Fix DROP SUBSCRIPTION deadlock with walsender process The DROP SUBSCRIPTION command caused a deadlock with the walsender process because of a lock conflict on the pg_subscription catalog. This happened when the command was trying to drop a replication slot by connecting to a newly created database with uninitialized caches. The DROP SUBSCRIPTION command would first acquire an AccessExclusiveLock on pg_subscription to prevent new workers from starting. However, the connection to the new database needed to acquire an AccessShareLock on the same catalog during its initialization. This created a deadlock where DROP SUBSCRIPTION held its lock and was waiting for the new connection to complete, while the new connection was simultaneously blocked from starting because it couldn't acquire its required lock. This commit resolves the deadlock by having DROP SUBSCRIPTION acquire a less restrictive RowExclusiveLock on the catalog instead. To address the original concern of orphaned workers, a new check is implemented. The replication worker now takes a shared object lock on the subscription itself. If a worker starts for a subscription that no longer exists, it immediately detects this condition and exits. This ensures that no orphan workers are created without the need for an overly broad AccessExclusiveLock on the system catalog. --- src/backend/commands/subscriptioncmds.c | 8 +++--- src/backend/replication/logical/worker.c | 7 ++++++ src/test/subscription/t/100_bugs.pl | 32 +++++++++++++++++++++++- 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index b1a2f3f81a2..c09ddbc7a22 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -1070,10 +1070,12 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) List *rstates; /* - * Lock pg_subscription with AccessExclusiveLock to ensure that the - * launcher doesn't restart new worker during dropping the subscription + * The launcher may concurrently start a new worker for this subscription. + * During initialization, the worker checks for subscription validity and + * exits if the subscription has already been dropped. See + * InitializeLogRepWorker. */ - rel = table_open(SubscriptionRelationId, AccessExclusiveLock); + rel = table_open(SubscriptionRelationId, RowExclusiveLock); tup = SearchSysCache2(SUBSCRIPTIONNAME, MyDatabaseId, CStringGetDatum(stmt->subname)); diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index d1c7d352363..93a7714922e 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -3129,6 +3129,13 @@ ApplyWorkerMain(Datum main_arg) StartTransactionCommand(); oldctx = MemoryContextSwitchTo(ApplyContext); + /* + * Lock the subscription to prevent it from being concurrently dropped, + * then re-verify its existence. After the initialization, the worker will + * be terminated gracefully if the subscription is dropped. + */ + LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid, 0, + AccessShareLock); MySubscription = GetSubscription(MyLogicalRepWorker->subid, true); if (!MySubscription) { diff --git a/src/test/subscription/t/100_bugs.pl b/src/test/subscription/t/100_bugs.pl index cce91891ab9..235227c7727 100644 --- a/src/test/subscription/t/100_bugs.pl +++ b/src/test/subscription/t/100_bugs.pl @@ -6,7 +6,7 @@ use strict; use warnings; use PostgresNode; use TestLib; -use Test::More tests => 9; +use Test::More tests => 11; # Bug #15114 @@ -362,3 +362,33 @@ is( $node_subscriber_d_cols->safe_psql( $node_publisher_d_cols->stop('fast'); $node_subscriber_d_cols->stop('fast'); + +# BUG #18988 +# The bug happened due to a self-deadlock between the DROP SUBSCRIPTION +# command and the walsender process for accessing pg_subscription. This +# occurred when DROP SUBSCRIPTION attempted to remove a replication slot by +# connecting to a newly created database whose caches are not yet +# initialized. +# +# The bug is fixed by reducing the lock-level during DROP SUBSCRIPTION. +$node_publisher->start(); + +$publisher_connstr = $node_publisher->connstr . ' dbname=regress_db'; +$node_publisher->safe_psql( + 'postgres', qq( + CREATE DATABASE regress_db; + CREATE SUBSCRIPTION regress_sub1 CONNECTION '$publisher_connstr' PUBLICATION regress_pub WITH (connect=false); +)); + +my ($ret, $stdout, $stderr) = + $node_publisher->psql('postgres', q{DROP SUBSCRIPTION regress_sub1}); + +isnt($ret, 0, "replication slot does not exist: exit code not 0"); +like( + $stderr, + qr/ERROR: could not drop replication slot "regress_sub1" on publisher/, + "could not drop replication slot: error message"); + +$node_publisher->safe_psql('postgres', "DROP DATABASE regress_db"); + +$node_publisher->stop('fast'); -- 2.51.0.rc1.167.g924127e9c0-goog