From f6bcafdd1587c21ce0d3a04eee624a46f1137b25 Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Wed, 13 Aug 2025 08:15:05 +0000 Subject: [PATCH v8] Fix DROP SUBSCRIPTION deadlock with walsender process The DROP SUBSCRIPTION command caused a deadlock with the walsender process because of a lock conflict on the pg_subscription catalog. This happened when the command was trying to drop a replication slot by connecting to a newly created database with uninitialized caches. The DROP SUBSCRIPTION command would first acquire an AccessExclusiveLock on pg_subscription to prevent new workers from starting. However, the connection to the new database needed to acquire an AccessShareLock on the same catalog during its initialization. This created a deadlock where DROP SUBSCRIPTION held its lock and was waiting for the new connection to complete, while the new connection was simultaneously blocked from starting because it couldn't acquire its required lock. This commit resolves the deadlock by having DROP SUBSCRIPTION acquire a less restrictive RowExclusiveLock on the catalog instead. To address the original concern of orphaned workers, a new check is implemented. The replication worker now takes a shared object lock on the subscription itself. If a worker starts for a subscription that no longer exists, it immediately detects this condition and exits. This ensures that no orphan workers are created without the need for an overly broad AccessExclusiveLock on the system catalog. --- src/backend/commands/subscriptioncmds.c | 8 ++++--- src/backend/replication/logical/worker.c | 7 ++++++ src/test/subscription/t/100_bugs.pl | 30 ++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index faa3650d287..4c01d21b2f3 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -1803,10 +1803,12 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) bool must_use_password; /* - * Lock pg_subscription with AccessExclusiveLock to ensure that the - * launcher doesn't restart new worker during dropping the subscription + * The launcher may concurrently start a new worker for this subscription. + * During initialization, the worker checks for subscription validity and + * exits if the subscription has already been dropped. See + * InitializeLogRepWorker. */ - rel = table_open(SubscriptionRelationId, AccessExclusiveLock); + rel = table_open(SubscriptionRelationId, RowExclusiveLock); tup = SearchSysCache2(SUBSCRIPTIONNAME, ObjectIdGetDatum(MyDatabaseId), CStringGetDatum(stmt->subname)); diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 8e343873454..22ad9051db3 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -5415,6 +5415,13 @@ InitializeLogRepWorker(void) StartTransactionCommand(); oldctx = MemoryContextSwitchTo(ApplyContext); + /* + * Lock the subscription to prevent it from being concurrently dropped, + * then re-verify its existence. After the initialization, the worker will + * be terminated gracefully if the subscription is dropped. + */ + LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid, 0, + AccessShareLock); MySubscription = GetSubscription(MyLogicalRepWorker->subid, true); if (!MySubscription) { diff --git a/src/test/subscription/t/100_bugs.pl b/src/test/subscription/t/100_bugs.pl index 5e357701183..50223054918 100644 --- a/src/test/subscription/t/100_bugs.pl +++ b/src/test/subscription/t/100_bugs.pl @@ -575,4 +575,34 @@ is($result, 't', $node_publisher->stop('fast'); $node_subscriber->stop('fast'); +# BUG #18988 +# The bug happened due to a self-deadlock between the DROP SUBSCRIPTION +# command and the walsender process for accessing pg_subscription. This +# occurred when DROP SUBSCRIPTION attempted to remove a replication slot by +# connecting to a newly created database whose caches are not yet +# initialized. +# +# The bug is fixed by reducing the lock-level during DROP SUBSCRIPTION. +$node_publisher->start(); + +$publisher_connstr = $node_publisher->connstr . ' dbname=regress_db'; +$node_publisher->safe_psql( + 'postgres', qq( + CREATE DATABASE regress_db; + CREATE SUBSCRIPTION regress_sub1 CONNECTION '$publisher_connstr' PUBLICATION regress_pub WITH (connect=false); +)); + +my ($ret, $stdout, $stderr) = + $node_publisher->psql('postgres', q{DROP SUBSCRIPTION regress_sub1}); + +isnt($ret, 0, "replication slot does not exist: exit code not 0"); +like( + $stderr, + qr/ERROR: could not drop replication slot "regress_sub1" on publisher/, + "could not drop replication slot: error message"); + +$node_publisher->safe_psql('postgres', "DROP DATABASE regress_db"); + +$node_publisher->stop('fast'); + done_testing(); -- 2.51.0.rc1.167.g924127e9c0-goog