From 1514e1e759c25e94a70250f0159b609d9e1f5de1 Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Wed, 13 Aug 2025 07:57:07 +0000 Subject: [PATCH v3] Fix DROP SUBSCRIPTION deadlock with new database creation DROP SUBSCRIPTION previously acquired an AccessExclusiveLock on the pg_subscription catalog to prevent the replication launcher from starting a new worker. However, this could cause a deadlock. New database creation also needs to acquire an AccessShareLock on this same catalog during its initialization phase. This created a lock conflict in which DROP SUBSCRIPTION blocked that lock request while simultaneously waiting for a connection to complete in order to drop the replication slot. This commit resolves the deadlock by having DROP SUBSCRIPTION acquire a less restrictive AccessShareLock on the catalog instead. To address the original concern of orphaned workers, a new check is implemented. The replication worker now takes a shared object lock on the subscription itself. If a worker starts for a subscription that no longer exists, it immediately detects this condition and exits. This ensures that no orphan workers are created without the need for an overly broad AccessExclusiveLock on the system catalog. 
--- src/backend/commands/subscriptioncmds.c | 6 +----- src/backend/replication/logical/worker.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 40356e97ed2..ad34e9ed528 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -843,11 +843,7 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) StringInfoData cmd; Form_pg_subscription form; - /* - * Lock pg_subscription with AccessExclusiveLock to ensure that the - * launcher doesn't restart new worker during dropping the subscription - */ - rel = table_open(SubscriptionRelationId, AccessExclusiveLock); + rel = table_open(SubscriptionRelationId, AccessShareLock); tup = SearchSysCache2(SUBSCRIPTIONNAME, MyDatabaseId, CStringGetDatum(stmt->subname)); diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index a0e64301a94..2472f2f1c44 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -2070,6 +2070,12 @@ ApplyWorkerMain(Datum main_arg) StartTransactionCommand(); oldctx = MemoryContextSwitchTo(ApplyContext); + /* + * Lock the subscription to prevent it from being concurrently dropped, + * then re-verify its existence. + */ + LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid, 0, + AccessShareLock); MySubscription = GetSubscription(MyLogicalRepWorker->subid, true); if (!MySubscription) { @@ -2077,9 +2083,18 @@ ApplyWorkerMain(Datum main_arg) (errmsg("logical replication apply worker for subscription %u will not " "start because the subscription was removed during startup", MyLogicalRepWorker->subid))); + + /* + * The shared object lock on subid is automatically released by + * proc_exit(), so no explicit unlock is necessary here. + */ proc_exit(0); } + /* + * The shared object lock on subid will be released at transaction end. 
+ */ + MySubscriptionValid = true; MemoryContextSwitchTo(oldctx); -- 2.51.0.rc0.205.g4a044479a3-goog