From 0ec0beb294f4c5ed35dbb35260f53b069563638f Mon Sep 17 00:00:00 2001 From: Craig Ringer Date: Thu, 19 Nov 2020 14:24:55 +0800 Subject: [PATCH] LWLock self-deadlock detection On cassert builds, if we miss the LWLock acquire fast-path and have to wait for a LWLock, check to make sure that the acquiring backend doesn't already hold the lock. This detects conditions that would otherwise cause a backend to deadlock indefinitely with interrupts disabled. A backend in this state won't respond to SIGTERM or SIGQUIT and will prevent postgres from finishing shutdown. While a LWLock self-deadlock is always the result of a programming mistake, it's not always easy to forsee all possible locking states especially when dealing with exception handling and with exit callbacks. --- src/backend/storage/lmgr/lwlock.c | 87 +++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 2fa90cc095..a2d6dec557 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -313,6 +313,85 @@ LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg) #define LOG_LWDEBUG(a,b,c) ((void)0) #endif /* LOCK_DEBUG */ +#if defined(USE_ASSERT_CHECKING) || defined(LOCK_DEBUG) +#ifndef LWLOCK_SELF_DEADLOCK_DETECTION +#define LWLOCK_SELF_DEADLOCK_DETECTION +#endif +#endif + +#ifdef LWLOCK_SELF_DEADLOCK_DETECTION +static void +LWLockReportSelfDeadlock(LWLock *lock, LWLockMode mode, int held_lockno) +{ + StringInfoData locklist; + int i; + + LOG_LWDEBUG("LWLockReportSelfDeadlock", lock, "acquire self-deadlock detected"); + + initStringInfo(&locklist); + for (i=0; i < num_held_lwlocks; i++) { + appendStringInfo(&locklist, "%s (%p) excl=%d", + T_NAME(held_lwlocks[i].lock), + held_lwlocks[i].lock, + held_lwlocks[i].mode == LW_EXCLUSIVE); + if (i > 0) + appendStringInfoString(&locklist, ", "); + } + + + /* + * It might be safe to bail out here on a non-cassert build but + * more care is needed before anything like that is enabled. + * LWLockAcquire() doesn't know if it's safe to `elog(FATAL)` + * out from the current callsite. So we PANIC. This introduces some + * risk of buggy code causing a server crash/restart cycle where + * it would've previously just deadlocked a single backend, but + * that's part of why we only enable this for assert builds. + */ + ereport(PANIC, + (errmsg_internal("self-deadlock detected on acquire of lwlock %s (%p) for mode %s: lock already held by this backend in mode %s", + T_NAME(lock), lock, + mode == LW_EXCLUSIVE + ? "LW_EXCLUSIVE" : "LW_SHARED", + held_lwlocks[held_lockno].mode == LW_EXCLUSIVE + ? "LW_EXCLUSIVE" : "LW_SHARED"), + errdetail("backend already holds %d lwlocks: %s", + num_held_lwlocks, locklist.data))); +} + +/* + * A bug in code that uses this particular LWLock could cause + * the lock-holder could be ourselves, in which case we'll + * self-deadlock forever as an unkillable process. + * + * Call this in any LWLock acquire retry loop once the fast-path + * acquire has failed. + * + * held_lockno should be initialized to 0 outside the wait loop + * so that this check only runs on the first iteration of any + * wait/retry loop. + */ +inline static void +LWLockCheckSelfDeadlock(LWLock *lock, LWLockMode mode, int * const held_lockno) +{ + for ( ; *held_lockno < num_held_lwlocks; (*held_lockno)++) { + if (unlikely(held_lwlocks[*held_lockno].lock == lock)) { + LWLockReportSelfDeadlock(lock, mode, *held_lockno); + } + } +} +#else +/* + * Making LWLockCheckSelfDeadlock an empty static inline instead of a macro + * silences compiler whinging about held_lockno being unused while optimising + * away without fuss. + */ +inline static void +LWLockCheckSelfDeadlock(LWLock *lock, LWLockMode mode, int * const held_lockno) +{ +} +#endif + #ifdef LWLOCK_STATS static void init_lwlock_stats(void); @@ -1210,6 +1289,7 @@ LWLockAcquire(LWLock *lock, LWLockMode mode) PGPROC *proc = MyProc; bool result = true; int extraWaits = 0; + int held_lockno = 0; /* for self-deadlock detection */ #ifdef LWLOCK_STATS lwlock_stats *lwstats; @@ -1322,6 +1402,9 @@ LWLockAcquire(LWLock *lock, LWLockMode mode) lwstats->block_count++; #endif + /* Detect if we're trying to acquire our own lock */ + LWLockCheckSelfDeadlock(lock, mode, &held_lockno); + LWLockReportWaitStart(lock); TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode); @@ -1621,6 +1704,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) PGPROC *proc = MyProc; int extraWaits = 0; bool result = false; + int held_lockno = 0; /* for self-deadlock detection */ #ifdef LWLOCK_STATS lwlock_stats *lwstats; @@ -1699,6 +1783,9 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) lwstats->block_count++; #endif + /* Detect if we're trying to wait on our own lock */ + LWLockCheckSelfDeadlock(lock, LW_EXCLUSIVE, &held_lockno); + LWLockReportWaitStart(lock); TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE); -- 2.26.2