Hi all,
We've seen several cases of hanging queries lately which were impossible to cancel (even with pg_terminate_backend). The only fix we've found was to restart the whole database or kill -9 the process, as the hanging query was preventing vacuum to clean up properly. Both of these lead to all clients disconnecting, which is quite a painful measure.
Unfortunately it occurs rarely and we haven't found a way to reproduce it in a dev environment. However, I do have a stack trace of when it occurred once.
The stack trace can be found below. In pg_stat_statements the function would show up as wait_event=DynamicSharedMemoryControlLock and waiting for a LWLock. We're running Postgres 11.2. So far I've just disabled parallel plans altogether, which will hopefully prevent this from happening, but this is of course not an ideal solution.
Any ideas what may cause this?
ful(gdb) bt full
#0 0x00007fd8095b979b in do_futex_wait.constprop () from /lib64/libpthread.so.0
No symbol table info available.
#1 0x00007fd8095b982f in __new_sem_wait_slow.constprop.0 () from /lib64/libpthread.so.0
No symbol table info available.
#2 0x00007fd8095b98cb in sem_wait@@GLIBC_2.2.5 () from /lib64/libpthread.so.0
No symbol table info available.
#3 0x00000000007335f2 in PGSemaphoreLock (sema=0x7fd3e2d313b8) at pg_sema.c:316
errStatus = <optimized out>
__func__ = "PGSemaphoreLock"
#4 0x00000000007bc30e in LWLockAcquire (lock=0x7fd3e2d34100, mode=mode@entry=LW_EXCLUSIVE) at lwlock.c:1243
proc = 0x7fd7fb584e30
result = <optimized out>
extraWaits = 0
__func__ = "LWLockAcquire"
#5 0x00000000007a3396 in dsm_unpin_segment (handle=1178960337) at dsm.c:955
junk_impl_private = 0x0
junk_mapped_address = 0x0
junk_mapped_size = 0
control_slot = <optimized out>
destroy = true
i = <optimized out>
__func__ = "dsm_unpin_segment"
#6 0x000000000096673d in dsa_release_in_place (place=0x7fd80980b6a0, place@entry=0x7fd80980ceac) at dsa.c:618
handle = <optimized out>
control = 0x7fd80980b6a0
i = 16
#7 dsa_on_dsm_detach_release_in_place (segment=segment@entry=0x12c13a0, place=place@entry=140565849093792) at dsa.c:574
No locals.
#8 0x00000000007a2dff in dsm_detach (seg=0x12c13a0) at dsm.c:733
function = 0x9666f0 <dsa_on_dsm_detach_release_in_place>
arg = 140565849093792
#9 0x0000000000500c87 in DestroyParallelContext (pcxt=0x1a55e78) at parallel.c:900
i = <optimized out>
#10 0x0000000000644981 in ExecParallelCleanup (pei=0x1a4c370) at execParallel.c:1154
No locals.
#11 0x0000000000658622 in ExecShutdownGather (node=node@entry=0x1a174d0) at nodeGather.c:406
No locals.
#12 0x00000000006476e0 in ExecShutdownNode (node=0x1a174d0) at execProcnode.c:767
node = 0x1a174d0
#13 0x0000000000694800 in planstate_tree_walker (planstate=planstate@entry=0x1a17310, walker=walker@entry=0x647650 <ExecShutdownNode>, context=context@entry=0x0) at nodeFuncs.c:3739
plan = 0x2135e10
lc = <optimized out>
#14 0x000000000064766d in ExecShutdownNode (node=0x1a17310) at execProcnode.c:749
node = 0x1a17310
#15 0x0000000000694800 in planstate_tree_walker (planstate=planstate@entry=0x1a17150, walker=walker@entry=0x647650 <ExecShutdownNode>, context=context@entry=0x0) at nodeFuncs.c:3739
plan = 0x2135f60
lc = <optimized out>
#16 0x000000000064766d in ExecShutdownNode (node=0x1a17150) at execProcnode.c:749
node = 0x1a17150
#17 0x0000000000694814 in planstate_tree_walker (planstate=planstate@entry=0x1a15f10, walker=walker@entry=0x647650 <ExecShutdownNode>, context=context@entry=0x0) at nodeFuncs.c:3746
plan = 0x2135ff0
lc = <optimized out>
#18 0x000000000064766d in ExecShutdownNode (node=0x1a15f10) at execProcnode.c:749
node = 0x1a15f10
#19 0x0000000000694800 in planstate_tree_walker (planstate=planstate@entry=0x1a15af8, walker=walker@entry=0x647650 <ExecShutdownNode>, context=context@entry=0x0) at nodeFuncs.c:3739
plan = 0x2136e88
lc = <optimized out>
#20 0x000000000064766d in ExecShutdownNode (node=0x1a15af8) at execProcnode.c:749
node = 0x1a15af8
#21 0x00000000006949b7 in planstate_walk_members (context=0x0, walker=0x647650 <ExecShutdownNode>, nplans=<optimized out>, planstates=<optimized out>) at nodeFuncs.c:3838
j = <optimized out>
#22 planstate_tree_walker (planstate=planstate@entry=0x14c49c0, walker=walker@entry=0x647650 <ExecShutdownNode>, context=context@entry=0x0) at nodeFuncs.c:3760
plan = <optimized out>
lc = <optimized out>
#23 0x000000000064766d in ExecShutdownNode (node=node@entry=0x14c49c0) at execProcnode.c:749
node = 0x14c49c0
#24 0x000000000063fe10 in ExecutePlan (execute_once=<optimized out>, dest=0x229cff8, direction=<optimized out>, numberTuples=0, sendTuples=<optimized out>, operation=CMD_SELECT, use_parallel_mode=<optimized out>, planstate=0x14c49c0, estate=0x1376800) at execMain.c:1787
slot = <optimized out>
current_tuple_count = 122
#25 standard_ExecutorRun (queryDesc=0x1390860, direction=<optimized out>, count=0, execute_once=<optimized out>) at execMain.c:364
estate = 0x1376800
---Type <return> to continue, or q <return> to quit---
operation = CMD_SELECT
dest = 0x229cff8
sendTuples = <optimized out>
__func__ = "standard_ExecutorRun"
#26 0x00000000007cf25d in PortalRunSelect (portal=0x131b510, forward=<optimized out>, count=0, dest=<optimized out>) at pquery.c:932
queryDesc = 0x1390860
direction = <optimized out>
nprocessed = <optimized out>
#27 0x00000000007d0dab in PortalRun (portal=portal@entry=0x131b510, count=count@entry=9223372036854775807, isTopLevel=isTopLevel@entry=true, run_once=run_once@entry=true, dest=dest@entry=0x229cff8, altdest=altdest@entry=0x229cff8, completionTag=0x7fffe3f73670 "") at pquery.c:773
save_exception_stack = 0x7fffe3f738c0
save_context_stack = 0x0
local_sigjmp_buf = {{__jmpbuf = {1, -3355592585487811849, 28030728, 36294648, 20034832, 2, -3355592585429091593, 3355548160413825783}, __mask_was_saved = 0, __saved_mask = {__val = {64, 20046392, 0, 0, 0, 1, 20043040, 11614565, 20514640, 32, 32, 140737018017390, 112, 19252272, 28030728, 11064815}}}}
result = <optimized out>
nprocessed = <optimized out>
saveTopTransactionResourceOwner = 0x12b41d8
saveTopTransactionContext = 0x1a55ad0
saveActivePortal = 0x0
saveResourceOwner = 0x12b41d8
savePortalContext = 0x0
saveMemoryContext = 0x1a55ad0
__func__ = "PortalRun"
#28 0x00000000007cbf68 in exec_simple_query (query_string=0x125cd60 "(select COALESCE(''||portfolio::text,'')||COALESCE(';'||underlying::text,'')||COALESCE(';'||expiry::text,'') as key, portfolio, underlying, expiry, sum(skewdelta) as skewdelta, sum(skewgamma) as skewg"...) at postgres.c:1145
parsetree = <optimized out>
portal = 0x131b510
snapshot_set = true
commandTag = <optimized out>
completionTag = "\000 \000\000\000\000\000\000\006\000\000\000\000\000\000\000`\315%\001\000\000\000\000\006\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000|&t", '\000' <repeats 13 times>, "\a\r\225\000\000\000\000"
receiver = 0x229cff8
format = 0
dest = DestRemote
parsetree_item = 0x1abb738
save_log_statement_stats = false
was_logged = <optimized out>
use_implicit_block = false
msec_str = "\000 \000\000\000\000\000\000\006\000\000\000\000\000\000\000`\315%\001\000\000\000\000\006\000\000\000\000\000\000"
__func__ = "exec_simple_query"
#29 0x00000000007ce021 in PostgresMain (argc=<optimized out>, argv=argv@entry=0x12a2a10, dbname=<optimized out>, username=<optimized out>) at postgres.c:4182
query_string = 0x125cd60 ""...
input_message = {data = 0x125cd60 ""..., len = 1841, maxlen = 2048, cursor = 1841}
local_sigjmp_buf = {{__jmpbuf = {3535296317, 3355548667618687735, 19239616, 3, 7612028, 34, -3355592585504589065, 3355548158726404855}, __mask_was_saved = 1, __saved_mask = {__val = {0, 49, 9907367, 19500656, 19252272, 1024, 19540496, 3, 9905661, 0, 140737018018176, 140737018018288, 6811886,
14776960, 9755147, 19257632}}}}
send_ready_for_query = false
disable_idle_in_transaction_timeout = false
__func__ = "PostgresMain"
#30 0x000000000074296b in BackendRun (port=0x1298e70) at postmaster.c:4361
ac = 1
secs = 606794436
usecs = 605778
i = 1
av = 0x12a2a10
maxac = <optimized out>
#31 BackendStartup (port=0x1298e70) at postmaster.c:4033
bn = <optimized out>
pid = <optimized out>
#32 ServerLoop () at postmaster.c:1706
rmask = {fds_bits = {8, 0 <repeats 15 times>}}
selres = <optimized out>
now = <optimized out>
readmask = {fds_bits = {56, 0 <repeats 15 times>}}
last_lockfile_recheck_time = <optimized out>
last_touch_time = <optimized out>
__func__ = "ServerLoop"
#33 0x00000000007458c4 in PostmasterMain (argc=argc@entry=3, argv=argv@entry=0x12571c0) at postmaster.c:1379
opt = <optimized out>
status = <optimized out>
userDoption = <optimized out>
listen_addr_saved = <optimized out>
i = <optimized out>
output_config_variable = <optimized out>
__func__ = "PostmasterMain"
#34 0x00000000004845b6 in main (argc=3, argv=0x12571c0) at main.c:228
-Floris