Re: LWLock deadlock and gdb advice - Mailing list pgsql-hackers
From | Jeff Janes |
---|---|
Subject | Re: LWLock deadlock and gdb advice |
Date | |
Msg-id | CAMkU=1ySU19ravj--u9AizSPqSCnFZb8wtLJhQXJpqZ_dsNBgA@mail.gmail.com Whole thread Raw |
In response to | Re: LWLock deadlock and gdb advice (Jeff Janes <jeff.janes@gmail.com>) |
Responses |
Re: LWLock deadlock and gdb advice
Re: LWLock deadlock and gdb advice |
List | pgsql-hackers |
On Mon, Jun 29, 2015 at 11:28 PM, Jeff Janes <jeff.janes@gmail.com> wrote:
On Mon, Jun 29, 2015 at 5:55 PM, Peter Geoghegan <pg@heroku.com> wrote:On Mon, Jun 29, 2015 at 5:37 PM, Jeff Janes <jeff.janes@gmail.com> wrote:
> Is there a way to use gdb to figure out who holds the lock they are waiting
> for?
Have you considered building with LWLOCK_STATS defined, and LOCK_DEBUG
defined? That might do it.I hadn't planned on running into this problem, so had not compiled accordingly.I thought LOCK_DEBUG would produce too much output, but now I see it doesn't print anything unless Trace_lwlocks is on (but it still makes more info available to gdb, as Amit mentioned), so I think that should be OK.Since I messed up the gdb session causing the postmaster to SIGKILL all the children and destroy the evidence, I'll repeat the run compiled with LOCK_DEBUG and see what it looks like in the morning.
I've gotten the LWLock deadlock again. User backend 24841 holds the WALInsertLocks 7 and is blocked attempting to acquire 6 . So it seems to be violating the lock ordering rules (although I don't see that rule spelled out in xlog.c)
The Checkpoint process, meanwhile, has acquired WALInsertLocks 0 through 6 and is blocked on 7.
I'm not sure where to go from here.
24841:
(gdb) f 2
(gdb) p held_lwlocks
$1 = {{lock = 0x7f70e11fae40, mode = LW_EXCLUSIVE}, {lock = 0x7f70e13df080, mode = LW_EXCLUSIVE}, {lock = 0x7f70e11d4280, mode = LW_EXCLUSIVE}, {lock = 0x7f70e11d4280,
mode = LW_EXCLUSIVE}, {lock = 0x0, mode = LW_EXCLUSIVE} <repeats 196 times>}
(gdb) p T_NAME((LWLock *) held_lwlocks[1])
$2 = 0x84cee1 "WALInsertLocks"
(gdb) p T_ID((LWLock *) held_lwlocks[1])
(gdb) p lock
$3 = (LWLock *) 0x7f70e13df000
(gdb) p T_NAME((LWLock *) lock)
$7 = 0x84cee1 "WALInsertLocks"
(gdb) p T_ID((LWLock *) lock)
$8 = 6
(gdb) p *(lock)
$1 = {mutex = 0 '\000', tranche = 1, state = {value = 1627389952}, nwaiters = {value = 2}, waiters = {head = {prev = 0x7f70e9e29e58, next = 0x7f70e9e2a140}},
owner = 0x7f70e9e2d260}
(gdb) p *(lock->owner)
$5 = {links = {prev = 0x0, next = 0x0}, sem = {semId = 539754537, semNum = 0}, waitStatus = 0, procLatch = {is_set = 0, is_shared = 1 '\001', owner_pid = 24820}, lxid = 0,
pid = 24820, pgprocno = 112, backendId = -1, databaseId = 0, roleId = 0, recoveryConflictPending = 0 '\000', lwWaiting = 1 '\001', lwWaitMode = 0 '\000', lwWaitLink = {
prev = 0x7f70e13df090, next = 0x7f70e13df090}, waitLock = 0x0, waitProcLock = 0x0, waitLockMode = 0, heldLocks = 0, waitLSN = 0, syncRepState = 0, syncRepLinks = {prev = 0x0,
next = 0x0}, myProcLocks = {{prev = 0x7f70e9e2d2f0, next = 0x7f70e9e2d2f0}, {prev = 0x7f70e9e2d300, next = 0x7f70e9e2d300}, {prev = 0x7f70e9e2d310, next = 0x7f70e9e2d310}, {
prev = 0x7f70e9e2d320, next = 0x7f70e9e2d320}, {prev = 0x7f70e9e2d330, next = 0x7f70e9e2d330}, {prev = 0x7f70e9e2d340, next = 0x7f70e9e2d340}, {prev = 0x7f70e9e2d350,
next = 0x7f70e9e2d350}, {prev = 0x7f70e9e2d360, next = 0x7f70e9e2d360}, {prev = 0x7f70e9e2d370, next = 0x7f70e9e2d370}, {prev = 0x7f70e9e2d380, next = 0x7f70e9e2d380}, {
prev = 0x7f70e9e2d390, next = 0x7f70e9e2d390}, {prev = 0x7f70e9e2d3a0, next = 0x7f70e9e2d3a0}, {prev = 0x7f70e9e2d3b0, next = 0x7f70e9e2d3b0}, {prev = 0x7f70e9e2d3c0,
next = 0x7f70e9e2d3c0}, {prev = 0x7f70e9e2d3d0, next = 0x7f70e9e2d3d0}, {prev = 0x7f70e9e2d3e0, next = 0x7f70e9e2d3e0}}, subxids = {xids = {0 <repeats 64 times>}},
backendLock = 0x7f70e13dad00, fpLockBits = 0, fpRelId = {0 <repeats 16 times>}, fpVXIDLock = 0 '\000', fpLocalTransactionId = 0}
#0 0x0000003dcb6eaf27 in semop () from /lib64/libc.so.6
#1 0x0000000000671aef in PGSemaphoreLock (sema=0x7f70e9e2a108) at pg_sema.c:387
#2 0x00000000006d79ac in LWLockWaitForVar (lock=0x7f70e13df000, valptr=0x7f70e13df028, oldval=0, newval=0x7fffb33cbd48) at lwlock.c:1406
#3 0x00000000004ee006 in WaitXLogInsertionsToFinish (upto=41590734848) at xlog.c:1576
#4 0x00000000004ee94b in AdvanceXLInsertBuffer (upto=41594920960, opportunistic=<value optimized out>) at xlog.c:1888
#5 0x00000000004f3c42 in GetXLogBuffer (ptr=41594920960) at xlog.c:1669
#6 0x00000000004f40e7 in CopyXLogRecordToWAL (rdata=0x19bc920, fpw_lsn=<value optimized out>) at xlog.c:1313
#7 XLogInsertRecord (rdata=0x19bc920, fpw_lsn=<value optimized out>) at xlog.c:1008
#8 0x00000000004f7c6c in XLogInsert (rmid=13 '\r', info=32 ' ') at xloginsert.c:453
#9 0x000000000047e210 in ginPlaceToPage (btree=0x7fffb33cc070, stack=0x1a5bfe0, insertdata=<value optimized out>, updateblkno=<value optimized out>, childbuf=0, buildStats=0x0)
at ginbtree.c:418
#10 0x000000000047f50d in ginInsertValue (btree=0x7fffb33cc070, stack=0x1a5bfe0, insertdata=0x7fffb33cc0f0, buildStats=0x0) at ginbtree.c:748
#11 0x0000000000475deb in ginEntryInsert (ginstate=0x7fffb33cc470, attnum=5544, key=1, category=<value optimized out>, items=0x7f70e10d9140, nitem=51, buildStats=0x0)
at gininsert.c:234
#12 0x000000000048602c in ginInsertCleanup (ginstate=0x7fffb33cc470, vac_delay=<value optimized out>, stats=0x0) at ginfast.c:843
#13 0x00000000004871b9 in ginHeapTupleFastInsert (ginstate=0x7fffb33cc470, collector=<value optimized out>) at ginfast.c:436
#14 0x000000000047625a in gininsert (fcinfo=<value optimized out>) at gininsert.c:531
#15 0x00000000007dc483 in FunctionCall6Coll (flinfo=<value optimized out>, collation=<value optimized out>, arg1=<value optimized out>, arg2=<value optimized out>,
arg3=<value optimized out>, arg4=<value optimized out>, arg5=140122789534360, arg6=0) at fmgr.c:1436
#16 0x00000000004b2247 in index_insert (indexRelation=0x7f70e1190e60, values=0x7fffb33cef50, isnull=0x7fffb33cf050 "", heap_t_ctid=0x1a4a794, heapRelation=0x7f70e1185a98,
checkUnique=UNIQUE_CHECK_NO) at indexam.c:226
#17 0x00000000005d2e67 in ExecInsertIndexTuples (slot=0x1a497f0, tupleid=0x1a4a794, estate=0x1a40768, noDupErr=0 '\000', specConflict=0x0, arbiterIndexes=0x0)
at execIndexing.c:384
#18 0x00000000005f0161 in ExecUpdate (tupleid=0x7fffb33cf250, oldtuple=0x0, slot=0x1a497f0, planSlot=0x1a42498, epqstate=0x1a40a70, estate=0x1a40768, canSetTag=1 '\001')
at nodeModifyTable.c:978
#19 0x00000000005f0b2a in ExecModifyTable (node=0x1a409d0) at nodeModifyTable.c:1436
#20 0x00000000005d6cc8 in ExecProcNode (node=0x1a409d0) at execProcnode.c:389
#21 0x00000000005d5402 in ExecutePlan (queryDesc=0x1a2b220, direction=<value optimized out>, count=0) at execMain.c:1549
#22 standard_ExecutorRun (queryDesc=0x1a2b220, direction=<value optimized out>, count=0) at execMain.c:337
#23 0x00007f70ea39af3b in pgss_ExecutorRun (queryDesc=0x1a2b220, direction=ForwardScanDirection, count=0) at pg_stat_statements.c:881
#24 0x00000000006ea38f in ProcessQuery (plan=0x1a2b3d8, sourceText=0x1a2af68 "update foo set count=count+1 where text_array @> $1", params=0x1a2afe8, dest=<value optimized out>,
completionTag=0x7fffb33cf660 "") at pquery.c:185
#25 0x00000000006ea5ec in PortalRunMulti (portal=0x19a6128, isTopLevel=1 '\001', dest=0xc55020, altdest=0xc55020, completionTag=0x7fffb33cf660 "") at pquery.c:1279
#26 0x00000000006eadb3 in PortalRun (portal=0x19a6128, count=9223372036854775807, isTopLevel=1 '\001', dest=0x1990630, altdest=0x1990630, completionTag=0x7fffb33cf660 "")
at pquery.c:816
#27 0x00000000006e6ffb in exec_execute_message (portal_name=0x1990218 "", max_rows=9223372036854775807) at postgres.c:1988
#28 0x00000000006e8c15 in PostgresMain (argc=<value optimized out>, argv=<value optimized out>, dbname=0x19a3210 "jjanes", username=<value optimized out>) at postgres.c:4088
#29 0x00000000006855dd in BackendRun (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:4159
#30 BackendStartup (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:3835
#31 ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1609
#32 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1254
#33 0x0000000000610ab0 in main (argc=4, argv=0x1976cf0) at main.c:221
24820:
(gdb) p held_lwlocks
$1 = {{lock = 0x7f70e11d4340, mode = LW_EXCLUSIVE}, {lock = 0x7f70e13ded00, mode = LW_EXCLUSIVE}, {lock = 0x7f70e13ded80, mode = LW_EXCLUSIVE}, {lock = 0x7f70e13dee00,
mode = LW_EXCLUSIVE}, {lock = 0x7f70e13dee80, mode = LW_EXCLUSIVE}, {lock = 0x7f70e13def00, mode = LW_EXCLUSIVE}, {lock = 0x7f70e13def80, mode = LW_EXCLUSIVE}, {
lock = 0x7f70e13df000, mode = LW_EXCLUSIVE}, {lock = 0x0, mode = LW_EXCLUSIVE} <repeats 192 times>}
(gdb) p lock
$3 = (LWLock *) 0x7f70e13df080
(gdb) p *lock
$4 = {mutex = 0 '\000', tranche = 1, state = {value = 1627389952}, nwaiters = {value = 1}, waiters = {head = {prev = 0x7f70e9e2d2a8, next = 0x7f70e9e2d2a8}},
owner = 0x7f70e9e2a0f8}
(gdb) p *(lock->owner)
$5 = {links = {prev = 0x0, next = 0x0}, sem = {semId = 539688999, semNum = 15}, waitStatus = 0, procLatch = {is_set = 1, is_shared = 1 '\001', owner_pid = 24841}, lxid = 5523,
pid = 24841, pgprocno = 95, backendId = 9, databaseId = 16384, roleId = 10, recoveryConflictPending = 0 '\000', lwWaiting = 1 '\001', lwWaitMode = 2 '\002', lwWaitLink = {
prev = 0x7f70e13df010, next = 0x7f70e9e29e58}, waitLock = 0x0, waitProcLock = 0x0, waitLockMode = 0, heldLocks = 0, waitLSN = 0, syncRepState = 0, syncRepLinks = {prev = 0x0,
next = 0x0}, myProcLocks = {{prev = 0x7f70e9e2a188, next = 0x7f70e9e2a188}, {prev = 0x7f70e9e2a198, next = 0x7f70e9e2a198}, {prev = 0x7f70e9e2a1a8, next = 0x7f70e9e2a1a8}, {
prev = 0x7f70e9bfe060, next = 0x7f70e9bfe060}, {prev = 0x7f70e9e2a1c8, next = 0x7f70e9e2a1c8}, {prev = 0x7f70e9e2a1d8, next = 0x7f70e9e2a1d8}, {prev = 0x7f70e9e2a1e8,
next = 0x7f70e9e2a1e8}, {prev = 0x7f70e9e2a1f8, next = 0x7f70e9e2a1f8}, {prev = 0x7f70e9e2a208, next = 0x7f70e9e2a208}, {prev = 0x7f70e9e2a218, next = 0x7f70e9e2a218}, {
prev = 0x7f70e9e2a228, next = 0x7f70e9e2a228}, {prev = 0x7f70e9e2a238, next = 0x7f70e9e2a238}, {prev = 0x7f70e9e2a248, next = 0x7f70e9e2a248}, {prev = 0x7f70e9e2a258,
next = 0x7f70e9e2a258}, {prev = 0x7f70e9e2a268, next = 0x7f70e9e2a268}, {prev = 0x7f70e9e2a278, next = 0x7f70e9e2a278}}, subxids = {xids = {0 <repeats 64 times>}},
backendLock = 0x7f70e13da8c0, fpLockBits = 160528697655296, fpRelId = {0, 0, 0, 0, 0, 0, 0, 0, 0, 2679, 2610, 16414, 16412, 16416, 16415, 16409}, fpVXIDLock = 1 '\001',
fpLocalTransactionId = 5523}
(gdb) bt
#0 0x0000003dcb6eaf27 in semop () from /lib64/libc.so.6
#1 0x0000000000671aef in PGSemaphoreLock (sema=0x7f70e9e2d270) at pg_sema.c:387
#2 0x00000000006d70ac in LWLockAcquireCommon (lock=0x7f70e13df080, mode=LW_EXCLUSIVE, valptr=0x7f70e13df0a8, val=0) at lwlock.c:1042
#3 0x00000000004e6dab in WALInsertLockAcquireExclusive () at xlog.c:1449
#4 0x00000000004ef025 in CreateCheckPoint (flags=64) at xlog.c:8177
#5 0x00000000006789d3 in CheckpointerMain () at checkpointer.c:493
#6 0x00000000004fecec in AuxiliaryProcessMain (argc=2, argv=0x7fffb33cf1d0) at bootstrap.c:428
#7 0x0000000000680697 in StartChildProcess (type=CheckpointerProcess) at postmaster.c:5143
#8 0x0000000000682b7a in reaper (postgres_signal_arg=<value optimized out>) at postmaster.c:2656
#9 <signal handler called>
#10 0x0000003dcb6e1353 in __select_nocancel () from /lib64/libc.so.6
#11 0x00000000006849bf in ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1573
#12 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1254
#13 0x0000000000610ab0 in main (argc=4, argv=0x1976cf0) at main.c:221
Thanks,Jeff
pgsql-hackers by date: