pg15b4: FailedAssertion("TransactionIdIsValid(xmax) - Mailing list pgsql-hackers

From Justin Pryzby
Subject pg15b4: FailedAssertion("TransactionIdIsValid(xmax)
Date
Msg-id 20220910020637.GR31833@telsasoft.com
Whole thread Raw
Responses Re: pg15b4: FailedAssertion("TransactionIdIsValid(xmax)
Re: pg15b4: FailedAssertion("TransactionIdIsValid(xmax)
List pgsql-hackers
The sequence of events leading up to this:

0) Yesterday I upgraded an internal VM to pg15b4 using PGDG RPMs;
   It's the same VM that hit the prefetch_recovery bug which was fixed by
   adb466150.  I don't think that should've left it in a weird state
   (since recovery was sucessful when prefetch was disabled, and the
   cluser worked fine until now).
1) This evening, I started running sqlsmith connected to the postgres DB
   that has some logging tables in it;
2) There was a lot of swapping, and the backend was finally killed due
   to OOM when sqlsmith tried to run a terrible query involving
   database_to_xml();
3) lots of backends started crashing with SIGABRT;
4) I was simultaneously compiling pg14b4 to run with with
   -DRELCACHE_FORCE_RELEASE and installing it into /usr/local.  I don't *think*
   running libraries would've been overwritten, and that shouldn't have
   affected the running instance anyway...

I got a lot (100+) of SIGABRT.  I suspect all the backtraces are the same.  My
hypothesis is that the OOM crash caused bad xmax to be written (or rather,
recovery didn't cause it to be written correctly?).  We may have hit a race
condition due to heavy swapping.

(gdb) bt
#0  0x00007fb8a22f31f7 in raise () from /lib64/libc.so.6
#1  0x00007fb8a22f48e8 in abort () from /lib64/libc.so.6
#2  0x000000000098f9be in ExceptionalCondition (conditionName=conditionName@entry=0x9fada4
"TransactionIdIsValid(xmax)",errorType=errorType@entry=0x9ed217 "FailedAssertion", 
 
    fileName=fileName@entry=0x9fad90 "heapam_visibility.c", lineNumber=lineNumber@entry=1353) at assert.c:69
#3  0x00000000004fd4d7 in HeapTupleSatisfiesVacuumHorizon (htup=htup@entry=0x7ffc225a87e0, buffer=buffer@entry=5100,
dead_after=dead_after@entry=0x7ffc225a87d0)at heapam_visibility.c:1353
 
#4  0x0000000000501702 in heap_prune_satisfies_vacuum (buffer=5100, tup=0x7ffc225a87e0, prstate=0x7ffc225a8a50) at
pruneheap.c:504
#5  heap_page_prune (relation=relation@entry=0x7fb8a50c3438, buffer=buffer@entry=5100, vistest=vistest@entry=0xec7890
<GlobalVisDataRels>,old_snap_xmin=<optimized out>, old_snap_ts=<optimized out>, 
 
    nnewlpdead=nnewlpdead@entry=0x7ffc225a964c, off_loc=off_loc@entry=0x0) at pruneheap.c:351
#6  0x0000000000502326 in heap_page_prune_opt (relation=0x7fb8a50c3438, buffer=buffer@entry=5100) at pruneheap.c:209
#7  0x00000000004f3ae3 in heapgetpage (sscan=sscan@entry=0x199b1d0, page=page@entry=2892) at heapam.c:415
#8  0x00000000004f44c2 in heapgettup_pagemode (scan=scan@entry=0x199b1d0, dir=<optimized out>, nkeys=0, key=0x0) at
heapam.c:1120
#9  0x00000000004f5abe in heap_getnextslot (sscan=0x199b1d0, direction=<optimized out>, slot=0x1967be8) at
heapam.c:1352
#10 0x00000000006de16b in table_scan_getnextslot (slot=0x1967be8, direction=ForwardScanDirection, sscan=<optimized
out>)at ../../../src/include/access/tableam.h:1046
 
#11 SeqNext (node=node@entry=0x1967a38) at nodeSeqscan.c:80
#12 0x00000000006b109a in ExecScanFetch (recheckMtd=0x6de0f0 <SeqRecheck>, accessMtd=0x6de100 <SeqNext>,
node=0x1967a38)at execScan.c:133
 
#13 ExecScan (node=0x1967a38, accessMtd=0x6de100 <SeqNext>, recheckMtd=0x6de0f0 <SeqRecheck>) at execScan.c:199
#14 0x00000000006add88 in ExecProcNodeInstr (node=0x1967a38) at execProcnode.c:479
#15 0x00000000006a6182 in ExecProcNode (node=0x1967a38) at ../../../src/include/executor/executor.h:259
#16 ExecutePlan (execute_once=<optimized out>, dest=0x1988448, direction=<optimized out>, numberTuples=0,
sendTuples=true,operation=CMD_SELECT, use_parallel_mode=<optimized out>, planstate=0x1967a38, estate=0x19677a0)
 
    at execMain.c:1636
#17 standard_ExecutorRun (queryDesc=0x1996e80, direction=<optimized out>, count=0, execute_once=<optimized out>) at
execMain.c:363
#18 0x00007fb8960913bd in pgss_ExecutorRun (queryDesc=0x1996e80, direction=ForwardScanDirection, count=0,
execute_once=<optimizedout>) at pg_stat_statements.c:1010
 
#19 0x00007fb895c6f781 in explain_ExecutorRun (queryDesc=0x1996e80, direction=ForwardScanDirection, count=0,
execute_once=<optimizedout>) at auto_explain.c:320
 
#20 0x000000000084976e in PortalRunSelect (portal=portal@entry=0x18fed30, forward=forward@entry=true, count=0,
count@entry=9223372036854775807,dest=dest@entry=0x1988448) at pquery.c:924
 
#21 0x000000000084af4f in PortalRun (portal=0x18fed30, count=9223372036854775807, isTopLevel=<optimized out>,
run_once=<optimizedout>, dest=0x1988448, altdest=0x1988448, qc=0x7ffc225a9ce0) at pquery.c:768
 
#22 0x000000000084679b in exec_simple_query (query_string=0x186d8a0 "SELECT alarm_id, alarm_disregard FROM alarms WHERE
alarm_ack_timeIS NULL AND alarm_clear_time IS NULL AND alarm_office = 'ETHERNET'") at postgres.c:1250
 
#23 0x000000000084848a in PostgresMain (dbname=<optimized out>, username=<optimized out>) at postgres.c:4581
#24 0x0000000000495afe in BackendRun (port=<optimized out>, port=<optimized out>) at postmaster.c:4504
#25 BackendStartup (port=0x1894250) at postmaster.c:4232
#26 ServerLoop () at postmaster.c:1806
#27 0x00000000007b0c60 in PostmasterMain (argc=argc@entry=3, argv=argv@entry=0x1868280) at postmaster.c:1478
#28 0x00000000004976a6 in main (argc=3, argv=0x1868280) at main.c:202


< 2022-09-09 19:44:03.329 CDT  >LOG:  server process (PID 8949) was terminated by signal 6: Aborted
< 2022-09-09 19:44:03.329 CDT  >DETAIL:  Failed process was running: SELECT alarm_id, alarm_disregard FROM alarms
                              WHERE alarm_shakeout_time<=now()
                                    AND alarm_shakeout_time>now()-$1::interval
                                    AND alarm_time!=alarm_shakeout_time
                                    AND alarm_clear_time IS NULL

For context, that's a partition the main table on this DB, and any
problem there would've been immediately apparent.  We have multiple
instances of that query which run continuously.

I saved a copy of the data dir, but I'm going to have to bring the DB back
online soon.  I don't know if I can get a page image easily since GDB is a bit
busted (won't show source code) and I can't remember how I fixed that last
time...

(gdb) fr 6
#6  0x0000000000502326 in heap_page_prune_opt (relation=0x7fb8a50c3438, buffer=buffer@entry=5100) at pruneheap.c:209
209     pruneheap.c: No such file or directory.

(gdb) p *vistest
$2 = {definitely_needed = {value = 8074911083}, maybe_needed = {value = 8074911083}}


#2  0x000000000098f9be in ExceptionalCondition (conditionName=conditionName@entry=0x9fada4
"TransactionIdIsValid(xmax)",errorType=errorType@entry=0x9ed217 "FailedAssertion",
 
    fileName=fileName@entry=0x9fad90 "heapam_visibility.c", lineNumber=lineNumber@entry=1353) at assert.c:69
No locals.
#3  0x00000000004fd4d7 in HeapTupleSatisfiesVacuumHorizon (htup=htup@entry=0x7ffc225a87e0, buffer=buffer@entry=5100,
dead_after=dead_after@entry=0x7ffc225a87d0)at heapam_visibility.c:1353
 
        xmax = 0
        tuple = 0x2aaab5a5fdf8
#4  0x0000000000501702 in heap_prune_satisfies_vacuum (buffer=5100, tup=0x7ffc225a87e0, prstate=0x7ffc225a8a50) at
pruneheap.c:504
        res = <optimized out>
        dead_after = 0
#5  heap_page_prune (relation=relation@entry=0x7fb8a50c3438, buffer=buffer@entry=5100, vistest=vistest@entry=0xec7890
<GlobalVisDataRels>,old_snap_xmin=<optimized out>, old_snap_ts=<optimized out>,
 
    nnewlpdead=nnewlpdead@entry=0x7ffc225a964c, off_loc=off_loc@entry=0x0) at pruneheap.c:351
        itemid = 0x2aaab5a5fd38
        htup = <optimized out>
        ndeleted = 0
        page = 0x2aaab5a5fd00 <Address 0x2aaab5a5fd00 out of bounds>
        blockno = 2892
        offnum = 9
        maxoff = 10
        prstate = {rel = 0x7fb8a50c3438, vistest = 0xec7890 <GlobalVisDataRels>, old_snap_ts = 0, old_snap_xmin = 0,
old_snap_used= false, new_prune_xid = 0, latestRemovedXid = 0, nredirected = 0, ndead = 0, nunused = 0,
 
          redirected = {32696, 0, 22724, 151, 0, 0, 1114, 0, 0, 0, 434, 0, 0, 0, 434, 0, 0, 0, 1114, 0, 0, 0, 1114, 0,
0,0, 1, 0, 0, 0, 0, 0, 1, 3, 0, 0, 950, 0, 6960, 144, 0, 0, 184, 0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 52144,
 
            397, 0, 0, 0, 0, 0, 0, 434, 0, 0, 0, 0, 0, 2, 3, 0, 0, 950, 0, 6960, 144, 0, 0, 184, 0, 2, 1, 2, 0, 0, 0,
0,0, 0, 0, 52144, 397, 0, 0, 0, 0, 0, 0, 1114, 0, 0, 0, 0, 0, 3, 3, 0, 0, 950, 0, 6960, 144, 0, 0, 184, 0, 
            2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 52144, 397, 0, 0, 0, 0, 0, 0, 1114, 0, 0, 0, 0, 0, 4, 3, 0, 0, 950, 0, 51440,
139,0, 0, 63, 0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 52144, 397, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
 
            5120, 60453, 16693, 62684, 0, 0, 0, 0, 60168, 397, 0, 0, 1304, 408, 0, 0, 29816, 7947, 0, 0, 3472, 408...},
nowdead= {63057, 4137, 0, 0, 64763, 42226, 32696, 0, 26544, 121, 0, 0, 36704, 8794, 32764, 0, 27788,
 
            41847, 32696, 0, 32664, 41847, 32696, 0, 36976, 8794, 32764, 0, 36960, 8794, 32764, 0, 17, 0, 32764, 0, 0,
0,0, 0, 0, 0, 0, 0, 19304, 42256, 32696, 0, 54472, 42260, 32696, 0, 14860, 69, 0, 0, 36432, 41847, 32696,
 
            0, 28184, 65, 0, 0, 0, 0, 1, 0, 177, 0, 1, 0, 11240, 408, 0, 0, 37160, 8794, 32764, 0, 37120, 8794, 32764,
0,1, 0, 0, 0, 19304, 42256, 32696, 0, 38056, 42260, 32696, 0, 37200, 42260, 32696, 0, 65503, 42226,
 
            32696, 0, 0, 0, 0, 0, 19304, 42256, 32696, 0, 1, 0, 0, 16368, 0, 0, 0, 0, 1, 0, 32764, 0, 37200, 42260,
32696,0, 0, 0, 0, 0, 42289, 152, 0, 0, 53392, 406, 0, 0, 11240, 408, 0, 0, 0, 0, 0, 0, 38056, 42260, 32696,
 
            0, 36976, 8794, 32764, 0, 36960, 8794, 32764, 0, 63057, 4137, 0, 0, 14860, 69, 0, 0, 65535, 65535, 0, 0, 3,
0,0, 0, 32664, 41847, 32696, 0, 54472, 42260, 32696, 0, 0, 0, 0, 16368, 0, 0, 0, 16400, 0, 0, 0, 0,
 
            5243, 18350, 31457, 16228...}, nowunused = {0, 32000, 46500, 10922, 0, 11, 0, 0, 0, 17, 2560, 0, 0, 61105,
6,0, 0, 37544, 8794, 32764, 0, 28552, 408, 0, 0, 0, 0, 0, 0, 52144, 397, 0, 0, 8096, 0, 65535, 2,
 
            0 <repeats 98 times>, 11, 0, 10, 0, 4, 25089, 78, 11265, 0, 0, 0, 0, 0, 0, 1007, 0, 42, 0, 43, 0, 2406, 0,
2407,0, 0, 0, 0, 0, 0, 0, 32639, 32639, 32639, 32639, 32639, 32639, 32639, 32639, 36353, 33032, 24272,
 
            16340, 0, 0, 0, 0, 0, 0, 0, 16368, 0, 0, 0, 0, 0, 0, 18136, 16663, 0, 0, 0, 0, 0, 49152, 18127, 16359, 0,
0,0, 0, 59295, 42150, 63299, 48114, 0, 0, 0, 0, 28399, 5317, 64332, 48184, 0, 0, 0, 0, 8832, 36553, 22255,
 
            16426, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 55168, 390, 0, 0, 64, 0, 0, 0, 231, 0, 0, 0, 38652,
8794,32764, 0, 38640, 8794, 32764, 0, 0, 0, 0, 0, 3092, 156, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 6,
 
            0 <repeats 13 times>}, marked = {false <repeats 292 times>},
          htsv =
"\340Xm&\343:\367/$\254\001\357\335@*q\212^D\032\070\327\004\347!6\231\275\347Ac\207\207\236\262\211%2(\373\303\240U[\207\274\033\246\374{T\355\331\177\311\310m\222\200",
'\000'<repeats 17 times>,
"\220\314\277MH<\262\000\000\000\000\000\221\022\000\000\065A\334\364\370>\262\000\000\000\000\000\000\024%\354\065A\334\364\330\227\035\263\252*\000\000\300\227\035\263\252*\000\000\000\375\245\265\252*\000\000\200\301\300\252\252*\000\000P\347\226\001\000\000\000\000\330\227\035\263\252*\000\000\375\017\233\260\000\000\000\000\277\303\200\000\000\000\000\000`\213\223\001\000\000\000\000\333\227\035\263\252*\000\000\000\000\000\000L\v\000\000\000\000\000\000\000\000\000\000"...}
        tup = {t_len = 596, t_self = {ip_blkid = {bi_hi = 0, bi_lo = 2892}, ip_posid = 9}, t_tableOid = 17852325,
t_data= 0x2aaab5a5fdf8}
 
#6  0x0000000000502326 in heap_page_prune_opt (relation=0x7fb8a50c3438, buffer=buffer@entry=5100) at pruneheap.c:209
        ndeleted = <optimized out>
        nnewlpdead = -186891979
        page = 0x2aaab5a5fd00 <Address 0x2aaab5a5fd00 out of bounds>
        prune_xid = <optimized out>
        vistest = 0xec7890 <GlobalVisDataRels>
        limited_xmin = 0
        limited_ts = 0
        minfree = 819
#7  0x00000000004f3ae3 in heapgetpage (sscan=sscan@entry=0x199b1d0, page=page@entry=2892) at heapam.c:415
        scan = 0x199b1d0
        buffer = 5100
        snapshot = 0x188ef18
        lines = <optimized out>
        ntup = <optimized out>
        lineoff = <optimized out>
        lpp = <optimized out>
        all_visible = <optimized out>
#8  0x00000000004f44c2 in heapgettup_pagemode (scan=scan@entry=0x199b1d0, dir=<optimized out>, nkeys=0, key=0x0) at
heapam.c:1120
        tuple = 0x199b228
        backward = false
        page = 2892
        finished = false
        dp = 0x2aaab5a5dd00 <Address 0x2aaab5a5dd00 out of bounds>
        lines = <optimized out>
        lineindex = 8
        lineoff = <optimized out>
        linesleft = 0
        lpp = <optimized out>
#9  0x00000000004f5abe in heap_getnextslot (sscan=0x199b1d0, direction=<optimized out>, slot=0x1967be8) at
heapam.c:1352

-- 
Justin



pgsql-hackers by date:

Previous
From: Amit Kapila
Date:
Subject: Re: why can't a table be part of the same publication as its schema
Next
From: Bharath Rupireddy
Date:
Subject: Re: Assertion failure in WaitForWALToBecomeAvailable state machine