BUG #16331: segfault in checkpointer with full disk - Mailing list pgsql-bugs

From PG Bug reporting form
Subject BUG #16331: segfault in checkpointer with full disk
Date
Msg-id 16331-16d6e966536b6390@postgresql.org
Whole thread Raw
Responses Re: BUG #16331: segfault in checkpointer with full disk
List pgsql-bugs
The following bug has been logged on the website:

Bug reference:      16331
Logged by:          Jozef Mlich
Email address:      jmlich83@gmail.com
PostgreSQL version: 12.2
Operating system:   CentOS
Description:

I can see segfaults on CentOS 7 with postgresql 12.2-2PGDG.rhel7 (from
yum.postgresql.org). I am using multiple extensions  (cstore, postgres_fdw,
pgcrypto,dblink, etc.). It seems crash is related to disk run out of space
(I am using separate partion for / and for /var/lib/pgsql). It occurs few
times a day. According to backtrace it seems to be related to checkpointer.
Replication is not configured. 


[New LWP 26290]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `postgres: checkpointer
 '.
Program terminated with signal 6, Aborted.
#0  0x00007fe4604c1207 in __GI_raise (sig=sig@entry=6) at
../nptl/sysdeps/unix/sysv/linux/raise.c:55
55      return INLINE_SYSCALL (tgkill, 3, pid, selftid, sig);

Thread 1 (Thread 0x7fe462e148c0 (LWP 26290)):
#0  0x00007fe4604c1207 in __GI_raise (sig=sig@entry=6) at
../nptl/sysdeps/unix/sysv/linux/raise.c:55
        resultvar = 0
        pid = 26290
        selftid = 26290
#1  0x00007fe4604c28f8 in __GI_abort () at abort.c:90
        save_stage = 2
        act = {__sigaction_handler = {sa_handler = 0x0, sa_sigaction = 0x0},
sa_mask = {__val = {0, 0, 0, 0, 0, 9268713, 70403103920717,
39808819211026438, 20126216749056, 70394513997832, 9268713, 70403103920719,
17316096998686159616, 20134806683648, 140618848608704, 140618848592800}},
sa_flags = 1615828275, sa_restorer = 0x0}
        sigs = {__val = {32, 0 <repeats 15 times>}}
#2  0x000000000087840a in errfinish (dummy=<optimized out>) at elog.c:552
        edata = 0xd47040 <errordata>
        elevel = 22
        oldcontext = 0x171a6d0
        econtext = 0x0
        __func__ = "errfinish"
#3  0x0000000000706b24 in CheckPointReplicationOrigin () at origin.c:562
        tmppath = 0x9e6fa8 "pg_logical/replorigin_checkpoint.tmp"
        path = 0x9e6fd0 "pg_logical/replorigin_checkpoint"
        tmpfd = <optimized out>
        i = <optimized out>
        magic = 307747550
        crc = 4294967295
        __func__ = "CheckPointReplicationOrigin"
#4  0x0000000000509766 in CheckPointGuts (checkPointRedo=243199514360,
flags=flags@entry=256) at xlog.c:9007
No locals.
#5  0x0000000000510106 in CreateCheckPoint (flags=flags@entry=256) at
xlog.c:8795
        shutdown = false
        checkPoint = {redo = 243199514360, ThisTimeLineID = 1,
PrevTimeLineID = 1, fullPageWrites = true, nextFullXid = {value = 20251117},
nextOid = 9645082, nextMulti = 3, nextMultiOffset = 5, oldestXid = 479,
oldestXidDB = 14186, oldestMulti = 1, oldestMultiDB = 1, time = 1585692001,
oldestCommitTsXid = 0, newestCommitTsXid = 0, oldestActiveXid = 20251117}
        recptr = <optimized out>
        _logSegNo = 0
        Insert = <optimized out>
        freespace = <optimized out>
        PriorRedoPtr = <optimized out>
        curInsert = 243199514360
        last_important_lsn = <optimized out>
        vxids = 0x17389d8
        nvxids = 0
        __func__ = "CreateCheckPoint"
#6  0x00000000006e4982 in CheckpointerMain () at checkpointer.c:481
        ckpt_performed = false
        do_restartpoint = <optimized out>
        flags = 256
        do_checkpoint = <optimized out>
        now = 1585692001
        elapsed_secs = <optimized out>
        cur_timeout = <optimized out>
        local_sigjmp_buf = {{__jmpbuf = {140732342117584,
-1758805613090239410, 2, 9211212, 0, 24183888, -1758805613174125490,
1759480033986153550}, __mask_was_saved = 1, __saved_mask = {__val =
{18446744066192964103, 0 <repeats 15 times>}}}}
        checkpointer_context = 0x171a6d0
        __func__ = "CheckpointerMain"
#7  0x000000000051fe75 in AuxiliaryProcessMain (argc=argc@entry=2,
argv=argv@entry=0x7ffecd42a480) at bootstrap.c:461
        progname = 0x8c8d4c "postgres"
        flag = <optimized out>
        userDoption = 0x0
        __func__ = "AuxiliaryProcessMain"
#8  0x00000000006eef70 in StartChildProcess (type=CheckpointerProcess) at
postmaster.c:5392
        pid = <optimized out>
        av = {0x8c8d4c "postgres", 0x7ffecd42a4d0 "-x4", 0x0, 0x0, 0x16c92e0
"postgres-12", 0x5 <Address 0x5 out of bounds>, 0x7ffecd42a400 "L\215\214",
0x6 <Address 0x6 out of bounds>, 0x0, 0x7fe33f1daff8 "@NL`\344\177"}
        ac = 2
        typebuf =
"-x4\000\376\177\000\000\000s\345\306y\"O\360\001\000\000\000]\201\240J\000\000\000\000\000\000\000"
#9  0x00000000006f02ba in reaper (postgres_signal_arg=<optimized out>) at
postmaster.c:2973
        save_errno = 4
        pid = <optimized out>
        exitstatus = 0
        __func__ = "reaper"
#10 <signal handler called>
No locals.
#11 0x00007fe46057ff53 in __select_nocancel () at
../sysdeps/unix/syscall-template.S:81
No locals.
#12 0x0000000000483484 in ServerLoop () at postmaster.c:1668
        timeout = {tv_sec = 59, tv_usec = 872935}
        rmask = {fds_bits = {120, 0 <repeats 15 times>}}
        selres = <optimized out>
        now = <optimized out>
        readmask = {fds_bits = {120, 0 <repeats 15 times>}}
        last_lockfile_recheck_time = 1585685433
        last_touch_time = 1585685433
        __func__ = "ServerLoop"
#13 0x00000000006f195f in PostmasterMain (argc=argc@entry=3,
argv=argv@entry=0x16c9300) at postmaster.c:1377
        opt = <optimized out>
        status = <optimized out>
        userDoption = <optimized out>
        listen_addr_saved = true
        i = <optimized out>
        output_config_variable = <optimized out>
        __func__ = "PostmasterMain"
#14 0x0000000000484dd3 in main (argc=3, argv=0x16c9300) at main.c:228
No locals.



Please let me know if I can provide some more information.

best regards,
 Jozef Mlich


pgsql-bugs by date:

Previous
From: Fujii Masao
Date:
Subject: Re: [BUG] non archived WAL removed during production crash recovery
Next
From: Julien Rouhaud
Date:
Subject: Re: BUG #16331: segfault in checkpointer with full disk