Re: Need help debugging SIGBUS crashes - Mailing list pgsql-hackers

From Peter 'PMc' Much
Subject Re: Need help debugging SIGBUS crashes
Date
Msg-id ab7IuP9NApo1e8Nj@disp.intra.daemon.contact
Whole thread Raw
In response to Re: Need help debugging SIGBUS crashes  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-hackers
On Tue, Mar 17, 2026 at 04:56:48PM -0400, Tom Lane wrote:
! "Peter 'PMc' Much" <pmc@citylink.dinoex.sub.org> writes:
! > On Tue, Mar 17, 2026 at 10:12:07AM -0400, Tom Lane wrote:
! > ! Why it was okay in older FreeBSD and not so much in v14, who knows?
! 
! > Maybe it wasn't. Here it appeared out of thin air in February, while
! > the system was upgraded from 13.5 to 14.3 in July'25, and did run
! > without problems for these eight months.
! > So this is not directly or solely related to FBSD R.14, and while it
! > is more likely to happen during massive memory use, this also is not
! > stringent. Neither did I find any other solid determining condition.
! 
! Yeah, it seems likely that there is some additional triggering
! condition that we don't understand; otherwise there would be more
! people complaining than just you.  But if updating to PG16 gets
! rid of the problem, I'm not sure it is worth the time to try to
! narrow down what that additional trigger is.
! 
! Of course, if you still see the issue after upgrading, we'll have
! to dig harder.


Sadly, here it is again with PG r16.13, at the same place as before.


* thread #1, name = 'postgres', stop reason = signal SIGBUS
  * frame #0: 0x000000082bba3159 libc.so.7`extent_arena_get [inlined] extent_arena_ind_get(extent=0x79f696918ed45a56)
at extent_inlines.h:40:23
 
    frame #1: 0x000000082bba3159 libc.so.7`extent_arena_get(extent=0x79f696918ed45a56) at extent_inlines.h:49:23
    frame #2: 0x000000082bba3a14 libc.so.7`extent_can_coalesce(arena=0x00003d43fd800980, extents=0x00003d43fd8058d8,
inner=0x00003d43fd90f080, outer=0x79f696918ed45a56) at jemalloc_extent.c:1565:6
 
    frame #3: 0x000000082bba363b libc.so.7`extent_try_coalesce_impl(tsdn=0x00003d43fd67a090, arena=0x00003d43fd800980,
r_extent_hooks=0x0000000820af5198,rtree_ctx=0x00003d43fd67a0c0, extents=0x00003d43fd8058d8, extent=0x00003d43fd90f080,
coalesced=0x0000000000000000,growing_retained=true, inactive_only=false) at jemalloc_extent.c:1628:24
 
    frame #4: 0x000000082bba3448 libc.so.7`extent_try_coalesce(tsdn=0x00003d43fd67a090, arena=0x00003d43fd800980,
r_extent_hooks=0x0000000820af5198,rtree_ctx=0x00003d43fd67a0c0, extents=0x00003d43fd8058d8, extent=0x00003d43fd90f080,
coalesced=0x0000000000000000,growing_retained=true) at jemalloc_extent.c:1680:9
 
    frame #5: 0x000000082bba055f libc.so.7`extent_record(tsdn=0x00003d43fd67a090, arena=0x00003d43fd800980,
r_extent_hooks=0x0000000820af5198,extents=0x00003d43fd8058d8, extent=0x00003d43fd90f080, growing_retained=true) at
jemalloc_extent.c:1719:12
    frame #6: 0x000000082bba6043 libc.so.7`extent_grow_retained(tsdn=0x00003d43fd67a090, arena=0x00003d43fd800980,
r_extent_hooks=0x0000000820af5198, size=65536, pad=4096, alignment=64, slab=false, szind=44, zero=0x0000000820af51ef,
commit=0x0000000820af5197) at jemalloc_extent.c:1385:4
 
    frame #7: 0x000000082bba0f3f libc.so.7`extent_alloc_retained(tsdn=0x00003d43fd67a090, arena=0x00003d43fd800980,
r_extent_hooks=0x0000000820af5198,new_addr=0x0000000000000000, size=65536, pad=4096, alignment=64, slab=false,
szind=44,zero=0x0000000820af51ef, commit=0x0000000820af5197) at jemalloc_extent.c:1482:12
 
    frame #8: 0x000000082bba0d39 libc.so.7`__je_extent_alloc_wrapper(tsdn=0x00003d43fd67a090, arena=0x00003d43fd800980,
r_extent_hooks=0x0000000820af5198,new_addr=0x0000000000000000, size=65536, pad=4096, alignment=64, slab=false,
szind=44,zero=0x0000000820af51ef, commit=0x0000000820af5197) at jemalloc_extent.c:1541:21
 
    frame #9: 0x000000082bb7a87d libc.so.7`__je_arena_extent_alloc_large(tsdn=<unavailable>, arena=0x00003d43fd800980,
usize=65536,alignment=<unavailable>, zero=0x0000000820af51ef) at jemalloc_arena.c:448:12
 
    frame #10: 0x000000082bba77b0 libc.so.7`__je_large_palloc(tsdn=0x00003d43fd67a090, arena=<unavailable>,
usize=<unavailable>,alignment=64, zero=<unavailable>) at jemalloc_large.c:47:43
 
    frame #11: 0x000000082bba7612 libc.so.7`__je_large_malloc(tsdn=<unavailable>, arena=<unavailable>,
usize=<unavailable>,zero=<unavailable>) at jemalloc_large.c:17:9 [artificial]
 
    frame #12: 0x000000082bb7c477 libc.so.7`__je_arena_malloc_hard(tsdn=<unavailable>, arena=<unavailable>,
size=<unavailable>,ind=<unavailable>, zero=<unavailable>) at jemalloc_arena.c:1528:9 [artificial]
 
    frame #13: 0x000000082bb6f5a7 libc.so.7`__je_malloc_default [inlined] arena_malloc(tsdn=0x00003d43fd67a090,
arena=0x0000000000000000, size=<unavailable>, ind=<unavailable>, zero=false, tcache=0x00003d43fd67a280, slow_path=false)
at arena_inlines_b.h:176:9
 
    frame #14: 0x000000082bb6f598 libc.so.7`__je_malloc_default [inlined] iallocztm(tsdn=0x00003d43fd67a090,
size=<unavailable>,ind=<unavailable>, zero=false, tcache=0x00003d43fd67a280, is_internal=false,
arena=0x0000000000000000,slow_path=false) at jemalloc_internal_inlines_c.h:53:8
 
    frame #15: 0x000000082bb6f598 libc.so.7`__je_malloc_default [inlined] imalloc_no_sample(sopts=<unavailable>,
dopts=<unavailable>,tsd=0x00003d43fd67a090, size=<unavailable>, usize=65536, ind=<unavailable>) at
jemalloc_jemalloc.c:1953:9
    frame #16: 0x000000082bb6f598 libc.so.7`__je_malloc_default [inlined] imalloc_body(sopts=<unavailable>,
dopts=<unavailable>,tsd=0x00003d43fd67a090) at jemalloc_jemalloc.c:2153:16
 
    frame #17: 0x000000082bb6f598 libc.so.7`__je_malloc_default [inlined] imalloc(sopts=<unavailable>,
dopts=<unavailable>) at jemalloc_jemalloc.c:2262:10
 
    frame #18: 0x000000082bb6f4ca libc.so.7`__je_malloc_default(size=<unavailable>) at jemalloc_jemalloc.c:2293:2
    frame #19: 0x000000082bb6fa2d libc.so.7`__malloc(size=<unavailable>) at jemalloc_jemalloc.c:0 [artificial]
    frame #20: 0x000000082bad08a4 libc.so.7`_dns_gethostbyaddr(rval=0x0000000820af5a90, cb_data=<unavailable>,
ap=<unavailable>) at gethostbydns.c:619:13
 
    frame #21: 0x000000082badeab2 libc.so.7`_nsdispatch(retval=0x0000000820af5a90, disp_tab=0x000000082bbd8800,
database="",method_name="", defaults=<unavailable>) at nsdispatch.c:726:14
 
    frame #22: 0x000000082bad2be8 libc.so.7`gethostbyaddr_r(addr=0x0000000820af5ae0, len=<unavailable>,
af=<unavailable>,hp=0x000000082bbebda0, buf="", buflen=8800, result=0x0000000820af5a90, h_errnop=0x0000000820af5a8c) at
gethostnamadr.c:650:9
    frame #23: 0x000000082bad34f9 libc.so.7`gethostbyaddr(addr=0x0000000820af5ae0, len=16, af=28) at
gethostnamadr.c:700:6
    frame #24: 0x000000082baddcd8 libc.so.7`getipnodebyaddr(src=0x0000000820af5ae0, len=<unavailable>, af=28,
errp=0x0000000820af5b50) at name6.c:378:7
 
    frame #25: 0x000000082bad4242 libc.so.7`getnameinfo_inet(afd=0x000000082bbd8980, sa=0x00003d43fda5e098,
salen=<unavailable>,host=<unavailable>, hostlen=<unavailable>, serv=<unavailable>, servlen=0, flags=4) at
getnameinfo.c:311:8
    frame #26: 0x000000082bad405d libc.so.7`getnameinfo(sa=<unavailable>, salen=<unavailable>, host=<unavailable>,
hostlen=<unavailable>,serv=<unavailable>, servlen=<unavailable>, flags=4) at getnameinfo.c:157:10
 
    frame #27: 0x0000000000a85081 postgres`pg_getnameinfo_all + 177
    frame #28: 0x0000000000774262 postgres`hba_getauthmethod + 1202
    frame #29: 0x000000000076a412 postgres`ClientAuthentication + 50
    frame #30: 0x0000000000a49fd1 postgres`InitPostgres + 2273
    frame #31: 0x00000000008eac4d postgres`PostgresMain + 285
    frame #32: 0x0000000000857108 postgres`BackendRun + 40
    frame #33: 0x0000000000855a1a postgres`ServerLoop + 7866
    frame #34: 0x000000000085300e postgres`PostmasterMain + 3278
    frame #35: 0x000000000077bac3 postgres`main + 803
    frame #36: 0x000000082ba72edc libc.so.7`__libc_start1(argc=4, argv=0x0000000820af8700, env=0x0000000820af8728,
cleanup=<unavailable>,mainX=(postgres`main)) at libc_start1.c:180:7
 
    frame #37: 0x0000000000556de4 postgres`_start + 36


This is frame #3, and 'extent_t *next' does not seem to point to an
extent_t:

   1601 static extent_t *
   1602 extent_try_coalesce_impl(tsdn_t *tsdn, arena_t *arena,
   1603     extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents,
   1604     extent_t *extent, bool *coalesced, bool growing_retained,
   1605     bool inactive_only) {
   1606         /*
   1607          * We avoid checking / locking inactive neighbors for large size
   1608          * classes, since they are eagerly coalesced on deallocation which can
   1609          * cause lock contention.
   1610          */
   1611         /*
   1612          * Continue attempting to coalesce until failure, to protect against
   1613          * races with other threads that are thwarted by this one.
   1614          */
   1615         bool again;
   1616         do {
   1617                 again = false;
   1618
   1619                 /* Try to coalesce forward. */
   1620                 extent_t *next = extent_lock_from_addr(tsdn, rtree_ctx,
   1621                     extent_past_get(extent), inactive_only);
   1622                 if (next != NULL) {
   1623                         /*
   1624                          * extents->mtx only protects against races for
   1625                          * like-state extents, so call extent_can_coalesce()
   1626                          * before releasing next's pool lock.
   1627                          */
   1628                         bool can_coalesce = extent_can_coalesce(arena, extents,
   1629                             extent, next);


(lldb) p next
(extent_t *) 0x79f696918ed45a56
(lldb) p *next
error: Couldn't apply expression side effects : Couldn't dematerialize a result variable: couldn't read its memory
(lldb) p extent
(extent_t *) 0x00003d43fd90f080
(lldb) p *extent
(extent_t) {
  e_bits = 8796153896960
  e_addr = 0x00003d43fe211000
   = (e_size_esn = 2551808, e_bsize = 2551808)
  ql_link = {
    qre_next = 0x00003d43fd90f080
    qre_prev = 0x00003d43fd90f080
  }
  ph_link = {
    phn_prev = NULL
    phn_next = NULL
    phn_lchild = NULL
  }
   = {
    e_slab_data = {
      bitmap = ([0] = 0, [1] = 0, [2] = 0, [3] = 0, [4] = 0, [5] = 0, [6] = 0, [7] = 0)
    }
     = {
      e_alloc_time = (ns = 0)
      e_prof_tctx = (repr = 0x0000000000000000)
    }
  }
}



pgsql-hackers by date:

Previous
From: Tom Lane
Date:
Subject: Re: Persistent data across SETOF calls
Next
From: Amit Kapila
Date:
Subject: Re: Use SIGTERM instead of SIGUSR1 for slotsync worker to exit during promotion?