Re: Max connections reached without max connections reached - Mailing list pgsql-general

From Dilip Kumar
Subject Re: Max connections reached without max connections reached
Date
Msg-id CAFiTN-sYJuxkMyPRqg1jdyAQt4zRDe39ztoS7zq0BANRpGkMUQ@mail.gmail.com
Whole thread Raw
In response to Re: Max connections reached without max connections reached  (James Sewell <james.sewell@jirotech.com>)
Responses Re: Max connections reached without max connections reached
Re: Max connections reached without max connections reached
List pgsql-general
On Thu, Nov 25, 2021 at 8:58 AM James Sewell <james.sewell@jirotech.com> wrote:
>>
>> The hypothesis I'm thinking of is that incoming sessions are being blocked
>> somewhere before they can acquire a ProcArray entry; if so, they'd not
>> show up in either pg_stat_activity or pg_locks.  What we have to look for
>> then is evidence of somebody holding a strong lock on a shared relation.
>> Try "select * from pg_locks where locktype = 'relation' and database = 0".
>>
>
> That doesn't show anything when the issue is happening.
>
> Strace shows the startup processes looping like so:
>
> 13:51:15.510850 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=733559666}) = 0
> 13:51:15.511254 futex(0x7ee891405238, FUTEX_WAKE, 1) = 1
> 13:51:15.511422 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=734037545}) = 0
> 13:51:15.511690 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0
> 13:51:15.520470 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=743136450}) = 0
> 13:51:15.520772 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=743376746}) = 0
> 13:51:15.521005 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0
> 13:51:15.539309 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=761920081}) = 0
> 13:51:15.539508 futex(0x7ee891406bb8, FUTEX_WAKE, 1) = 1
> 13:51:15.539690 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=762298276}) = 0
> 13:51:15.539882 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0
> 13:51:15.547311 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=769929341}) = 0
> 13:51:15.547757 futex(0x7ee89140a738, FUTEX_WAKE, 1) = 1
> 13:51:15.547931 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=770524120}) = 0
> 13:51:15.548089 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0
> 13:51:15.551482 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable)
> 13:51:15.551950 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0
> 13:51:15.593631 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable)
> 13:51:15.593807 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0
> 13:51:15.626167 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0
> 13:51:15.627590 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=850204434}) = 0
> 13:51:15.627817 futex(0x7ee8914033b8, FUTEX_WAKE, 1) = 1
> 13:51:15.627987 open("pg_subtrans/A100", O_RDWR) = 16
> 13:51:15.628270 lseek(16, 131072, SEEK_SET) = 131072
> 13:51:15.628461 read(16, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
> 13:51:15.628677 close(16)               = 0
> 13:51:15.628911 open("pg_subtrans/A0E1", O_RDWR) = 16
> 13:51:15.629132 lseek(16, 122880, SEEK_SET) = 122880
> 13:51:15.629309 read(16, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
> 13:51:15.629503 close(16)               = 0
> 13:51:15.629685 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=852272645}) = 0
> 13:51:15.629863 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0
> 13:51:15.656694 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=879314944}) = 0
> 13:51:15.656923 futex(0x7ee8914125b8, FUTEX_WAKE, 1) = 1
> 13:51:15.657129 open("pg_subtrans/A0EE", O_RDWR) = 16
> 13:51:15.657385 lseek(16, 73728, SEEK_SET) = 73728
> 13:51:15.657638 read(16, "\0\0\0\0\0H\356\240\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\5H\356\240\0\0\0\0"..., 8192) = 8192
> 13:51:15.657907 close(16)               = 0
> 13:51:15.658141 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=880753952}) = 0
> 13:51:15.658346 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0
> 13:51:15.689705 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=912331058}) = 0
> 13:51:15.689950 futex(0x7ee89140a0b8, FUTEX_WAKE, 1) = 1
> 13:51:15.690182 futex(0x7ee8914118b8, FUTEX_WAKE, 1) = 1
> 13:51:15.690420 open("pg_subtrans/A0EE", O_RDWR) = 16
> 13:51:15.690742 lseek(16, 73728, SEEK_SET) = 73728
> 13:51:15.691037 read(16, "\0\0\0\0\0H\356\240\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\5H\356\240\0\0\0\0"..., 8192) = 8192
> 13:51:15.691303 close(16)               = 0
> 13:51:15.691582 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=914202777}) = 0
> 13:51:15.691837 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0
> 13:51:15.713319 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0
> 13:51:15.716423 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=939067321}) = 0
> 13:51:15.716737 futex(0x7ee89140a0b8, FUTEX_WAKE, 1) = 1
> 13:51:15.717010 futex(0x7ee89140f3b8, FUTEX_WAKE, 1) = 1
> 13:51:15.717286 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=939899218}) = 0
> 13:51:15.717578 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0
> 13:51:15.722270 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0
> 13:51:15.746391 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=969020657}) = 0
> 13:51:15.746664 futex(0x7ee89140fa38, FUTEX_WAKE, 1) = 1
> 13:51:15.746877 open("pg_subtrans/A0EE", O_RDWR) = 16
>
>
> While gdb shows similar to:
>
>    24 ^[[?1034h(gdb) bt
>    23 #0  0x00007f18fb9deb3b in do_futex_wait.constprop.1 () from /lib64/libpthread.so.0
>    22 #1  0x00007f18fb9debcf in __new_sem_wait_slow.constprop.0 () from /lib64/libpthread.so.0
>    21 #2  0x00007f18fb9dec6b in sem_wait@@GLIBC_2.2.5 () from /lib64/libpthread.so.0
>    20 #3  0x000000000071c482 in PGSemaphoreLock ()
>    19 #4  0x000000000079078c in LWLockAcquire ()
>    18 #5  0x00000000004fa9ae in SimpleLruWaitIO.isra.1 ()
>    17 #6  0x00000000004fb2a4 in SimpleLruReadPage ()
>    16 #7  0x00000000004fbc07 in SubTransGetParent ()
>    15 #8  0x00000000004fbc57 in SubTransGetTopmostTransaction ()
>    14 #9  0x00000000008f65a7 in XidInMVCCSnapshot ()
>    13 #10 0x00000000008f662e in HeapTupleSatisfiesMVCC ()
>    12 #11 0x00000000004c436e in heapgetpage ()
>    11 #12 0x00000000004c58e4 in heap_getnext ()
>    10 #13 0x00000000008ab8b5 in ScanPgRelation ()
>     9 #14 0x00000000008ae124 in RelationBuildDesc ()
>     8 #15 0x00000000008b010e in load_critical_index ()
>     7 #16 0x00000000008b39ac in RelationCacheInitializePhase3 ()
>     6 #17 0x00000000008cbd56 in InitPostgres ()
>     5 #18 0x00000000007a1cfe in PostgresMain ()
>     4 #19 0x000000000048a9c0 in ServerLoop ()
>     3 #20 0x000000000072f11e in PostmasterMain ()
>     2 #21 0x000000000048b854 in main ()
>
> Does that shed any light?

Seems like some of the processes are taking a long time or stuck while
reading/writing SLRU pages, and due to that while creating a new
connection the backend process is not able to check the transaction
status (from pg_xlog) of the pg_class tuple and gets stuck/taking a
long time in a startup.

-- 
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com



pgsql-general by date:

Previous
From: James Sewell
Date:
Subject: Re: Max connections reached without max connections reached
Next
From: Dilip Kumar
Date:
Subject: Re: Max connections reached without max connections reached