Thread: New sessions on a database to be dropped consume 100% cpu

New sessions on a database to be dropped consume 100% cpu

From
Feike Steenbergen
Date:
While dropping a database, I was unable to do so because of a connected session. I used pg_terminate_backend to try to get rid of the session, however it is unkillable (from postgres). The time of connecting this sessions seems to 

Top output for this process looks like this:

 8496 postgres  20   0 39.2g 6072 4084 R 100.0  0.0  15:39.93 postgres: clustername: postgres name_to_be_dropped [local] CREATE TABLE

strace output looks like this:

semop(20709497, {{10, -1, 0}}, 1)       = 0
semop(20709497, {{10, -1, 0}}, 1)       = 0
semop(20709497, {{10, -1, 0}}, 1)       = 0
semop(20709497, {{10, -1, 0}}, 1)       = 0
semop(20709497, {{10, -1, 0}}, 1)       = 0
semop(20709497, {{10, -1, 0}}, 1)       = 0
semop(20709497, {{10, -1, 0}}, 1)       = 0
semop(20709497, {{10, -1, 0}}, 1)       = 0

ltrace output looks like this, it seems to loop:

memcmp(0x7fb6cb720d30, 0x7ffc9c857080, 20, 104, 0)                                                              = 0
memcmp(0x7fb6d0a5d0d8, 0x7ffc9c857080, 20, 30219, 0)                                                            = 0
memcmp(0x7fb6cf2c5bd8, 0x7ffc9c857080, 20, 12097, 0)                                                            = 0
memcmp(0x7fb6d398da68, 0x7ffc9c857080, 20, 23156, 0)                                                            = 0
memcmp(0x7fb6cdf1a6d0, 0x7ffc9c857080, 20, 23399, 0)                                                            = 0
memcmp(0x7fb6ca552740, 0x7ffc9c857080, 20, 26768, 0)                                                            = 0
memcmp(0x7fb6d2d4ea48, 0x7ffc9c857080, 20, 23160, 0)                                                            = 0
memcmp(0x7fb6d22020c0, 0x7ffc9c857080, 20, 103, 0)                                                              = 0
memcmp(0x7fb6cb13f370, 0x7ffc9c857080, 20, 22652, 0)                                                            = 0
memcmp(0x7fb6c8cf6f08, 0x7ffc9c857080, 20, 20672, 0)                                                            = 0
memcmp(0x7fb6cf8155d0, 0x7ffc9c857080, 20, 13922, 0)                                                            = 0
memcmp(0x7fb6d063bd60, 0x7ffc9c857080, 20, 5665, 0)                                                             = 0
memcmp(0x7fb6c8cfcf20, 0x7ffc9c857080, 20, 11229, 0)                                                            = 0
memcmp(0x7fb6cc8f2d68, 0x7ffc9c857080, 20, 13217, 0)                                                            = 0
memcmp(0x7fb6c99a89b0, 0x7ffc9c857080, 20, 16231, 0)                                                            = 0
memcmp(0x7fb6d0fff280, 0x7ffc9c857080, 20, 19748, 0)                                                            = 0
memcmp(0x7fb6d0aa9b78, 0x7ffc9c857080, 20, 4145, 0)                                                             = 0
memcmp(0x7fb6cd49f140, 0x7ffc9c857080, 20, 2674, 0)                                                             = 0
memcmp(0x7fb6d1e0f300, 0x7ffc9c857080, 20, 18625, 0)                                                            = 0
memcmp(0x7fb6cb75cc40, 0x7ffc9c857080, 20, 6088, 0)                                                             = 0
memcmp(0x7fb6d3fb37b8, 0x7ffc9c857080, 20, 14026, 0)                                                            = 0
memcmp(0x7fb6c87ece68, 0x7ffc9c857080, 20, 23769, 0)                                                            = 0
memcmp(0x7fb6ccee60e0, 0x7ffc9c857080, 20, 11005, 0)                                                            = 0
memcmp(0x7fb6d0a35f88, 0x7ffc9c857080, 20, 28027, 0)                                                            = 0
memcmp(0x7fb6cc2cfd40, 0x7ffc9c857080, 20, 28376, 0)                                                            = 0
memcmp(0x7fb6d3fb3858, 0x7ffc9c857080, 20, 10121, 0)                                                            = 0
memcmp(0x7fb6cd4a1a80, 0x7ffc9c857080, 20, 10335, 0)                                                            = 0
memcmp(0x7fb6cb720d30, 0x7ffc9c857080, 20, 104, 0)                                                              = 0
memcmp(0x7fb6d0a5d0d8, 0x7ffc9c857080, 20, 30219, 0)                                                            = 0
memcmp(0x7fb6cf2c5bd8, 0x7ffc9c857080, 20, 12097, 0)                                                            = 0
memcmp(0x7fb6d398da68, 0x7ffc9c857080, 20, 23156, 0)                                                            = 0
memcmp(0x7fb6cdf1a6d0, 0x7ffc9c857080, 20, 23399, 0)                                                            = 0
memcmp(0x7fb6ca552740, 0x7ffc9c857080, 20, 26768, 0)                                                            = 0
memcmp(0x7fb6d2d4ea48, 0x7ffc9c857080, 20, 23160, 0)                                                            = 0
memcmp(0x7fb6d22020c0, 0x7ffc9c857080, 20, 103, 0)                                                              = 0
memcmp(0x7fb6cb13f370, 0x7ffc9c857080, 20, 22652, 0)                                                            = 0
memcmp(0x7fb6c8cf6f08, 0x7ffc9c857080, 20, 20672, 0)                                                            = 0
memcmp(0x7fb6cf8155d0, 0x7ffc9c857080, 20, 13922, 0)                                                            = 0
memcmp(0x7fb6d063bd60, 0x7ffc9c857080, 20, 5665, 0)                                                             = 0
memcmp(0x7fb6c8cfcf20, 0x7ffc9c857080, 20, 11229, 0)                                                            = 0
memcmp(0x7fb6cc8f2d68, 0x7ffc9c857080, 20, 13217, 0)                                                            = 0
memcmp(0x7fb6c99a89b0, 0x7ffc9c857080, 20, 16231, 0)                                                            = 0
memcmp(0x7fb6d0fff280, 0x7ffc9c857080, 20, 19748, 0)                                                            = 0
memcmp(0x7fb6d0aa9b78, 0x7ffc9c857080, 20, 4145, 0)                                                             = 0
memcmp(0x7fb6cd49f140, 0x7ffc9c857080, 20, 2674, 0)                                                             = 0
memcmp(0x7fb6d1e0f300, 0x7ffc9c857080, 20, 18625, 0)                                                            = 0
memcmp(0x7fb6cb75cc40, 0x7ffc9c857080, 20, 6088, 0)                                                             = 0
memcmp(0x7fb6d3fb37b8, 0x7ffc9c857080, 20, 14026, 0)                                                            = 0
memcmp(0x7fb6c87ece68, 0x7ffc9c857080, 20, 23769, 0)                                                            = 0
memcmp(0x7fb6ccee60e0, 0x7ffc9c857080, 20, 11005, 0)                                                            = 0
memcmp(0x7fb6d0a35f88, 0x7ffc9c857080, 20, 28027, 0)                                                            = 0
memcmp(0x7fb6cc2cfd40, 0x7ffc9c857080, 20, 28376, 0)                                                            = 0
memcmp(0x7fb6d3fb3858, 0x7ffc9c857080, 20, 10121, 0)                                                            = 0
memcmp(0x7fb6cd4a1a80, 0x7ffc9c857080, 20, 10335, 0)                                                            = 0

gdb output looks like this:

NewPrivateRefCountEntry (buf=0x7fad228e9300, strategy=0x0) at bufmgr.c:265
265        res->refcount = 0;
#0  NewPrivateRefCountEntry (buf=0x7fad228e9300, strategy=0x0) at bufmgr.c:265
#1  PinBuffer (buf=0x7fad228e9300, strategy=0x0) at bufmgr.c:1585
#2  0x00000000006bfd3c in BufferAlloc (smgr=<value optimized out>, relpersistence=112 'p', forkNum=MAIN_FORKNUM, blockNum=322, mode=RBM_NORMAL, strategy=0x0, hit=0x7ffc9c85710f "") at bufmgr.c:1235
#3  ReadBuffer_common (smgr=<value optimized out>, relpersistence=112 'p', forkNum=MAIN_FORKNUM, blockNum=322, mode=RBM_NORMAL, strategy=0x0, hit=0x7ffc9c85710f "") at bufmgr.c:745
#4  0x00000000006c0720 in ReadBufferExtended (reln=0x7fad1647f920, forkNum=MAIN_FORKNUM, blockNum=322, mode=<value optimized out>, strategy=<value optimized out>) at bufmgr.c:664
#5  0x00000000004b154c in _bt_relandgetbuf (rel=0x7fad1647f920, obuf=232570, blkno=<value optimized out>, access=1) at nbtpage.c:712
#6  0x00000000004b5b40 in _bt_moveright (rel=0x7fad1647f920, buf=232570, keysz=3, scankey=0x1d1ac18, nextkey=<value optimized out>, forupdate=1 '\001', stack=0x1d1aa68, access=1, snapshot=0x0) at nbtsearch.c:277
#7  0x00000000004b5db3 in _bt_search (rel=0x7fad1647f920, keysz=3, scankey=0x1d1ac18, nextkey=0 '\000', bufP=0x7ffc9c8572d8, access=<value optimized out>, snapshot=0x0) at nbtsearch.c:129
#8  0x00000000004b0bd9 in _bt_doinsert (rel=0x7fad1647f920, itup=0x1cfee88, checkUnique=UNIQUE_CHECK_NO, heapRel=0x7fad1647f1c0) at nbtinsert.c:122
#9  0x00000000004b4508 in btinsert (rel=0x7fad1647f920, values=<value optimized out>, isnull=<value optimized out>, ht_ctid=0x1cfeebc, heapRel=0x7fad1647f1c0, checkUnique=UNIQUE_CHECK_NO) at nbtree.c:284
#10 0x00000000004fe80f in CatalogIndexInsert (indstate=<value optimized out>, heapTuple=0x1cfeeb8) at indexing.c:136
#11 0x0000000000516e2b in recordMultipleDependencies (depender=0x7ffc9c8575d0, referenced=<value optimized out>, nreferenced=1, behavior=<value optimized out>) at pg_depend.c:116
#12 0x000000000051ddc7 in GenerateTypeDependencies (typeNamespace=<value optimized out>, typeObjectId=147632822, relationOid=147632820, relationKind=114 'r', owner=<value optimized out>, inputProcedure=<value optimized out>, outputProcedure=2291, receiveProcedure=2402, sendProcedure=2403, typmodinProcedure=0, typmodoutProcedure=0, analyzeProcedure=0, elementType=0, isImplicitArray=0 '\000', baseType=0,
    typeCollation=0, defaultExpr=0x0, rebuild=0 '\000') at pg_type.c:644
#13 0x000000000051e646 in TypeCreate (newTypeOid=0, typeName=0x7ffc9c857c00 "\210\342\317\001", typeNamespace=45964026, relationOid=147632820, relationKind=114 'r', ownerId=10, internalSize=-1, typeType=99 'c', typeCategory=67 'C', typePreferred=0 '\000', typDelim=44 ',', inputProcedure=2290, outputProcedure=2291, receiveProcedure=2402, sendProcedure=2403, typmodinProcedure=0, typmodoutProcedure=0,
    analyzeProcedure=0, elementType=0, isImplicitArray=0 '\000', arrayType=147632821, baseType=0, defaultTypeValue=0x0, defaultTypeBin=0x0, passedByValue=0 '\000', alignment=100 'd', storage=120 'x', typeMod=-1, typNDims=0, typeNotNull=0 '\000', typeCollation=0) at pg_type.c:471
#14 0x00000000004f9d0d in AddNewRelationType (relname=0x7ffc9c857c50 "defaultstocheck", relnamespace=45964026, reltablespace=0, relid=147632820, reltypeid=0, reloftypeid=0, ownerid=10, tupdesc=0x1cfead8, cooked_constraints=0x0, relkind=114 'r', relpersistence=116 't', shared_relation=0 '\000', mapped_relation=0 '\000', oidislocal=0 '\000', oidinhcount=0, oncommit=ONCOMMIT_NOOP, reloptions=0, use_user_acl=1 '\001',
    allow_system_table_mods=0 '\000', is_internal=0 '\000', typaddress=0x0) at heap.c:950
#15 heap_create_with_catalog (relname=0x7ffc9c857c50 "defaultstocheck", relnamespace=45964026, reltablespace=0, relid=147632820, reltypeid=0, reloftypeid=0, ownerid=10, tupdesc=0x1cfead8, cooked_constraints=0x0, relkind=114 'r', relpersistence=116 't', shared_relation=0 '\000', mapped_relation=0 '\000', oidislocal=0 '\000', oidinhcount=0, oncommit=ONCOMMIT_NOOP, reloptions=0, use_user_acl=1 '\001',
    allow_system_table_mods=0 '\000', is_internal=0 '\000', typaddress=0x0) at heap.c:1198
#16 0x00000000005adeaa in DefineRelation (stmt=0x1d02af8, relkind=114 'r', ownerId=10, typaddress=0x0) at tablecmds.c:661
#17 0x00000000006f0879 in ProcessUtilitySlow (parsetree=0x1d92050, queryString=0x1d90b88 "CREATE TEMP TABLE DefaultsToCheck (\nkind CHAR(1) NOT NULL,\nname VARCHAR NOT NULL,\nfrom_version integer NULL\n);", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, dest=<value optimized out>, completionTag=0x7ffc9c8584b0 "") at utility.c:980
#18 0x00000000006f0cc4 in standard_ProcessUtility (parsetree=0x1d92050, queryString=0x1d90b88 "CREATE TEMP TABLE DefaultsToCheck (\nkind CHAR(1) NOT NULL,\nname VARCHAR NOT NULL,\nfrom_version integer NULL\n);", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, dest=<value optimized out>, completionTag=0x7ffc9c8584b0 "") at utility.c:907
#19 0x00007fad170e471c in ?? () from /usr/pgsql-9.6/lib/pglogical.so
#20 0x00007fad16a33ecd in pgss_ProcessUtility (parsetree=0x1d92050, queryString=0x1d90b88 "CREATE TEMP TABLE DefaultsToCheck (\nkind CHAR(1) NOT NULL,\nname VARCHAR NOT NULL,\nfrom_version integer NULL\n);", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, dest=0x1d923f0, completionTag=0x7ffc9c8584b0 "") at pg_stat_statements.c:1047
#21 0x00000000006ed329 in PortalRunUtility (portal=0x1cd2108, utilityStmt=0x1d92050, isTopLevel=1 '\001', setHoldSnapshot=<value optimized out>, dest=0x1d923f0, completionTag=0x7ffc9c8584b0 "") at pquery.c:1193
#22 0x00000000006ee1f4 in PortalRunMulti (portal=0x1cd2108, isTopLevel=1 '\001', setHoldSnapshot=0 '\000', dest=0x1d923f0, altdest=0x1d923f0, completionTag=0x7ffc9c8584b0 "") at pquery.c:1349
#23 0x00000000006ee9ef in PortalRun (portal=0x1cd2108, count=9223372036854775807, isTopLevel=1 '\001', dest=0x1d923f0, altdest=0x1d923f0, completionTag=0x7ffc9c8584b0 "") at pquery.c:815
#24 0x00000000006eb659 in exec_simple_query (query_string=0x1d90b88 "CREATE TEMP TABLE DefaultsToCheck (\nkind CHAR(1) NOT NULL,\nname VARCHAR NOT NULL,\nfrom_version integer NULL\n);") at postgres.c:1086
#25 0x00000000006ecd1c in PostgresMain (argc=<value optimized out>, argv=<value optimized out>, dbname=0x1d05748 "name_to_be_dropped", username=<value optimized out>) at postgres.c:4072
#26 0x000000000068acef in BackendRun (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:4342
#27 BackendStartup (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:4016
#28 ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1721
#29 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1329
#30 0x00000000006104a9 in main (argc=3, argv=0x1cb2b10) at main.c:228
Detaching from program: /usr/pgsql-9.6/bin/postgres, process 8496

I now have 3 sessions, each doing the same thing, triggered by a cron job, connected to this database and acting as if they are unkillable by pg_terminate_backend.

I guess this is a bug somewhere.

I disabled connections to the database altogether for now, so new sessions should not be created.

I currently still have 3 of these sessions running as I'd like to troubleshoot more to see what causes this. Also, I guess my only way out is now to restart postgresql altogether.
I can keep these processes running for a day or so to gather more data.

Operating System: CentOS 6.10
PostgreSQL: 9.6.10

regards,

Feike Steenbergen

Re: New sessions on a database to be dropped consume 100% cpu

From
Tom Lane
Date:
Feike Steenbergen <feikesteenbergen@gmail.com> writes:
> While dropping a database, I was unable to do so because of a connected
> session. I used pg_terminate_backend to try to get rid of the session,
> however it is unkillable (from postgres).

The stack trace shows that it's stuck trying to make index entries for
some pg_depend additions.  I'm wondering if one of the indexes for
pg_depend is corrupt (infinite loop of page right-links, say).
If you could install amcheck and see what it thinks about those indexes,
that would be interesting.

Of course, if you're just wishing you could get rid of the database,
you could forcibly kill those processes and move on.

            regards, tom lane


Re: New sessions on a database to be dropped consume 100% cpu

From
Feike Steenbergen
Date:
> If you could install amcheck and see what it thinks about those indexes,
> that would be interesting.

I created a backup (using pg_basebackup) from this cluster and had a look:

name_to_be_dropped=# select bt_index_check('pg_catalog.pg_depend_depender_index');
ERROR:  cross page item order invariant violated for index "pg_depend_depender_index"
DETAIL:  Last item on page tid=(255,138) page lsn=33A5/F47BC3B0.

> Of course, if you're just wishing you could get rid of the database,
> you could forcibly kill those processes and move on.

Unfortunately other databases are still serving a useful purpose on the same cluster
and it seems my only recourse would be restart the whole cluster, as I currently can't
kill them using SIGTERM.

Thanks,

Feike

Re: New sessions on a database to be dropped consume 100% cpu

From
Peter Geoghegan
Date:
On Mon, Nov 26, 2018 at 1:12 AM Feike Steenbergen
<feikesteenbergen@gmail.com> wrote:
> I created a backup (using pg_basebackup) from this cluster and had a look:
>
> name_to_be_dropped=# select bt_index_check('pg_catalog.pg_depend_depender_index');
> ERROR:  cross page item order invariant violated for index "pg_depend_depender_index"
> DETAIL:  Last item on page tid=(255,138) page lsn=33A5/F47BC3B0.

What does bt_index_parent_check() say? It might be different.

For some reason the pg_depend indexes are very well represented when
there is corruption at the storage level, so this doesn't necessarily
tell us much. I'd say that there is a very good chance that the
sibling page of block 255 contains a page image that isn't even a
B-Tree page at all (could be a heap block, could be garbage). I've
seen things like that in pg_depend index blocks quite a number of
times.

You could use a tool like pg_filedump to figure this out -- that
doesn't assume anything about the type of block it has been passed
(pg_hexedit would also work, and is helpful if you want a more visual
representation). You'd need to figure out the sibling block using
pageinspect, and from there you could use pg_filedump to dump that
block. This probably isn't going to help you to figure out what
happened, but it might.

-- 
Peter Geoghegan