Thread: Server Crash while executing pg_replication_slot_advance (second time)

Server Crash while executing pg_replication_slot_advance (second time)

From
tushar
Date:

Hi,

Getting another server crash against the latest sources of v11 while executing pg_replication_slot_advance a second time. This issue is also reproducible with the patch given at <Link>.

Test-case scenario:

postgres=#  SELECT * FROM pg_create_logical_replication_slot('regression_slot1', 'test_decoding', true);
    slot_name     |    lsn   
------------------+-----------
 regression_slot1 | 0/4000258
(1 row)

postgres=# select * from pg_replication_slots;
    slot_name     |    plugin     | slot_type | datoid | database | temporary | active | active_pid | xmin | catalog_xmin | restart_lsn | confirmed_flush_lsn
------------------+---------------+-----------+--------+----------+-----------+--------+------------+------+--------------+-------------+---------------------
 regression_slot1 | test_decoding | logical   |  13220 | postgres | t         | t      |       8756 |      |          557 | 0/4000220   | 0/4000258
(1 row)

postgres=# select pg_switch_wal();
 pg_switch_wal
---------------
 0/4000270
(1 row)

postgres=# select pg_switch_wal();
 pg_switch_wal
---------------
 0/5000000
(1 row)

postgres=# SELECT end_lsn FROM pg_replication_slot_advance('regression_slot1', '0/5000000');
  end_lsn 
-----------
 0/5000000
(1 row)

postgres=# select * from pg_replication_slots;
    slot_name     |    plugin     | slot_type | datoid | database | temporary | active | active_pid | xmin | catalog_xmin | restart_lsn | confirmed_flush_lsn
------------------+---------------+-----------+--------+----------+-----------+--------+------------+------+--------------+-------------+---------------------
 regression_slot1 | test_decoding | logical   |  13220 | postgres | t         | t      |       8756 |      |          557 | 0/4000220   | 0/5000000
(1 row)

postgres=# select pg_switch_wal();
 pg_switch_wal
---------------
 0/5000078
(1 row)

postgres=# select pg_switch_wal();
 pg_switch_wal
---------------
 0/6000000
(1 row)

postgres=# SELECT end_lsn FROM pg_replication_slot_advance('regression_slot1', '0/6000000');
server closed the connection unexpectedly
    This probably means the server terminated abnormally
    before or while processing the request.
The connection to the server was lost. Attempting reset: Failed.
!> \q

Stack trace -

Core was generated by `postgres: centos postgres [local] SELECT                      '.
Program terminated with signal 6, Aborted.
#0  0x0000003746e325e5 in raise () from /lib64/libc.so.6
Missing separate debuginfos, use: debuginfo-install glibc-2.12-1.192.el6.x86_64 keyutils-libs-1.4-5.el6.x86_64 krb5-libs-1.10.3-57.el6.x86_64 libcom_err-1.41.12-22.el6.x86_64 libselinux-2.0.94-7.el6.x86_64 openssl-1.0.1e-48.el6_8.4.x86_64 zlib-1.2.3-29.el6.x86_64
(gdb) bt
#0  0x0000003746e325e5 in raise () from /lib64/libc.so.6
#1  0x0000003746e33dc5 in abort () from /lib64/libc.so.6
#2  0x0000000000a11f8a in ExceptionalCondition (conditionName=0xad5f00 "!(((RecPtr) % 8192 >= (((uintptr_t) ((sizeof(XLogPageHeaderData))) + ((8) - 1)) & ~((uintptr_t) ((8) - 1)))))",
    errorType=0xad5eed "FailedAssertion", fileName=0xad5ee0 "xlogreader.c", lineNumber=243) at assert.c:54
#3  0x000000000055df1d in XLogReadRecord (state=0x2d859c0, RecPtr=83886080, errormsg=0x7ffd6a4359f8) at xlogreader.c:243
#4  0x00000000008516bc in pg_logical_replication_slot_advance (startlsn=83886080, moveto=100663296) at slotfuncs.c:370
#5  0x0000000000851a21 in pg_replication_slot_advance (fcinfo=0x7ffd6a435bb0) at slotfuncs.c:488
#6  0x00000000006defdc in ExecMakeTableFunctionResult (setexpr=0x2d5e260, econtext=0x2d5df58, argContext=0x2da97f0, expectedDesc=0x2d5f780, randomAccess=0 '\000') at execSRF.c:231
#7  0x00000000006f1782 in FunctionNext (node=0x2d5de40) at nodeFunctionscan.c:95
#8  0x00000000006de80d in ExecScanFetch (node=0x2d5de40, accessMtd=0x6f16cb <FunctionNext>, recheckMtd=0x6f1abd <FunctionRecheck>) at execScan.c:95
#9  0x00000000006de8ad in ExecScan (node=0x2d5de40, accessMtd=0x6f16cb <FunctionNext>, recheckMtd=0x6f1abd <FunctionRecheck>) at execScan.c:162
#10 0x00000000006f1b0a in ExecFunctionScan (pstate=0x2d5de40) at nodeFunctionscan.c:270
#11 0x00000000006dcd66 in ExecProcNodeFirst (node=0x2d5de40) at execProcnode.c:446
#12 0x00000000006d3c05 in ExecProcNode (node=0x2d5de40) at ../../../src/include/executor/executor.h:246
#13 0x00000000006d6559 in ExecutePlan (estate=0x2d5dc28, planstate=0x2d5de40, use_parallel_mode=0 '\000', operation=CMD_SELECT, sendTuples=1 '\001', numberTuples=0,
    direction=ForwardScanDirection, dest=0x2d63628, execute_once=1 '\001') at execMain.c:1721
#14 0x00000000006d41d7 in standard_ExecutorRun (queryDesc=0x2cce878, direction=ForwardScanDirection, count=0, execute_once=1 '\001') at execMain.c:361
#15 0x00000000006d3ff3 in ExecutorRun (queryDesc=0x2cce878, direction=ForwardScanDirection, count=0, execute_once=1 '\001') at execMain.c:304
#16 0x00000000008afeeb in PortalRunSelect (portal=0x2d0e838, forward=1 '\001', count=0, dest=0x2d63628) at pquery.c:932
#17 0x00000000008afb79 in PortalRun (portal=0x2d0e838, count=9223372036854775807, isTopLevel=1 '\001', run_once=1 '\001', dest=0x2d63628, altdest=0x2d63628,
    completionTag=0x7ffd6a436480 "") at pquery.c:773
#18 0x00000000008a9ba0 in exec_simple_query (query_string=0x2ca9438 "SELECT end_lsn FROM pg_replication_slot_advance('regression_slot1', '0/6000000');") at postgres.c:1120
#19 0x00000000008ade34 in PostgresMain (argc=1, argv=0x2cd4ec0, dbname=0x2cd4d20 "postgres", username=0x2ca5f58 "centos") at postgres.c:4144
#20 0x000000000080d11a in BackendRun (port=0x2cccd00) at postmaster.c:4412
#21 0x000000000080c88e in BackendStartup (port=0x2cccd00) at postmaster.c:4084
#22 0x0000000000808cbb in ServerLoop () at postmaster.c:1757
#23 0x0000000000808379 in PostmasterMain (argc=3, argv=0x2ca3eb0) at postmaster.c:1365
#24 0x000000000073d3d0 in main (argc=3, argv=0x2ca3eb0) at main.c:228
(gdb)

-- 
regards,tushar
EnterpriseDB  https://www.enterprisedb.com/
The Enterprise PostgreSQL Company

Re: Server Crash while executing pg_replication_slot_advance (second time)

From
Arseny Sher
Date:
Hello,

I confirm this bug. The idea is that, while we usually start decoding
from the safe data.restart_lsn point, here we don't care about consistent
snapshots and rush into decoding right away from data.confirmed_flush
(slotfuncs.c:475). The latter points to the first page's header instead
of a valid record if we log a SWITCH record and confirm it without any
additional records (when doing this manually you'd better hurry up to
outrun xl_running_xacts slipping in). That is exactly what the reported
backtrace shows: startlsn = 83886080 = 0/5000000 sits on a segment
boundary, so the page-header assertion in XLogReadRecord fires. This can
be reproduced with a somewhat simpler sample:

SELECT * FROM pg_create_logical_replication_slot(
  'regression_slot1', 'test_decoding', true);
select pg_switch_wal();
-- ensure we are on a clean segment and xl_running_xacts hasn't slipped in yet
select pg_current_wal_lsn();
 pg_current_wal_lsn
--------------------
 0/2000000
(1 row)

-- set confirmed_flush_lsn to segment start and verify that
select pg_replication_slot_advance('regression_slot1', '0/6000000');
select slot_name, xmin, catalog_xmin, restart_lsn, confirmed_flush_lsn
  from pg_replication_slots;
    slot_name     | xmin | catalog_xmin | restart_lsn | confirmed_flush_lsn
------------------+------+--------------+-------------+---------------------
 regression_slot1 |      |          557 | 0/15D0EE8   | 0/2000000
(1 row)

-- insert some WAL to advance GetFlushRecPtr
create table t (i int);
-- now start decoding from start of segment, boom
select pg_replication_slot_advance('regression_slot1', '0/6000000');
server closed the connection unexpectedly
    This probably means the server terminated abnormally
    before or while processing the request.
The connection to the server was lost. Attempting reset: Failed.
: @:!>
: @:!>
: @:!> \q


A simple way to fix this is to start decoding, as usual, from
restart_lsn, which certainly points to a valid record. The attached patch
shows the idea. However, I am not sure this is the right decision:
decoding from restart_lsn is slower since it lags behind
confirmed_flush_lsn, and we really don't care about consistent snapshots
in this function. Well, it would be good if we assembled a snapshot on
the way, serialized it and advanced restart_lsn -- which is, AFAIU, the
main reason we ever decode anything at all instead of just calling
LogicalConfirmReceivedLocation -- but that's not the main goal of the
function.

Another approach is to notice that the pointer is at a page start and
replace it with a pointer to the first record, but that doesn't sound
elegant.
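
Roughly, such a check might look like this (just a sketch, nothing in the
tree; it assumes the first record on the page is not a continuation
spilled over from the previous page, which holds for the freshly
switched, otherwise empty segment above):

#include "postgres.h"
#include "access/xlog.h"            /* wal_segment_size */
#include "access/xlog_internal.h"   /* XLogSegmentOffset, SizeOfXLog*PHD */

/*
 * Hypothetical helper: if lsn sits exactly on a page (or segment)
 * boundary, bump it past the page header so that XLogReadRecord() is
 * handed the position of the first record on that page rather than the
 * header itself.
 */
static XLogRecPtr
skip_page_header(XLogRecPtr lsn)
{
    if (lsn % XLOG_BLCKSZ == 0)
    {
        if (XLogSegmentOffset(lsn, wal_segment_size) == 0)
            lsn += SizeOfXLogLongPHD;   /* segment start: long page header */
        else
            lsn += SizeOfXLogShortPHD;  /* plain page start: short header */
    }
    return lsn;
}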

--
Arseny Sher
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

From d3db37bca231beb0081567ef818ac1ec852cbb1a Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Sat, 17 Feb 2018 04:39:09 +0300
Subject: [PATCH] Start decoding from restart_lsn which definitely exists.

---
 src/backend/replication/slotfuncs.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c
index 7302f36488..e7d0ef49bc 100644
--- a/src/backend/replication/slotfuncs.c
+++ b/src/backend/replication/slotfuncs.c
@@ -472,7 +472,10 @@ pg_replication_slot_advance(PG_FUNCTION_ARGS)
     /* Acquire the slot so we "own" it */
     ReplicationSlotAcquire(NameStr(*slotname), true);
 
-    startlsn = MyReplicationSlot->data.confirmed_flush;
+    if (OidIsValid(MyReplicationSlot->data.database))
+        startlsn = MyReplicationSlot->data.restart_lsn;
+    else
+        startlsn = MyReplicationSlot->data.confirmed_flush;
     if (moveto < startlsn)
     {
         ReplicationSlotRelease();
-- 
2.11.0


Re: Server Crash while executing pg_replication_slot_advance (second time)

From
Masahiko Sawada
Date:
On Sat, Feb 17, 2018 at 11:26 AM, Arseny Sher <a.sher@postgrespro.ru> wrote:
>
> Hello,
>
> I confirm this bug. The idea is that while usually we start decoding
> from safe data.restart_lsn point, here we don't care about consistent
> snapshots and rush into decoding right away from data.confirmed_flush
> (slotfunc.c:475). The latter points to the first page's header instead
> of valid record if we log SWITCH record and confirm it without any
> additional records (while doing this manually you'd better hurry up to
> outrun xl_running_xacts slipping through). This can be reproduced with a
> bit simpler sample:
>
> SELECT * FROM pg_create_logical_replication_slot(
>   'regression_slot1', 'test_decoding', true);
> select pg_switch_wal();
> -- ensure we are on clean segment and xl_running_xacts didn't slip yet
> select pg_current_wal_lsn();
>  pg_current_wal_lsn
> --------------------
>  0/2000000
> (1 row)
>
> -- set confirmed_flush_lsn to segment start and verify that
> select pg_replication_slot_advance('regression_slot1', '0/6000000');
> select slot_name, xmin, catalog_xmin, restart_lsn, confirmed_flush_lsn
>   from pg_replication_slots;
>     slot_name     | xmin | catalog_xmin | restart_lsn | confirmed_flush_lsn
> ------------------+------+--------------+-------------+---------------------
>  regression_slot1 |      |          557 | 0/15D0EE8   | 0/2000000
> (1 row)
>
> -- insert some wal to advance GetFlushRecPtr
> create table t (i int);
> -- now start decoding from start of segment, boom
> select pg_replication_slot_advance('regression_slot1', '0/6000000');
> server closed the connection unexpectedly
>         This probably means the server terminated abnormally
>         before or while processing the request.
> The connection to the server was lost. Attempting reset: Failed.
> : @:!>
> : @:!>
> : @:!> \q
>
>
> Simple way to fix that is to start decoding traditionally from
> restart_lsn which certainly points to valid record. Attached patch shows
> the idea. However, I am not sure this is a right decision: decoding from
> restart_lsn is slower since it lags behind confirmed_lsn; and we really
> don't care about consistent snapshots in this function. Well, it would
> be good if we assemble one on our way, serialize it and advance
> restart_lsn -- and this is AFAIU the main reason we ever decode
> something at all instead of just calling LogicalConfirmReceivedLocation
> -- but that's not the main goal of the function.
>
> Another approach is to notice pointer to page start and replace it
> with ptr to first record, but that doesn't sound elegantly.
>

Or I think we need something like the XLogFindNextRecord facility to find
a valid LSN before reading WAL from startlsn. Note that it might wait for
a new record for up to bgwriter_delay time.
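
For instance, something along these lines inside
pg_logical_replication_slot_advance(), after the decoding context is set
up (a sketch only: XLogFindNextRecord() currently sits under #ifdef
FRONTEND in xlogreader.c, so it, or an equivalent, would first have to be
made available to the backend):

    /*
     * Sketch, assuming a backend-usable XLogFindNextRecord(): if startlsn
     * does not point at a valid record (e.g. it sits on a page boundary),
     * search forward for the first valid record before entering the
     * decoding loop.  The lookup goes through the page-read callback, so
     * it may block until the requested WAL becomes available, and a
     * return value of InvalidXLogRecPtr (no record found) would still
     * need handling.
     */
    if (startlsn % XLOG_BLCKSZ == 0)
        startlsn = XLogFindNextRecord(ctx->reader, startlsn);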

Regards,

--
Masahiko Sawada
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center


Re: Server Crash while executing pg_replication_slot_advance (second time)

From
tushar
Date:

Hi,

There is another similar issue where I am getting an error:

postgres=# select * from pg_replication_slots ;
    slot_name     |    plugin     | slot_type | datoid | database | temporary | active | active_pid | xmin | catalog_xmin | restart_lsn | confirmed_flush_lsn
------------------+---------------+-----------+--------+----------+-----------+--------+------------+------+--------------+-------------+---------------------
 regression_slot2 | test_decoding | logical   |  13223 | postgres | f         | f      |            |      |          569 | 0/2B000370  | 0/2B0003A8
(1 row)

postgres=#  SELECT * FROM pg_replication_slot_advance('regression_slot2',pg_switch_wal());
    slot_name     |  end_lsn  
------------------+------------
 regression_slot2 | 0/2B0003C0
(1 row)

postgres=#  SELECT * FROM pg_replication_slot_advance('regression_slot2',pg_switch_wal());
ERROR:  invalid record length at 0/2B0003C0: wanted 24, got 0
postgres=#

regards,

On 02/16/2018 04:04 PM, tushar wrote:
> Getting another server crash against the latest sources of v11 while
> executing pg_replication_slot_advance a second time. This issue is also
> reproducible with the patch given at <Link>.

-- 
regards,tushar
EnterpriseDB  https://www.enterprisedb.com/
The Enterprise PostgreSQL Company