Re: min_safe_lsn column in pg_replication_slots view - Mailing list pgsql-hackers

From Kyotaro Horiguchi
Subject Re: min_safe_lsn column in pg_replication_slots view
Date
Msg-id 20200701.103259.1263090824041890800.horikyota.ntt@gmail.com
Whole thread Raw
In response to Re: min_safe_lsn column in pg_replication_slots view  (Fujii Masao <masao.fujii@oss.nttdata.com>)
List pgsql-hackers
At Tue, 30 Jun 2020 23:23:30 +0900, Fujii Masao <masao.fujii@oss.nttdata.com> wrote in
> >> Can we consider an option to "Remove min_safe_lsn; document how a user
> >> can monitor the distance"?  We have a function to get current WAL
> >> insert location and other things required are available either via
> >> view or as guc variable values.  The reason I am thinking of this
> >> option is that it might be better to get some more feedback on what is
> >> the most appropriate value to display.  However, I am okay if we can
> >> reach a consensus on one of the above options.
> > Yes, that's an idea. But it might not be easy to calculate that
> > distance
> > manually by subtracting max_slot_wal_keep_size from the current LSN.
> > Because we've not supported -(pg_lsn, numeric) operator yet. I'm
> > proposing that operator, but it's for v14.
>
> Sorry this is not true. That distance can be calculated without those
> operators.
> For example,
>
> SELECT restart_lsn - pg_current_wal_lsn() + (SELECT setting::numeric *
> 1024 * 1024 FROM pg_settings WHERE name = 'max_slot_wal_keep_size')
> distance FROM pg_replication_slots;

It's an approximation with accuracy of segment size. The calculation
would be not that simple because of the unit of the calculation.  The
formula for the exact calculateion (ignoring wal_keep_segments) is:

distance = (seg_floor(restart_lsn) +
             seg_floor(max_slot_wal_keep_size) + 1) * wal_segment_size -
            current_lsn

where seg_floor is floor() by wal_segment_size.

regards.

> If the calculated distance is small or negative value, which means
> that
> we may lose some required WAL files. So in this case it's worth
> considering
> to increase max_slot_wal_keep_size.
>
> I still think it's better and more helpful to display something like
> that distance in pg_replication_slots rather than making each user
> calculate it...

Agreed.  The attached replaces min_safe_lsn with "distance".

regards.

--
Kyotaro Horiguchi
NTT Open Source Software Center
From 7e078c9ff8e0594ce8f4e95c7db84ea0a31e9775 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyoga.ntt@gmail.com>
Date: Tue, 30 Jun 2020 21:09:19 +0900
Subject: [PATCH v1] Replace min_safe_lsn with "distance" in
 pg_replication_slots

pg_replication_slot.min_safe_lsn, which shows the oldest LSN kept in
pg_wal, is doubtful in usability for monitoring. Change it to
distance, which shows how many bytes the server can advance before the
slot loses required segments.
---
 src/backend/access/transam/xlog.c         | 39 +++++++++++++++++++++++
 src/backend/catalog/system_views.sql      |  2 +-
 src/backend/replication/slotfuncs.c       | 19 +++++------
 src/include/access/xlog.h                 |  1 +
 src/include/catalog/pg_proc.dat           |  4 +--
 src/test/recovery/t/019_replslot_limit.pl | 20 ++++++------
 src/test/regress/expected/rules.out       |  4 +--
 7 files changed, 62 insertions(+), 27 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index fd93bcfaeb..1f27639912 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -9570,6 +9570,45 @@ GetWALAvailability(XLogRecPtr targetLSN)
     return WALAVAIL_REMOVED;
 }
 
+/*
+ * Calculate how many bytes we can advance from currptr until the targetLSN is
+ * removed.
+ *
+ * Returns 0 if the distance is invalid.
+ */
+uint64
+DistanceToWALHorizon(XLogRecPtr targetLSN, XLogRecPtr currptr)
+{
+    XLogSegNo    targetSeg;
+    XLogSegNo    keepSegs;
+    XLogSegNo    failSeg;
+    XLogRecPtr    horizon;
+
+    XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
+    keepSegs = 0;
+
+    /* no limit if max_slot_wal_keep_size is invalid */
+    if (max_slot_wal_keep_size_mb < 0)
+        return 0;
+
+    /* How many segments slots can keep? */
+    keepSegs = ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
+
+    /* override by wal_keep_segments if needed */
+    if (wal_keep_segments > keepSegs)
+        keepSegs = wal_keep_segments;
+
+    /* calculate the LSN where targetLSN is lost when currpos reaches */
+    failSeg = targetSeg + keepSegs + 1;
+    XLogSegNoOffsetToRecPtr(failSeg, 0, wal_segment_size, horizon);
+
+    /* If currptr already beyond the horizon, return zero. */
+    if (currptr > horizon)
+        return 0;
+
+    /* return the distance from currptr to the horizon */
+    return horizon - currptr;
+}
 
 /*
  * Retreat *logSegNo to the last segment that we need to retain because of
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 5314e9348f..b9847a9f92 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -879,7 +879,7 @@ CREATE VIEW pg_replication_slots AS
             L.restart_lsn,
             L.confirmed_flush_lsn,
             L.wal_status,
-            L.min_safe_lsn
+            L.distance
     FROM pg_get_replication_slots() AS L
             LEFT JOIN pg_database D ON (L.datoid = D.oid);
 
diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c
index 88033a79b2..532b3c5826 100644
--- a/src/backend/replication/slotfuncs.c
+++ b/src/backend/replication/slotfuncs.c
@@ -242,6 +242,7 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
     Tuplestorestate *tupstore;
     MemoryContext per_query_ctx;
     MemoryContext oldcontext;
+    XLogRecPtr    currlsn;
     int            slotno;
 
     /* check to see if caller supports us returning a tuplestore */
@@ -274,6 +275,8 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
 
     MemoryContextSwitchTo(oldcontext);
 
+    currlsn = GetXLogWriteRecPtr();
+
     LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
     for (slotno = 0; slotno < max_replication_slots; slotno++)
     {
@@ -282,7 +285,7 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
         Datum        values[PG_GET_REPLICATION_SLOTS_COLS];
         bool        nulls[PG_GET_REPLICATION_SLOTS_COLS];
         WALAvailability walstate;
-        XLogSegNo    last_removed_seg;
+        uint32        distance;
         int            i;
 
         if (!slot->in_use)
@@ -398,16 +401,10 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
                 break;
         }
 
-        if (max_slot_wal_keep_size_mb >= 0 &&
-            (walstate == WALAVAIL_RESERVED || walstate == WALAVAIL_EXTENDED) &&
-            ((last_removed_seg = XLogGetLastRemovedSegno()) != 0))
-        {
-            XLogRecPtr    min_safe_lsn;
-
-            XLogSegNoOffsetToRecPtr(last_removed_seg + 1, 0,
-                                    wal_segment_size, min_safe_lsn);
-            values[i++] = Int64GetDatum(min_safe_lsn);
-        }
+        distance =
+            DistanceToWALHorizon(slot_contents.data.restart_lsn, currlsn);
+        if (distance > 0)
+            values[i++] = Int64GetDatum(distance);
         else
             nulls[i++] = true;
 
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 77ac4e785f..1ec448c5d5 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -329,6 +329,7 @@ extern void InitXLOGAccess(void);
 extern void CreateCheckPoint(int flags);
 extern bool CreateRestartPoint(int flags);
 extern WALAvailability GetWALAvailability(XLogRecPtr targetLSN);
+extern uint64 DistanceToWALHorizon(XLogRecPtr targetLSN, XLogRecPtr currptr);
 extern XLogRecPtr CalculateMaxmumSafeLSN(void);
 extern void XLogPutNextOid(Oid nextOid);
 extern XLogRecPtr XLogRestorePoint(const char *rpName);
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 61f2c2f5b4..199fd994bd 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -10063,9 +10063,9 @@
   proname => 'pg_get_replication_slots', prorows => '10', proisstrict => 'f',
   proretset => 't', provolatile => 's', prorettype => 'record',
   proargtypes => '',
-  proallargtypes => '{name,name,text,oid,bool,bool,int4,xid,xid,pg_lsn,pg_lsn,text,pg_lsn}',
+  proallargtypes => '{name,name,text,oid,bool,bool,int4,xid,xid,pg_lsn,pg_lsn,text,int8}',
   proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o}',
-  proargnames =>
'{slot_name,plugin,slot_type,datoid,temporary,active,active_pid,xmin,catalog_xmin,restart_lsn,confirmed_flush_lsn,wal_status,min_safe_lsn}',
+  proargnames =>
'{slot_name,plugin,slot_type,datoid,temporary,active,active_pid,xmin,catalog_xmin,restart_lsn,confirmed_flush_lsn,wal_status,distance}',
   prosrc => 'pg_get_replication_slots' },
 { oid => '3786', descr => 'set up a logical replication slot',
   proname => 'pg_create_logical_replication_slot', provolatile => 'v',
diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl
index 7d22ae5720..1c76d2d9e9 100644
--- a/src/test/recovery/t/019_replslot_limit.pl
+++ b/src/test/recovery/t/019_replslot_limit.pl
@@ -28,7 +28,7 @@ $node_master->safe_psql('postgres',
 
 # The slot state and remain should be null before the first connection
 my $result = $node_master->safe_psql('postgres',
-    "SELECT restart_lsn IS NULL, wal_status is NULL, min_safe_lsn is NULL FROM pg_replication_slots WHERE slot_name =
'rep1'"
+    "SELECT restart_lsn IS NULL, wal_status is NULL, distance is NULL FROM pg_replication_slots WHERE slot_name =
'rep1'"
 );
 is($result, "t|t|t", 'check the state of non-reserved slot is "unknown"');
 
@@ -52,9 +52,9 @@ $node_master->wait_for_catchup($node_standby, 'replay', $start_lsn);
 # Stop standby
 $node_standby->stop;
 
-# Preparation done, the slot is the state "normal" now
+# Preparation done, the slot is the state "reserved" now
 $result = $node_master->safe_psql('postgres',
-    "SELECT wal_status, min_safe_lsn is NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
+    "SELECT wal_status, distance is NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
 );
 is($result, "reserved|t", 'check the catching-up state');
 
@@ -64,7 +64,7 @@ $node_master->safe_psql('postgres', "CHECKPOINT;");
 
 # The slot is always "safe" when fitting max_wal_size
 $result = $node_master->safe_psql('postgres',
-    "SELECT wal_status, min_safe_lsn is NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
+    "SELECT wal_status, distance is NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
 );
 is($result, "reserved|t",
     'check that it is safe if WAL fits in max_wal_size');
@@ -74,7 +74,7 @@ $node_master->safe_psql('postgres', "CHECKPOINT;");
 
 # The slot is always "safe" when max_slot_wal_keep_size is not set
 $result = $node_master->safe_psql('postgres',
-    "SELECT wal_status, min_safe_lsn is NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
+    "SELECT wal_status, distance is NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
 );
 is($result, "reserved|t", 'check that slot is working');
 
@@ -94,9 +94,7 @@ max_slot_wal_keep_size = ${max_slot_wal_keep_size_mb}MB
 ));
 $node_master->reload;
 
-# The slot is in safe state. The distance from the min_safe_lsn should
-# be as almost (max_slot_wal_keep_size - 1) times large as the segment
-# size
+# The slot is in safe state.
 
 $result = $node_master->safe_psql('postgres',
     "SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'rep1'");
@@ -110,7 +108,7 @@ $node_master->safe_psql('postgres', "CHECKPOINT;");
 $result = $node_master->safe_psql('postgres',
     "SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'rep1'");
 is($result, "reserved",
-    'check that min_safe_lsn gets close to the current LSN');
+    'check that distance gets close to the current LSN');
 
 # The standby can reconnect to master
 $node_standby->start;
@@ -154,7 +152,7 @@ advance_wal($node_master, 1);
 
 # Slot gets into 'unreserved' state
 $result = $node_master->safe_psql('postgres',
-    "SELECT wal_status, min_safe_lsn is NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
+    "SELECT wal_status, distance is NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
 );
 is($result, "unreserved|t",
     'check that the slot state changes to "unreserved"');
@@ -186,7 +184,7 @@ ok( find_in_log(
 
 # This slot should be broken
 $result = $node_master->safe_psql('postgres',
-    "SELECT slot_name, active, restart_lsn IS NULL, wal_status, min_safe_lsn FROM pg_replication_slots WHERE slot_name
='rep1'"
 
+    "SELECT slot_name, active, restart_lsn IS NULL, wal_status, distance FROM pg_replication_slots WHERE slot_name =
'rep1'"
 );
 is($result, "rep1|f|t|lost|",
     'check that the slot became inactive and the state "lost" persists');
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index b813e32215..392eab12dd 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1464,8 +1464,8 @@ pg_replication_slots| SELECT l.slot_name,
     l.restart_lsn,
     l.confirmed_flush_lsn,
     l.wal_status,
-    l.min_safe_lsn
-   FROM (pg_get_replication_slots() l(slot_name, plugin, slot_type, datoid, temporary, active, active_pid, xmin,
catalog_xmin,restart_lsn, confirmed_flush_lsn, wal_status, min_safe_lsn)
 
+    l.distance
+   FROM (pg_get_replication_slots() l(slot_name, plugin, slot_type, datoid, temporary, active, active_pid, xmin,
catalog_xmin,restart_lsn, confirmed_flush_lsn, wal_status, distance)
 
      LEFT JOIN pg_database d ON ((l.datoid = d.oid)));
 pg_roles| SELECT pg_authid.rolname,
     pg_authid.rolsuper,
-- 
2.18.4


pgsql-hackers by date:

Previous
From: Bruce Momjian
Date:
Subject: Re: Remove Deprecated Exclusive Backup Mode
Next
From: michael@paquier.xyz
Date:
Subject: Re: [PATCH] Better cleanup in TLS tests for -13beta2