From 4f8e82845a03015f5870933e7712fde5a10ebdfd Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy Date: Tue, 20 Feb 2024 05:44:19 +0000 Subject: [PATCH v5 3/4] Track inactive replication slot information Currently postgres doesn't track metrics like the time at which the slot became inactive, and the total number of times the slot became inactive in its lifetime. This commit adds two new metrics last_inactive_at of type timestamptz and inactive_count of type numeric to ReplicationSlotPersistentData. Whenever a slot becomes inactive, the current timestamp and inactive count are persisted to disk. These metrics are useful in the following ways: - To improve replication slot monitoring tools. For instance, one can build a monitoring tool that signals a) when replication slots is lying inactive for a day or so using last_inactive_at metric, b) when a replication slot is becoming inactive too frequently using last_inactive_at metric. - To implement timeout-based inactive replication slot management capability in postgres. Increases SLOT_VERSION due to the added two new metrics. --- doc/src/sgml/system-views.sgml | 20 +++++++++++++ src/backend/catalog/system_views.sql | 4 ++- src/backend/replication/slot.c | 43 ++++++++++++++++++++++------ src/backend/replication/slotfuncs.c | 15 +++++++++- src/include/catalog/pg_proc.dat | 6 ++-- src/include/replication/slot.h | 6 ++++ src/test/regress/expected/rules.out | 6 ++-- 7 files changed, 84 insertions(+), 16 deletions(-) diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index cce88c14bb..0dfd472b02 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -2771,6 +2771,26 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx ID of role + + + + last_inactive_at timestamptz + + + The time at which the slot became inactive. + NULL if the slot is currently actively being + used. + + + + + + inactive_count numeric + + + The total number of times the slot became inactive in its lifetime. + + diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index c39f0d73d3..a5a78a9910 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1026,7 +1026,9 @@ CREATE VIEW pg_replication_slots AS L.conflict_reason, L.failover, L.synced, - L.invalidation_reason + L.invalidation_reason, + L.last_inactive_at, + L.inactive_count FROM pg_get_replication_slots() AS L LEFT JOIN pg_database D ON (L.datoid = D.oid); diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 67d6bd849e..ce51c6d909 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -91,7 +91,7 @@ typedef struct ReplicationSlotOnDisk sizeof(ReplicationSlotOnDisk) - ReplicationSlotOnDiskConstantSize #define SLOT_MAGIC 0x1051CA1 /* format identifier */ -#define SLOT_VERSION 5 /* version for new files */ +#define SLOT_VERSION 6 /* version for new files */ /* Control array for replication slot management */ ReplicationSlotCtlData *ReplicationSlotCtl = NULL; @@ -346,6 +346,8 @@ ReplicationSlotCreate(const char *name, bool db_specific, slot->data.two_phase_at = InvalidXLogRecPtr; slot->data.failover = failover; slot->data.synced = synced; + slot->data.last_inactive_at = 0; + slot->data.inactive_count = 0; /* and then data only present in shared memory */ slot->just_dirtied = false; @@ -572,6 +574,17 @@ retry: if (am_walsender) { + if (s->data.persistency == RS_PERSISTENT) + { + SpinLockAcquire(&s->mutex); + s->data.last_inactive_at = 0; + SpinLockRelease(&s->mutex); + + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + } + ereport(log_replication_commands ? LOG : DEBUG1, SlotIsLogical(s) ? errmsg("acquired logical replication slot \"%s\"", @@ -639,16 +652,20 @@ ReplicationSlotRelease(void) ConditionVariableBroadcast(&slot->active_cv); } - MyReplicationSlot = NULL; - - /* might not have been set when we've been a plain slot */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - MyProc->statusFlags &= ~PROC_IN_LOGICAL_DECODING; - ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; - LWLockRelease(ProcArrayLock); - if (am_walsender) { + if (slot->data.persistency == RS_PERSISTENT) + { + SpinLockAcquire(&slot->mutex); + slot->data.last_inactive_at = GetCurrentTimestamp(); + slot->data.inactive_count++; + SpinLockRelease(&slot->mutex); + + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + } + ereport(log_replication_commands ? LOG : DEBUG1, is_logical ? errmsg("released logical replication slot \"%s\"", @@ -658,6 +675,14 @@ ReplicationSlotRelease(void) pfree(slotname); } + + MyReplicationSlot = NULL; + + /* might not have been set when we've been a plain slot */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + MyProc->statusFlags &= ~PROC_IN_LOGICAL_DECODING; + ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; + LWLockRelease(ProcArrayLock); } /* diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 7c1145bb75..6a12db27fd 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -239,10 +239,11 @@ pg_drop_replication_slot(PG_FUNCTION_ARGS) Datum pg_get_replication_slots(PG_FUNCTION_ARGS) { -#define PG_GET_REPLICATION_SLOTS_COLS 18 +#define PG_GET_REPLICATION_SLOTS_COLS 20 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; XLogRecPtr currlsn; int slotno; + char buf[256]; /* * We don't require any special permission to see this function's data @@ -464,6 +465,18 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) break; } + if (slot_contents.data.last_inactive_at > 0) + values[i++] = TimestampTzGetDatum(slot_contents.data.last_inactive_at); + else + nulls[i++] = true; + + /* Convert to numeric. */ + snprintf(buf, sizeof buf, UINT64_FORMAT, slot_contents.data.inactive_count); + values[i++] = DirectFunctionCall3(numeric_in, + CStringGetDatum(buf), + ObjectIdGetDatum(0), + Int32GetDatum(-1)); + Assert(i == PG_GET_REPLICATION_SLOTS_COLS); tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index a6bfc36426..3d4ace624e 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -11127,9 +11127,9 @@ proname => 'pg_get_replication_slots', prorows => '10', proisstrict => 'f', proretset => 't', provolatile => 's', prorettype => 'record', proargtypes => '', - proallargtypes => '{name,name,text,oid,bool,bool,int4,xid,xid,pg_lsn,pg_lsn,text,int8,bool,text,bool,bool,text}', - proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}', - proargnames => '{slot_name,plugin,slot_type,datoid,temporary,active,active_pid,xmin,catalog_xmin,restart_lsn,confirmed_flush_lsn,wal_status,safe_wal_size,two_phase,conflict_reason,failover,synced,invalidation_reason}', + proallargtypes => '{name,name,text,oid,bool,bool,int4,xid,xid,pg_lsn,pg_lsn,text,int8,bool,text,bool,bool,text,timestamptz,numeric}', + proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{slot_name,plugin,slot_type,datoid,temporary,active,active_pid,xmin,catalog_xmin,restart_lsn,confirmed_flush_lsn,wal_status,safe_wal_size,two_phase,conflict_reason,failover,synced,invalidation_reason,last_inactive_at,inactive_count}', prosrc => 'pg_get_replication_slots' }, { oid => '3786', descr => 'set up a logical replication slot', proname => 'pg_create_logical_replication_slot', provolatile => 'v', diff --git a/src/include/replication/slot.h b/src/include/replication/slot.h index 8216c35481..87a56aa28a 100644 --- a/src/include/replication/slot.h +++ b/src/include/replication/slot.h @@ -133,6 +133,12 @@ typedef struct ReplicationSlotPersistentData * for logical slots on the primary server. */ bool failover; + + /* When did this slot become inactive last time? */ + TimestampTz last_inactive_at; + + /* How many times the slot has been inactive? */ + uint64 inactive_count; } ReplicationSlotPersistentData; /* diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 0646874236..35fcf9d3d0 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1476,8 +1476,10 @@ pg_replication_slots| SELECT l.slot_name, l.conflict_reason, l.failover, l.synced, - l.invalidation_reason - FROM (pg_get_replication_slots() l(slot_name, plugin, slot_type, datoid, temporary, active, active_pid, xmin, catalog_xmin, restart_lsn, confirmed_flush_lsn, wal_status, safe_wal_size, two_phase, conflict_reason, failover, synced, invalidation_reason) + l.invalidation_reason, + l.last_inactive_at, + l.inactive_count + FROM (pg_get_replication_slots() l(slot_name, plugin, slot_type, datoid, temporary, active, active_pid, xmin, catalog_xmin, restart_lsn, confirmed_flush_lsn, wal_status, safe_wal_size, two_phase, conflict_reason, failover, synced, invalidation_reason, last_inactive_at, inactive_count) LEFT JOIN pg_database d ON ((l.datoid = d.oid))); pg_roles| SELECT pg_authid.rolname, pg_authid.rolsuper, -- 2.34.1