From 7c0715ce49bdf0407de0de39b8b3e43466077744 Mon Sep 17 00:00:00 2001 From: Shveta Malik Date: Mon, 26 May 2025 11:39:17 +0530 Subject: [PATCH v4] Improve log messages and docs for slotsync --- doc/src/sgml/func.sgml | 6 ++- doc/src/sgml/logicaldecoding.sgml | 54 ++++++++++++++++++++-- src/backend/replication/logical/slotsync.c | 6 +-- 3 files changed, 57 insertions(+), 9 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index c67688cbf5f..8d7d9a2f3e8 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -29698,7 +29698,7 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset - + pg_logical_slot_get_binary_changes @@ -29970,7 +29970,9 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset standby server. Temporary synced slots, if any, cannot be used for logical decoding and must be dropped after promotion. See for details. - Note that this function cannot be executed if + Note that this function is primarily intended for testing and + debugging purposes and should be used with caution. Additionaly, + this function cannot be executed if sync_replication_slots is enabled and the slotsync worker is already running to perform the synchronization of slots. diff --git a/doc/src/sgml/logicaldecoding.sgml b/doc/src/sgml/logicaldecoding.sgml index dd9e83b08ea..05763b273b8 100644 --- a/doc/src/sgml/logicaldecoding.sgml +++ b/doc/src/sgml/logicaldecoding.sgml @@ -370,10 +370,10 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU pg_create_logical_replication_slot, or by using the failover option of - CREATE SUBSCRIPTION during slot creation, and then calling - - pg_sync_replication_slots - on the standby. By setting + CREATE SUBSCRIPTION during slot creation. + Additionally, enabling + sync_replication_slots on the standby + is required. By enabling sync_replication_slots on the standby, the failover slots can be synchronized periodically in the slotsync worker. For the synchronization to work, it is mandatory to @@ -398,6 +398,52 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU receiving the WAL up to the latest flushed position on the primary server. + + Apart from enabling + sync_replication_slots to synchronize slots + periodically, failover slots can be manually synchronized by invoking + + pg_sync_replication_slots on the standby. + However, this function is primarily intended for testing and debugging + purposes and should be used with caution. The recommended approach to + synchronize slots is by enabling + sync_replication_slots on the standby, as it + ensures continuous and automatic synchronization of replication slots, + facilitating seamless failover and high availability. + + + + When slot-synchronization setup is done as recommended, and + slot-synchronization is performed the very first time either automatically + or by + pg_sync_replication_slots, + then for the synchronized slot to be created and persisted on the standby, + one condition must be met. The logical replication slot on the primary + must reach a state where the WALs and system catalog rows retained by + the slot are also present on the corresponding standby server. This is + needed to prevent any data loss and to allow logical replication to continue + seamlessly through the synchronized slot if needed after promotion. + If the WALs and system catalog rows retained by the slot on the primary have + already been purged from the standby server, and synchronization is attempted + for the first time, then to prevent the data loss as explained, persistence + and synchronization of newly created slot will be skipped, and the following + log message may appear on standby. + + LOG: could not synchronize replication slot "failover_slot" + DETAIL: Synchronization could lead to data loss as the remote slot needs WAL at LSN 0/3003F28 and catalog xmin 754, but the standby has LSN 0/3003F28 and catalog xmin 756 + + If the logical replication slot is actively consumed by a consumer, no further + manual action is needed by the user, as the slot on primary will be advanced + automatically, and synchronization will proceed in the next cycle. However, + if no logical replication consumer is set up yet, to advance the slot, it + is recommended to manually run the + pg_logical_slot_get_changes or + + pg_logical_slot_get_binary_changes on the primary + slot and allow synchronization to proceed. + + + The ability to resume logical replication after failover depends upon the pg_replication_slots.synced diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index 656e66e0ae0..f1dcbebfa1a 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -211,9 +211,9 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, * impact the users, so we used DEBUG1 level to log the message. */ ereport(slot->data.persistency == RS_TEMPORARY ? LOG : DEBUG1, - errmsg("could not synchronize replication slot \"%s\" because remote slot precedes local slot", + errmsg("could not synchronize replication slot \"%s\"", remote_slot->name), - errdetail("The remote slot has LSN %X/%X and catalog xmin %u, but the local slot has LSN %X/%X and catalog xmin %u.", + errdetail("Synchronization could lead to data loss as the remote slot needs WAL at LSN %X/%X and catalog xmin %u, but the standby has LSN %X/%X and catalog xmin %u.", LSN_FORMAT_ARGS(remote_slot->restart_lsn), remote_slot->catalog_xmin, LSN_FORMAT_ARGS(slot->data.restart_lsn), @@ -593,7 +593,7 @@ update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid) { ereport(LOG, errmsg("could not synchronize replication slot \"%s\"", remote_slot->name), - errdetail("Logical decoding could not find consistent point from local slot's LSN %X/%X.", + errdetail("Synchronization could lead to data loss as standby could not build a consistent snapshot to decode WALs at LSN %X/%X.", LSN_FORMAT_ARGS(slot->data.restart_lsn))); return false; -- 2.34.1