From 36062b7ae0c0ae83918dd467764d863d48c91e33 Mon Sep 17 00:00:00 2001 From: Hou Zhijie Date: Tue, 16 Jan 2024 14:02:32 +0800 Subject: [PATCH v63 6/6] Document the steps to check if the standby is ready for failover --- doc/src/sgml/high-availability.sgml | 9 ++ doc/src/sgml/logical-replication.sgml | 130 ++++++++++++++++++++++++++ doc/src/sgml/logicaldecoding.sgml | 33 ++++--- 3 files changed, 158 insertions(+), 14 deletions(-) diff --git a/doc/src/sgml/high-availability.sgml b/doc/src/sgml/high-availability.sgml index 9dd52ff275..11f41aea2c 100644 --- a/doc/src/sgml/high-availability.sgml +++ b/doc/src/sgml/high-availability.sgml @@ -1479,6 +1479,15 @@ synchronous_standby_names = 'ANY 2 (s1, s2, s3)' Written administration procedures are advised. + + If one has opted for synchronization of logical slots as mentioned in + , + then before switching to the standby server, it is recommended to check + if the logical slots synchronized on the standby server are ready + for failover. This can be done by following the steps mentioned in + . + + To trigger failover of a log-shipping standby server, run pg_ctl promote or call pg_promote(). diff --git a/doc/src/sgml/logical-replication.sgml b/doc/src/sgml/logical-replication.sgml index ec2130669e..924e4ea033 100644 --- a/doc/src/sgml/logical-replication.sgml +++ b/doc/src/sgml/logical-replication.sgml @@ -687,6 +687,136 @@ ALTER SUBSCRIPTION + + Logical Replication Failover + + + When the publisher server is the primary server of a streaming replication, + the logical slots on that primary server can be synchronized to the standby + server by specifying failover = true when creating + subscriptions for those publications. Enabling failover ensures a seamless + transition of those subscriptions after the standby is promoted. They can + continue subscribing to publications now on the new primary server without + any data loss. 
+ + + + Because the slot synchronization logic copies asynchronously, it is + necessary to confirm that replication slots have been synced to the standby + server before the failover happens. Furthermore, to ensure a successful + failover, the standby server must not be lagging behind the subscriber. It + is highly recommended to use + standby_slot_names + to prevent the subscriber from consuming changes faster than the hot standby. + To confirm that the standby server is indeed ready for failover, follow + these two steps: + + + + + + Confirm that all the necessary logical replication slots have been synced to + the standby server. + + + + + Firstly, on the subscriber node, use the following SQL to identify + which slots should be synced to the standby that we plan to promote. + +test_sub=# SELECT + array_agg(slotname) AS slots + FROM + (( + SELECT r.srsubid AS subid, CONCAT('pg_' || srsubid || '_sync_' || srrelid || '_' || ctl.system_identifier) AS slotname + FROM pg_control_system() ctl, pg_subscription_rel r, pg_subscription s + WHERE r.srsubstate = 'f' AND s.oid = r.srsubid AND s.subfailover + ) UNION ( + SELECT s.oid AS subid, s.subslotname as slotname + FROM pg_subscription s + WHERE s.subfailover + )); + slots +------- + {sub1,sub2,sub3} +(1 row) + + + + + Next, check that the logical replication slots identified above exist on + the standby server and are ready for failover. + +test_standby=# SELECT slot_name, (synced AND NOT temporary AND conflict_reason IS NULL) AS failover_ready + FROM pg_replication_slots + WHERE slot_name IN ('sub1','sub2','sub3'); + slot_name | failover_ready +-------------+---------------- + sub1 | t + sub2 | t + sub3 | t +(3 rows) + + + + + + + + Confirm that the standby server is not lagging behind the subscribers. + This step can be skipped if + standby_slot_names + has been correctly configured. + + + + + Firstly, on the subscriber node, check the last replayed WAL. 
+ +test_sub=# SELECT + MAX(remote_lsn) AS remote_lsn_on_subscriber + FROM + (( + SELECT (CASE WHEN r.srsubstate = 'f' THEN pg_replication_origin_progress(CONCAT('pg_' || r.srsubid || '_' || r.srrelid), false) + WHEN r.srsubstate IN ('s', 'r') THEN r.srsublsn END) AS remote_lsn + FROM pg_subscription_rel r, pg_subscription s + WHERE r.srsubstate IN ('f', 's', 'r') AND s.oid = r.srsubid AND s.subfailover + ) UNION ( + SELECT pg_replication_origin_progress(CONCAT('pg_' || s.oid), false) AS remote_lsn + FROM pg_subscription s + WHERE s.subfailover + )); + remote_lsn_on_subscriber +-------------------------- + 0/3000388 + + + + + Next, on the standby server check that the last-received WAL location + is ahead of the replayed WAL location on the subscriber identified above. + If the above SQL result was NULL, it means the subscriber has not yet + replayed any WAL, so the standby server must be ahead of the + subscriber, and this step can be skipped. + +test_standby=# SELECT pg_last_wal_receive_lsn() >= '0/3000388'::pg_lsn AS failover_ready; + failover_ready +---------------- + t +(1 row) + + + + + + + + If the result (failover_ready) of both above steps is + true, existing subscriptions will be able to continue without data loss. + + + + Row Filters diff --git a/doc/src/sgml/logicaldecoding.sgml b/doc/src/sgml/logicaldecoding.sgml index edb511c065..f2a0b5fa7b 100644 --- a/doc/src/sgml/logicaldecoding.sgml +++ b/doc/src/sgml/logicaldecoding.sgml @@ -346,9 +346,27 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU pg_log_standby_snapshot function on the primary. + + + Replication slots persist across crashes and know nothing about the state + of their consumer(s). They will prevent removal of required resources + even when there is no connection using them. This consumes storage + because neither required WAL nor required rows from the system catalogs + can be removed by VACUUM as long as they are required by a replication + slot. 
In extreme cases this could cause the database to shut down to prevent + transaction ID wraparound (see ). + So if a slot is no longer required it should be dropped. + + + + + + + Replication Slots Synchronization A logical replication slot on the primary can be synchronized to the hot - standby by enabling the failover option during slot creation and setting + standby by enabling the failover option during slot + creation and setting on the standby. For the synchronization to work, it is mandatory to have a physical replication slot between the primary and the standby, and hot_standby_feedback must @@ -380,19 +398,6 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU It is recommended that subscriptions are first disabled before promoting the standby and are enabled back after altering the connection string. - - - - Replication slots persist across crashes and know nothing about the state - of their consumer(s). They will prevent removal of required resources - even when there is no connection using them. This consumes storage - because neither required WAL nor required rows from the system catalogs - can be removed by VACUUM as long as they are required by a replication - slot. In extreme cases this could cause the database to shut down to prevent - transaction ID wraparound (see ). - So if a slot is no longer required it should be dropped. - - -- 2.34.1