From d45b516bea3f80c5e9bf1fcd9fd2004de52d872e Mon Sep 17 00:00:00 2001 From: Hayato Kuroda Date: Mon, 16 Feb 2026 11:58:11 +0900 Subject: [PATCH v4] Stabilize 009_twophase.pl 009_twophase.pl performs a switchover several times, but sometimes the old primary could not follow the new primary, producing a log like the one below: ``` LOG: fetching timeline history file for timeline 2 from primary server LOG: started streaming WAL from primary at 0/03000000 on timeline 1 LOG: replication terminated by primary server DETAIL: End of WAL reached on timeline 1 at 0/03022070. LOG: new timeline 2 forked off current database system timeline 1 before current recovery point 0/030220B8 LOG: restarted WAL streaming at 0/03000000 on timeline 1 LOG: replication terminated by primary server ``` This issue could occur for two reasons. 1) The old primary shuts down before all changes are replicated. 2) The background writer on the old primary generates a RUNNING_XACTS record, and the node shuts down before sending it. This commit addresses both. Regarding the first issue, wait_for_replay_catchup() has been added to ensure that all changes are replicated. As for the second issue, the injection_points extension is used to suppress the generation of RUNNING_XACTS records. Because random failures could still occur without injection_points, the test is now skipped when injection points are not enabled in the build or the injection_points extension is not installed. Author: Hayato Kuroda Reviewed-by: Alexander Lakhin Reviewed-by: Michael Paquier --- src/test/recovery/t/009_twophase.pl | 54 +++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/src/test/recovery/t/009_twophase.pl b/src/test/recovery/t/009_twophase.pl index 879e493b5b8..6ec72370c70 100644 --- a/src/test/recovery/t/009_twophase.pl +++ b/src/test/recovery/t/009_twophase.pl @@ -12,6 +12,16 @@ use Test::More; my $psql_out = ''; my $psql_rc = ''; +# This test needs an injection point to avoid generating xl_running_xacts.
+# This test performs a switchover several times; if the record is +# generated on the old primary between switchovers, it might be put only on +# the old timeline of the WAL. Hence, the old primary cannot start following +# the new primary in this case. +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + sub configure_and_reload { local $Test::Builder::Level = $Test::Builder::Level + 1; @@ -30,24 +40,35 @@ sub configure_and_reload # Set up two nodes, which will alternately be primary and replication standby. -# Setup london node +# Setup london node. +# +# Here, we avoid both checkpoint and autovacuum during the test. Otherwise, WAL +# records can be generated between switchovers, and the old primary cannot +# start following the new primary. my $node_london = PostgreSQL::Test::Cluster->new("london"); $node_london->init(allows_streaming => 1); $node_london->append_conf( 'postgresql.conf', qq( max_prepared_transactions = 10 log_checkpoints = true + autovacuum = off + checkpoint_timeout = 1h )); $node_london->start; $node_london->backup('london_backup'); -# Setup paris node +# Setup paris node. +# +# Avoid both checkpoint and autovacuum to stabilize the test. See the comments +# atop the london node setup. my $node_paris = PostgreSQL::Test::Cluster->new('paris'); $node_paris->init_from_backup($node_london, 'london_backup', has_streaming => 1); $node_paris->append_conf( 'postgresql.conf', qq( subtransaction_buffers = 32 + autovacuum = off + checkpoint_timeout = 1h )); $node_paris->start; @@ -63,6 +84,16 @@ my $cur_primary_name = $cur_primary->name; # Create table we'll use in the test transactions $cur_primary->psql('postgres', "CREATE TABLE t_009_tbl (id int, msg text)"); +# Check if the extension injection_points is available, as it may be +# possible that this script is run with installcheck, where the module +# would not be installed by default.
+if (!$cur_primary->check_extension('injection_points')) +{ + plan skip_all => 'Extension injection_points not installed'; +} + +$cur_primary->safe_psql('postgres', 'CREATE EXTENSION injection_points;'); + ############################################################################### # Check that we can commit and abort transaction after soft restart. # Here checkpoint happens before shutdown and no WAL replay will occur at next @@ -158,6 +189,12 @@ $cur_primary->psql( COMMIT PREPARED 'xact_009_6';"); $cur_primary->teardown_node; $cur_primary->start; + +# Attach an injection point to avoid seeing xl_running_xacts records. See +# comments at the beginning of the file. +$cur_primary->safe_psql('postgres', + "SELECT injection_points_attach('skip-log-running-xacts', 'error');"); + $psql_rc = $cur_primary->psql( 'postgres', " BEGIN; @@ -222,6 +259,7 @@ $cur_primary->psql( SAVEPOINT s1; INSERT INTO t_009_tbl VALUES (22, 'issued to ${cur_primary_name}'); PREPARE TRANSACTION 'xact_009_10';"); +$cur_primary->wait_for_replay_catchup($cur_standby); $cur_primary->teardown_node; $cur_standby->promote; @@ -230,6 +268,11 @@ note "Now paris is primary and london is standby"; ($cur_primary, $cur_standby) = ($node_paris, $node_london); $cur_primary_name = $cur_primary->name; +# Attach an injection point to avoid seeing xl_running_xacts records. See +# comments at the beginning of the file. 
+$cur_primary->safe_psql('postgres', + "SELECT injection_points_attach('skip-log-running-xacts', 'error');"); + # because london is not running at this point, we can't use syncrep commit # on this command $psql_rc = $cur_primary->psql('postgres', @@ -254,6 +297,7 @@ $cur_primary->psql( SAVEPOINT s1; INSERT INTO t_009_tbl VALUES (24, 'issued to ${cur_primary_name}'); PREPARE TRANSACTION 'xact_009_11';"); +$cur_primary->wait_for_replay_catchup($cur_standby); $cur_primary->stop; $cur_standby->restart; $cur_standby->promote; @@ -263,6 +307,11 @@ note "Now london is primary and paris is standby"; ($cur_primary, $cur_standby) = ($node_london, $node_paris); $cur_primary_name = $cur_primary->name; +# Attach an injection point to avoid seeing xl_running_xacts records. See +# comments at the beginning of the file. +$cur_primary->safe_psql('postgres', + "SELECT injection_points_attach('skip-log-running-xacts', 'error');"); + $cur_primary->psql( 'postgres', "SELECT count(*) FROM pg_prepared_xacts", @@ -289,6 +338,7 @@ $cur_primary->psql( INSERT INTO t_009_tbl VALUES (26, 'issued to ${cur_primary_name}'); PREPARE TRANSACTION 'xact_009_12'; "); +$cur_primary->wait_for_replay_catchup($cur_standby); $cur_primary->stop; $cur_standby->teardown_node; $cur_standby->start; -- 2.47.3