Re: Isn't wait_for_catchup slightly broken? - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Isn't wait_for_catchup slightly broken?
Date
Msg-id 3612314.1642287482@sss.pgh.pa.us
Whole thread Raw
In response to Isn't wait_for_catchup slightly broken?  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: Isn't wait_for_catchup slightly broken?
List pgsql-hackers
I wrote:
> Another thing that is bothering me a bit is that a number of the
> callers use $node->lsn('insert') as the target.  This also seems
> rather dubious, because that could be ahead of what's been written
> out.  These callers are just taking it on faith that something will
> eventually cause that extra WAL to get written out (and become
> available to the standby).  Again, that seems to make the test
> slower than it need be, with a worst-case scenario being that it
> eventually times out.  Admittedly this is unlikely to be a big
> problem unless some background op issues an abortive transaction
> at just the wrong time.  Nonetheless, I wonder if we shouldn't
> standardize on "thou shalt use the write position", because I
> don't think the other alternatives have anything to recommend them.

Here's a version that makes sure that callers specify a write position not
an insert position.  I also simplified the callers wherever it turned
out that they could just use the default parameters.

            regards, tom lane

diff --git a/src/bin/pg_basebackup/t/020_pg_receivewal.pl b/src/bin/pg_basebackup/t/020_pg_receivewal.pl
index b616c0ccf1..98dbdab595 100644
--- a/src/bin/pg_basebackup/t/020_pg_receivewal.pl
+++ b/src/bin/pg_basebackup/t/020_pg_receivewal.pl
@@ -290,7 +290,7 @@ $standby->psql(
     "CREATE_REPLICATION_SLOT $archive_slot PHYSICAL (RESERVE_WAL)",
     replication => 1);
 # Wait for standby catchup
-$primary->wait_for_catchup($standby, 'replay', $primary->lsn('write'));
+$primary->wait_for_catchup($standby);
 # Get a walfilename from before the promotion to make sure it is archived
 # after promotion
 my $standby_slot = $standby->slot($archive_slot);
diff --git a/src/bin/pg_rewind/t/007_standby_source.pl b/src/bin/pg_rewind/t/007_standby_source.pl
index 62254344f6..239b510c1a 100644
--- a/src/bin/pg_rewind/t/007_standby_source.pl
+++ b/src/bin/pg_rewind/t/007_standby_source.pl
@@ -74,7 +74,7 @@ $node_a->safe_psql('postgres',
     "INSERT INTO tbl1 values ('in A, before promotion')");
 $node_a->safe_psql('postgres', 'CHECKPOINT');

-my $lsn = $node_a->lsn('insert');
+my $lsn = $node_a->lsn('write');
 $node_a->wait_for_catchup('node_b', 'write', $lsn);
 $node_b->wait_for_catchup('node_c', 'write', $lsn);

@@ -93,8 +93,7 @@ $node_a->safe_psql('postgres',
     "INSERT INTO tbl1 VALUES ('in A, after C was promoted')");

 # make sure it's replicated to B before we continue
-$lsn = $node_a->lsn('insert');
-$node_a->wait_for_catchup('node_b', 'replay', $lsn);
+$node_a->wait_for_catchup('node_b');

 # Also insert a new row in the standby, which won't be present in the
 # old primary.
@@ -161,7 +160,7 @@ in A, after C was promoted
 $node_a->safe_psql('postgres',
     "INSERT INTO tbl1 values ('in A, after rewind')");

-$lsn = $node_a->lsn('insert');
+$lsn = $node_a->lsn('write');
 $node_b->wait_for_catchup('node_c', 'replay', $lsn);

 check_query(
diff --git a/src/bin/pg_rewind/t/008_min_recovery_point.pl b/src/bin/pg_rewind/t/008_min_recovery_point.pl
index 9ad7e6b62b..8240229230 100644
--- a/src/bin/pg_rewind/t/008_min_recovery_point.pl
+++ b/src/bin/pg_rewind/t/008_min_recovery_point.pl
@@ -69,8 +69,7 @@ $node_3->init_from_backup($node_1, $backup_name, has_streaming => 1);
 $node_3->start;

 # Wait until node 3 has connected and caught up
-my $lsn = $node_1->lsn('insert');
-$node_1->wait_for_catchup('node_3', 'replay', $lsn);
+$node_1->wait_for_catchup('node_3');

 #
 # Swap the roles of node_1 and node_3, so that node_1 follows node_3.
@@ -106,8 +105,7 @@ $node_2->restart();
 #

 # make sure node_1 is full caught up with node_3 first
-$lsn = $node_3->lsn('insert');
-$node_3->wait_for_catchup('node_1', 'replay', $lsn);
+$node_3->wait_for_catchup('node_1');

 $node_1->promote;
 # Force a checkpoint after promotion, like earlier.
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index e18f27276c..eceb8b2d8b 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -2509,8 +2509,11 @@ poll_query_until timeout.

 Requires that the 'postgres' db exists and is accessible.

-target_lsn may be any arbitrary lsn, but is typically $primary_node->lsn('insert').
-If omitted, pg_current_wal_lsn() is used.
+The default value of target_lsn is $node->lsn('write'), which ensures
+that the standby has caught up to what has been committed on the primary.
+If you pass an explicit value of target_lsn, it should almost always be
+the primary's write LSN; so this parameter is seldom needed except when
+querying some intermediate replication node rather than the primary.

 This is not a test. It die()s on failure.

@@ -2531,23 +2534,18 @@ sub wait_for_catchup
     {
         $standby_name = $standby_name->name;
     }
-    my $lsn_expr;
-    if (defined($target_lsn))
+    if (!defined($target_lsn))
     {
-        $lsn_expr = "'$target_lsn'";
-    }
-    else
-    {
-        $lsn_expr = 'pg_current_wal_lsn()';
+        $target_lsn = $self->lsn('write');
     }
     print "Waiting for replication conn "
       . $standby_name . "'s "
       . $mode
       . "_lsn to pass "
-      . $lsn_expr . " on "
+      . $target_lsn . " on "
       . $self->name . "\n";
     my $query =
-      qq[SELECT $lsn_expr <= ${mode}_lsn AND state = 'streaming' FROM pg_catalog.pg_stat_replication WHERE
application_name= '$standby_name';]; 
+      qq[SELECT '$target_lsn' <= ${mode}_lsn AND state = 'streaming' FROM pg_catalog.pg_stat_replication WHERE
application_name= '$standby_name';]; 
     $self->poll_query_until('postgres', $query)
       or croak "timed out waiting for catchup";
     print "done\n";
diff --git a/src/test/recovery/t/001_stream_rep.pl b/src/test/recovery/t/001_stream_rep.pl
index fb8f66878a..ca760c7210 100644
--- a/src/test/recovery/t/001_stream_rep.pl
+++ b/src/test/recovery/t/001_stream_rep.pl
@@ -47,10 +47,9 @@ $node_primary->safe_psql('postgres',
     "CREATE TABLE tab_int AS SELECT generate_series(1,1002) AS a");

 # Wait for standbys to catch up
-$node_primary->wait_for_catchup($node_standby_1, 'replay',
-    $node_primary->lsn('insert'));
-$node_standby_1->wait_for_catchup($node_standby_2, 'replay',
-    $node_standby_1->lsn('replay'));
+my $primary_lsn = $node_primary->lsn('write');
+$node_primary->wait_for_catchup($node_standby_1, 'replay', $primary_lsn);
+$node_standby_1->wait_for_catchup($node_standby_2, 'replay', $primary_lsn);

 my $result =
   $node_standby_1->safe_psql('postgres', "SELECT count(*) FROM tab_int");
@@ -67,10 +66,9 @@ $node_primary->safe_psql('postgres',
     "CREATE SEQUENCE seq1; SELECT nextval('seq1')");

 # Wait for standbys to catch up
-$node_primary->wait_for_catchup($node_standby_1, 'replay',
-    $node_primary->lsn('insert'));
-$node_standby_1->wait_for_catchup($node_standby_2, 'replay',
-    $node_standby_1->lsn('replay'));
+$primary_lsn = $node_primary->lsn('write');
+$node_primary->wait_for_catchup($node_standby_1, 'replay', $primary_lsn);
+$node_standby_1->wait_for_catchup($node_standby_2, 'replay', $primary_lsn);

 $result = $node_standby_1->safe_psql('postgres', "SELECT * FROM seq1");
 print "standby 1: $result\n";
@@ -374,10 +372,10 @@ sub replay_check
     my $newval = $node_primary->safe_psql('postgres',
         'INSERT INTO replayed(val) SELECT coalesce(max(val),0) + 1 AS newval FROM replayed RETURNING val'
     );
-    $node_primary->wait_for_catchup($node_standby_1, 'replay',
-        $node_primary->lsn('insert'));
-    $node_standby_1->wait_for_catchup($node_standby_2, 'replay',
-        $node_standby_1->lsn('replay'));
+    my $primary_lsn = $node_primary->lsn('write');
+    $node_primary->wait_for_catchup($node_standby_1, 'replay', $primary_lsn);
+    $node_standby_1->wait_for_catchup($node_standby_2, 'replay', $primary_lsn);
+
     $node_standby_1->safe_psql('postgres',
         qq[SELECT 1 FROM replayed WHERE val = $newval])
       or die "standby_1 didn't replay primary value $newval";
@@ -481,8 +479,7 @@ $node_standby_1->stop;
 my $newval = $node_primary->safe_psql('postgres',
     'INSERT INTO replayed(val) SELECT coalesce(max(val),0) + 1 AS newval FROM replayed RETURNING val'
 );
-$node_primary->wait_for_catchup($node_standby_2, 'replay',
-    $node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby_2);
 my $is_replayed = $node_standby_2->safe_psql('postgres',
     qq[SELECT 1 FROM replayed WHERE val = $newval]);
 is($is_replayed, qq(1), "standby_2 didn't replay primary value $newval");
diff --git a/src/test/recovery/t/004_timeline_switch.pl b/src/test/recovery/t/004_timeline_switch.pl
index 333d37b0a4..05b7931668 100644
--- a/src/test/recovery/t/004_timeline_switch.pl
+++ b/src/test/recovery/t/004_timeline_switch.pl
@@ -38,8 +38,7 @@ $node_primary->safe_psql('postgres',
     "CREATE TABLE tab_int AS SELECT generate_series(1,1000) AS a");

 # Wait until standby has replayed enough data on standby 1
-$node_primary->wait_for_catchup($node_standby_1, 'replay',
-    $node_primary->lsn('write'));
+$node_primary->wait_for_catchup($node_standby_1);

 # Stop and remove primary
 $node_primary->teardown_node;
@@ -64,8 +63,7 @@ $node_standby_2->restart;
 # to ensure that the timeline switch has been done.
 $node_standby_1->safe_psql('postgres',
     "INSERT INTO tab_int VALUES (generate_series(1001,2000))");
-$node_standby_1->wait_for_catchup($node_standby_2, 'replay',
-    $node_standby_1->lsn('write'));
+$node_standby_1->wait_for_catchup($node_standby_2);

 my $result =
   $node_standby_2->safe_psql('postgres', "SELECT count(*) FROM tab_int");
@@ -103,8 +101,7 @@ $node_primary_2->promote;
 $node_standby_3->start;
 $node_primary_2->safe_psql('postgres',
     "CREATE TABLE tab_int AS SELECT 1 AS a");
-$node_primary_2->wait_for_catchup($node_standby_3, 'replay',
-    $node_primary_2->lsn('write'));
+$node_primary_2->wait_for_catchup($node_standby_3);

 my $result_2 =
   $node_standby_3->safe_psql('postgres', "SELECT count(*) FROM tab_int");
diff --git a/src/test/recovery/t/010_logical_decoding_timelines.pl
b/src/test/recovery/t/010_logical_decoding_timelines.pl
index 648695928d..672cd74f3b 100644
--- a/src/test/recovery/t/010_logical_decoding_timelines.pl
+++ b/src/test/recovery/t/010_logical_decoding_timelines.pl
@@ -88,8 +88,7 @@ $node_replica->start;
 # db and associated slot.
 is($node_primary->psql('postgres', 'DROP DATABASE dropme'),
     0, 'dropped DB with logical slot OK on primary');
-$node_primary->wait_for_catchup($node_replica, 'replay',
-    $node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_replica);
 is( $node_replica->safe_psql(
         'postgres', q[SELECT 1 FROM pg_database WHERE datname = 'dropme']),
     '',
diff --git a/src/test/recovery/t/012_subtransactions.pl b/src/test/recovery/t/012_subtransactions.pl
index 150455b1be..2a558b2301 100644
--- a/src/test/recovery/t/012_subtransactions.pl
+++ b/src/test/recovery/t/012_subtransactions.pl
@@ -103,8 +103,7 @@ $node_primary->psql(
     BEGIN;
     SELECT hs_subxids(127);
     COMMIT;");
-$node_primary->wait_for_catchup($node_standby, 'replay',
-    $node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);
 $node_standby->psql(
     'postgres',
     "SELECT coalesce(sum(id),-1) FROM t_012_tbl",
@@ -150,8 +149,7 @@ $node_primary->psql(
     BEGIN;
     SELECT hs_subxids(127);
     PREPARE TRANSACTION 'xact_012_1';");
-$node_primary->wait_for_catchup($node_standby, 'replay',
-    $node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);
 $node_standby->psql(
     'postgres',
     "SELECT coalesce(sum(id),-1) FROM t_012_tbl",
@@ -187,8 +185,7 @@ $node_primary->psql(
     BEGIN;
     SELECT hs_subxids(201);
     PREPARE TRANSACTION 'xact_012_1';");
-$node_primary->wait_for_catchup($node_standby, 'replay',
-    $node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);
 $node_standby->psql(
     'postgres',
     "SELECT coalesce(sum(id),-1) FROM t_012_tbl",
diff --git a/src/test/recovery/t/015_promotion_pages.pl b/src/test/recovery/t/015_promotion_pages.pl
index c4f9e5e0ce..65443df5fe 100644
--- a/src/test/recovery/t/015_promotion_pages.pl
+++ b/src/test/recovery/t/015_promotion_pages.pl
@@ -44,7 +44,7 @@ $alpha->safe_psql('postgres', 'checkpoint');
 # problematic WAL records.
 $alpha->safe_psql('postgres', 'vacuum verbose test1');
 # Wait for last record to have been replayed on the standby.
-$alpha->wait_for_catchup($bravo, 'replay', $alpha->lsn('insert'));
+$alpha->wait_for_catchup($bravo);

 # Now force a checkpoint on the standby. This seems unnecessary but for "some"
 # reason, the previous checkpoint on the primary does not reflect on the standby
@@ -60,7 +60,7 @@ $alpha->safe_psql('postgres',
 $alpha->safe_psql('postgres', 'truncate test2');

 # Wait again for all records to be replayed.
-$alpha->wait_for_catchup($bravo, 'replay', $alpha->lsn('insert'));
+$alpha->wait_for_catchup($bravo);

 # Do the promotion, which reinitializes minRecoveryPoint in the control
 # file so as WAL is replayed up to the end.
diff --git a/src/test/recovery/t/016_min_consistency.pl b/src/test/recovery/t/016_min_consistency.pl
index 22e553510b..86fd6f5546 100644
--- a/src/test/recovery/t/016_min_consistency.pl
+++ b/src/test/recovery/t/016_min_consistency.pl
@@ -78,7 +78,7 @@ $primary->safe_psql('postgres', 'CHECKPOINT;');
 $primary->safe_psql('postgres', 'UPDATE test1 SET a = a + 1;');

 # Wait for last record to have been replayed on the standby.
-$primary->wait_for_catchup($standby, 'replay', $primary->lsn('insert'));
+$primary->wait_for_catchup($standby);

 # Fill in the standby's shared buffers with the data filled in
 # previously.
@@ -99,7 +99,7 @@ my $relfilenode = $primary->safe_psql('postgres',
     "SELECT pg_relation_filepath('test1'::regclass);");

 # Wait for last record to have been replayed on the standby.
-$primary->wait_for_catchup($standby, 'replay', $primary->lsn('insert'));
+$primary->wait_for_catchup($standby);

 # Issue a restart point on the standby now, which makes the checkpointer
 # update minRecoveryPoint.
diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl
index 606399bb6f..47cfbdd646 100644
--- a/src/test/recovery/t/019_replslot_limit.pl
+++ b/src/test/recovery/t/019_replslot_limit.pl
@@ -49,8 +49,7 @@ $node_standby->append_conf('postgresql.conf', "primary_slot_name = 'rep1'");
 $node_standby->start;

 # Wait until standby has replayed enough data
-my $start_lsn = $node_primary->lsn('write');
-$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn);
+$node_primary->wait_for_catchup($node_standby);

 # Stop standby
 $node_standby->stop;
@@ -84,8 +83,7 @@ is($result, "reserved|t", 'check that slot is working');
 # The standby can reconnect to primary
 $node_standby->start;

-$start_lsn = $node_primary->lsn('write');
-$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn);
+$node_primary->wait_for_catchup($node_standby);

 $node_standby->stop;

@@ -115,8 +113,7 @@ is($result, "reserved",

 # The standby can reconnect to primary
 $node_standby->start;
-$start_lsn = $node_primary->lsn('write');
-$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn);
+$node_primary->wait_for_catchup($node_standby);
 $node_standby->stop;

 # wal_keep_size overrides max_slot_wal_keep_size
@@ -135,8 +132,7 @@ $result = $node_primary->safe_psql('postgres',

 # The standby can reconnect to primary
 $node_standby->start;
-$start_lsn = $node_primary->lsn('write');
-$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn);
+$node_primary->wait_for_catchup($node_standby);
 $node_standby->stop;

 # Advance WAL again without checkpoint, reducing remain by 6 MB.
@@ -163,8 +159,7 @@ is($result, "unreserved|t",
 # The standby still can connect to primary before a checkpoint
 $node_standby->start;

-$start_lsn = $node_primary->lsn('write');
-$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn);
+$node_primary->wait_for_catchup($node_standby);

 $node_standby->stop;

@@ -334,7 +329,7 @@ $node_standby3->init_from_backup($node_primary3, $backup_name,
     has_streaming => 1);
 $node_standby3->append_conf('postgresql.conf', "primary_slot_name = 'rep3'");
 $node_standby3->start;
-$node_primary3->wait_for_catchup($node_standby3->name, 'replay');
+$node_primary3->wait_for_catchup($node_standby3->name);
 my $senderpid = $node_primary3->safe_psql('postgres',
     "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
 like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
diff --git a/src/test/recovery/t/021_row_visibility.pl b/src/test/recovery/t/021_row_visibility.pl
index 4a62e47722..1ceb032750 100644
--- a/src/test/recovery/t/021_row_visibility.pl
+++ b/src/test/recovery/t/021_row_visibility.pl
@@ -76,8 +76,7 @@ ok( send_query_and_wait(
 #
 $node_primary->psql('postgres',
     "INSERT INTO test_visibility VALUES ('first insert')");
-$node_primary->wait_for_catchup($node_standby, 'replay',
-    $node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);

 ok( send_query_and_wait(
         \%psql_standby,
@@ -98,8 +97,7 @@ UPDATE test_visibility SET data = 'first update' RETURNING data;
     'UPDATE');

 $node_primary->psql('postgres', "SELECT txid_current();");  # ensure WAL flush
-$node_primary->wait_for_catchup($node_standby, 'replay',
-    $node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);

 ok( send_query_and_wait(
         \%psql_standby,
@@ -112,8 +110,7 @@ ok( send_query_and_wait(
 #
 ok(send_query_and_wait(\%psql_primary, q[COMMIT;], qr/^COMMIT$/m), 'COMMIT');

-$node_primary->wait_for_catchup($node_standby, 'replay',
-    $node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);

 ok( send_query_and_wait(
         \%psql_standby,
@@ -142,8 +139,7 @@ PREPARE TRANSACTION 'will_abort';
         qr/^PREPARE TRANSACTION$/m),
     'prepared will_abort');

-$node_primary->wait_for_catchup($node_standby, 'replay',
-    $node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);

 ok( send_query_and_wait(
         \%psql_standby,
@@ -154,8 +150,7 @@ ok( send_query_and_wait(
 # For some variation, finish prepared xacts via separate connections
 $node_primary->safe_psql('postgres', "COMMIT PREPARED 'will_commit';");
 $node_primary->safe_psql('postgres', "ROLLBACK PREPARED 'will_abort';");
-$node_primary->wait_for_catchup($node_standby, 'replay',
-    $node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);

 ok( send_query_and_wait(
         \%psql_standby,
diff --git a/src/test/recovery/t/025_stuck_on_old_timeline.pl b/src/test/recovery/t/025_stuck_on_old_timeline.pl
index 7997eecdd3..dc48f909c4 100644
--- a/src/test/recovery/t/025_stuck_on_old_timeline.pl
+++ b/src/test/recovery/t/025_stuck_on_old_timeline.pl
@@ -101,8 +101,7 @@ $node_cascade->start;
 $node_standby->safe_psql('postgres', "CREATE TABLE tab_int AS SELECT 1 AS a");

 # Wait for the replication to catch up
-$node_standby->wait_for_catchup($node_cascade, 'replay',
-    $node_standby->lsn('insert'));
+$node_standby->wait_for_catchup($node_cascade);

 # Check that cascading standby has the new content
 my $result =

pgsql-hackers by date:

Previous
From: Tomas Vondra
Date:
Subject: Re: sequences vs. synchronous replication
Next
From: Julien Rouhaud
Date:
Subject: Re: postgres_fdw: commit remote (sub)transactions in parallel during pre-commit