Re: Isn't wait_for_catchup slightly broken? - Mailing list pgsql-hackers
From | Tom Lane |
---|---|
Subject | Re: Isn't wait_for_catchup slightly broken? |
Date | |
Msg-id | 3612314.1642287482@sss.pgh.pa.us Whole thread Raw |
In response to | Isn't wait_for_catchup slightly broken? (Tom Lane <tgl@sss.pgh.pa.us>) |
Responses |
Re: Isn't wait_for_catchup slightly broken?
|
List | pgsql-hackers |
I wrote: > Another thing that is bothering me a bit is that a number of the > callers use $node->lsn('insert') as the target. This also seems > rather dubious, because that could be ahead of what's been written > out. These callers are just taking it on faith that something will > eventually cause that extra WAL to get written out (and become > available to the standby). Again, that seems to make the test > slower than it need be, with a worst-case scenario being that it > eventually times out. Admittedly this is unlikely to be a big > problem unless some background op issues an abortive transaction > at just the wrong time. Nonetheless, I wonder if we shouldn't > standardize on "thou shalt use the write position", because I > don't think the other alternatives have anything to recommend them. Here's a version that makes sure that callers specify a write position not an insert position. I also simplified the callers wherever it turned out that they could just use the default parameters. regards, tom lane diff --git a/src/bin/pg_basebackup/t/020_pg_receivewal.pl b/src/bin/pg_basebackup/t/020_pg_receivewal.pl index b616c0ccf1..98dbdab595 100644 --- a/src/bin/pg_basebackup/t/020_pg_receivewal.pl +++ b/src/bin/pg_basebackup/t/020_pg_receivewal.pl @@ -290,7 +290,7 @@ $standby->psql( "CREATE_REPLICATION_SLOT $archive_slot PHYSICAL (RESERVE_WAL)", replication => 1); # Wait for standby catchup -$primary->wait_for_catchup($standby, 'replay', $primary->lsn('write')); +$primary->wait_for_catchup($standby); # Get a walfilename from before the promotion to make sure it is archived # after promotion my $standby_slot = $standby->slot($archive_slot); diff --git a/src/bin/pg_rewind/t/007_standby_source.pl b/src/bin/pg_rewind/t/007_standby_source.pl index 62254344f6..239b510c1a 100644 --- a/src/bin/pg_rewind/t/007_standby_source.pl +++ b/src/bin/pg_rewind/t/007_standby_source.pl @@ -74,7 +74,7 @@ $node_a->safe_psql('postgres', "INSERT INTO tbl1 values ('in A, before promotion')"); $node_a->safe_psql('postgres', 'CHECKPOINT'); -my $lsn = $node_a->lsn('insert'); +my $lsn = $node_a->lsn('write'); $node_a->wait_for_catchup('node_b', 'write', $lsn); $node_b->wait_for_catchup('node_c', 'write', $lsn); @@ -93,8 +93,7 @@ $node_a->safe_psql('postgres', "INSERT INTO tbl1 VALUES ('in A, after C was promoted')"); # make sure it's replicated to B before we continue -$lsn = $node_a->lsn('insert'); -$node_a->wait_for_catchup('node_b', 'replay', $lsn); +$node_a->wait_for_catchup('node_b'); # Also insert a new row in the standby, which won't be present in the # old primary. @@ -161,7 +160,7 @@ in A, after C was promoted $node_a->safe_psql('postgres', "INSERT INTO tbl1 values ('in A, after rewind')"); -$lsn = $node_a->lsn('insert'); +$lsn = $node_a->lsn('write'); $node_b->wait_for_catchup('node_c', 'replay', $lsn); check_query( diff --git a/src/bin/pg_rewind/t/008_min_recovery_point.pl b/src/bin/pg_rewind/t/008_min_recovery_point.pl index 9ad7e6b62b..8240229230 100644 --- a/src/bin/pg_rewind/t/008_min_recovery_point.pl +++ b/src/bin/pg_rewind/t/008_min_recovery_point.pl @@ -69,8 +69,7 @@ $node_3->init_from_backup($node_1, $backup_name, has_streaming => 1); $node_3->start; # Wait until node 3 has connected and caught up -my $lsn = $node_1->lsn('insert'); -$node_1->wait_for_catchup('node_3', 'replay', $lsn); +$node_1->wait_for_catchup('node_3'); # # Swap the roles of node_1 and node_3, so that node_1 follows node_3. @@ -106,8 +105,7 @@ $node_2->restart(); # # make sure node_1 is full caught up with node_3 first -$lsn = $node_3->lsn('insert'); -$node_3->wait_for_catchup('node_1', 'replay', $lsn); +$node_3->wait_for_catchup('node_1'); $node_1->promote; # Force a checkpoint after promotion, like earlier. diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index e18f27276c..eceb8b2d8b 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -2509,8 +2509,11 @@ poll_query_until timeout. Requires that the 'postgres' db exists and is accessible. -target_lsn may be any arbitrary lsn, but is typically $primary_node->lsn('insert'). -If omitted, pg_current_wal_lsn() is used. +The default value of target_lsn is $node->lsn('write'), which ensures +that the standby has caught up to what has been committed on the primary. +If you pass an explicit value of target_lsn, it should almost always be +the primary's write LSN; so this parameter is seldom needed except when +querying some intermediate replication node rather than the primary. This is not a test. It die()s on failure. @@ -2531,23 +2534,18 @@ sub wait_for_catchup { $standby_name = $standby_name->name; } - my $lsn_expr; - if (defined($target_lsn)) + if (!defined($target_lsn)) { - $lsn_expr = "'$target_lsn'"; - } - else - { - $lsn_expr = 'pg_current_wal_lsn()'; + $target_lsn = $self->lsn('write'); } print "Waiting for replication conn " . $standby_name . "'s " . $mode . "_lsn to pass " - . $lsn_expr . " on " + . $target_lsn . " on " . $self->name . "\n"; my $query = - qq[SELECT $lsn_expr <= ${mode}_lsn AND state = 'streaming' FROM pg_catalog.pg_stat_replication WHERE application_name= '$standby_name';]; + qq[SELECT '$target_lsn' <= ${mode}_lsn AND state = 'streaming' FROM pg_catalog.pg_stat_replication WHERE application_name= '$standby_name';]; $self->poll_query_until('postgres', $query) or croak "timed out waiting for catchup"; print "done\n"; diff --git a/src/test/recovery/t/001_stream_rep.pl b/src/test/recovery/t/001_stream_rep.pl index fb8f66878a..ca760c7210 100644 --- a/src/test/recovery/t/001_stream_rep.pl +++ b/src/test/recovery/t/001_stream_rep.pl @@ -47,10 +47,9 @@ $node_primary->safe_psql('postgres', "CREATE TABLE tab_int AS SELECT generate_series(1,1002) AS a"); # Wait for standbys to catch up -$node_primary->wait_for_catchup($node_standby_1, 'replay', - $node_primary->lsn('insert')); -$node_standby_1->wait_for_catchup($node_standby_2, 'replay', - $node_standby_1->lsn('replay')); +my $primary_lsn = $node_primary->lsn('write'); +$node_primary->wait_for_catchup($node_standby_1, 'replay', $primary_lsn); +$node_standby_1->wait_for_catchup($node_standby_2, 'replay', $primary_lsn); my $result = $node_standby_1->safe_psql('postgres', "SELECT count(*) FROM tab_int"); @@ -67,10 +66,9 @@ $node_primary->safe_psql('postgres', "CREATE SEQUENCE seq1; SELECT nextval('seq1')"); # Wait for standbys to catch up -$node_primary->wait_for_catchup($node_standby_1, 'replay', - $node_primary->lsn('insert')); -$node_standby_1->wait_for_catchup($node_standby_2, 'replay', - $node_standby_1->lsn('replay')); +$primary_lsn = $node_primary->lsn('write'); +$node_primary->wait_for_catchup($node_standby_1, 'replay', $primary_lsn); +$node_standby_1->wait_for_catchup($node_standby_2, 'replay', $primary_lsn); $result = $node_standby_1->safe_psql('postgres', "SELECT * FROM seq1"); print "standby 1: $result\n"; @@ -374,10 +372,10 @@ sub replay_check my $newval = $node_primary->safe_psql('postgres', 'INSERT INTO replayed(val) SELECT coalesce(max(val),0) + 1 AS newval FROM replayed RETURNING val' ); - $node_primary->wait_for_catchup($node_standby_1, 'replay', - $node_primary->lsn('insert')); - $node_standby_1->wait_for_catchup($node_standby_2, 'replay', - $node_standby_1->lsn('replay')); + my $primary_lsn = $node_primary->lsn('write'); + $node_primary->wait_for_catchup($node_standby_1, 'replay', $primary_lsn); + $node_standby_1->wait_for_catchup($node_standby_2, 'replay', $primary_lsn); + $node_standby_1->safe_psql('postgres', qq[SELECT 1 FROM replayed WHERE val = $newval]) or die "standby_1 didn't replay primary value $newval"; @@ -481,8 +479,7 @@ $node_standby_1->stop; my $newval = $node_primary->safe_psql('postgres', 'INSERT INTO replayed(val) SELECT coalesce(max(val),0) + 1 AS newval FROM replayed RETURNING val' ); -$node_primary->wait_for_catchup($node_standby_2, 'replay', - $node_primary->lsn('insert')); +$node_primary->wait_for_catchup($node_standby_2); my $is_replayed = $node_standby_2->safe_psql('postgres', qq[SELECT 1 FROM replayed WHERE val = $newval]); is($is_replayed, qq(1), "standby_2 didn't replay primary value $newval"); diff --git a/src/test/recovery/t/004_timeline_switch.pl b/src/test/recovery/t/004_timeline_switch.pl index 333d37b0a4..05b7931668 100644 --- a/src/test/recovery/t/004_timeline_switch.pl +++ b/src/test/recovery/t/004_timeline_switch.pl @@ -38,8 +38,7 @@ $node_primary->safe_psql('postgres', "CREATE TABLE tab_int AS SELECT generate_series(1,1000) AS a"); # Wait until standby has replayed enough data on standby 1 -$node_primary->wait_for_catchup($node_standby_1, 'replay', - $node_primary->lsn('write')); +$node_primary->wait_for_catchup($node_standby_1); # Stop and remove primary $node_primary->teardown_node; @@ -64,8 +63,7 @@ $node_standby_2->restart; # to ensure that the timeline switch has been done. $node_standby_1->safe_psql('postgres', "INSERT INTO tab_int VALUES (generate_series(1001,2000))"); -$node_standby_1->wait_for_catchup($node_standby_2, 'replay', - $node_standby_1->lsn('write')); +$node_standby_1->wait_for_catchup($node_standby_2); my $result = $node_standby_2->safe_psql('postgres', "SELECT count(*) FROM tab_int"); @@ -103,8 +101,7 @@ $node_primary_2->promote; $node_standby_3->start; $node_primary_2->safe_psql('postgres', "CREATE TABLE tab_int AS SELECT 1 AS a"); -$node_primary_2->wait_for_catchup($node_standby_3, 'replay', - $node_primary_2->lsn('write')); +$node_primary_2->wait_for_catchup($node_standby_3); my $result_2 = $node_standby_3->safe_psql('postgres', "SELECT count(*) FROM tab_int"); diff --git a/src/test/recovery/t/010_logical_decoding_timelines.pl b/src/test/recovery/t/010_logical_decoding_timelines.pl index 648695928d..672cd74f3b 100644 --- a/src/test/recovery/t/010_logical_decoding_timelines.pl +++ b/src/test/recovery/t/010_logical_decoding_timelines.pl @@ -88,8 +88,7 @@ $node_replica->start; # db and associated slot. is($node_primary->psql('postgres', 'DROP DATABASE dropme'), 0, 'dropped DB with logical slot OK on primary'); -$node_primary->wait_for_catchup($node_replica, 'replay', - $node_primary->lsn('insert')); +$node_primary->wait_for_catchup($node_replica); is( $node_replica->safe_psql( 'postgres', q[SELECT 1 FROM pg_database WHERE datname = 'dropme']), '', diff --git a/src/test/recovery/t/012_subtransactions.pl b/src/test/recovery/t/012_subtransactions.pl index 150455b1be..2a558b2301 100644 --- a/src/test/recovery/t/012_subtransactions.pl +++ b/src/test/recovery/t/012_subtransactions.pl @@ -103,8 +103,7 @@ $node_primary->psql( BEGIN; SELECT hs_subxids(127); COMMIT;"); -$node_primary->wait_for_catchup($node_standby, 'replay', - $node_primary->lsn('insert')); +$node_primary->wait_for_catchup($node_standby); $node_standby->psql( 'postgres', "SELECT coalesce(sum(id),-1) FROM t_012_tbl", @@ -150,8 +149,7 @@ $node_primary->psql( BEGIN; SELECT hs_subxids(127); PREPARE TRANSACTION 'xact_012_1';"); -$node_primary->wait_for_catchup($node_standby, 'replay', - $node_primary->lsn('insert')); +$node_primary->wait_for_catchup($node_standby); $node_standby->psql( 'postgres', "SELECT coalesce(sum(id),-1) FROM t_012_tbl", @@ -187,8 +185,7 @@ $node_primary->psql( BEGIN; SELECT hs_subxids(201); PREPARE TRANSACTION 'xact_012_1';"); -$node_primary->wait_for_catchup($node_standby, 'replay', - $node_primary->lsn('insert')); +$node_primary->wait_for_catchup($node_standby); $node_standby->psql( 'postgres', "SELECT coalesce(sum(id),-1) FROM t_012_tbl", diff --git a/src/test/recovery/t/015_promotion_pages.pl b/src/test/recovery/t/015_promotion_pages.pl index c4f9e5e0ce..65443df5fe 100644 --- a/src/test/recovery/t/015_promotion_pages.pl +++ b/src/test/recovery/t/015_promotion_pages.pl @@ -44,7 +44,7 @@ $alpha->safe_psql('postgres', 'checkpoint'); # problematic WAL records. $alpha->safe_psql('postgres', 'vacuum verbose test1'); # Wait for last record to have been replayed on the standby. -$alpha->wait_for_catchup($bravo, 'replay', $alpha->lsn('insert')); +$alpha->wait_for_catchup($bravo); # Now force a checkpoint on the standby. This seems unnecessary but for "some" # reason, the previous checkpoint on the primary does not reflect on the standby @@ -60,7 +60,7 @@ $alpha->safe_psql('postgres', $alpha->safe_psql('postgres', 'truncate test2'); # Wait again for all records to be replayed. -$alpha->wait_for_catchup($bravo, 'replay', $alpha->lsn('insert')); +$alpha->wait_for_catchup($bravo); # Do the promotion, which reinitializes minRecoveryPoint in the control # file so as WAL is replayed up to the end. diff --git a/src/test/recovery/t/016_min_consistency.pl b/src/test/recovery/t/016_min_consistency.pl index 22e553510b..86fd6f5546 100644 --- a/src/test/recovery/t/016_min_consistency.pl +++ b/src/test/recovery/t/016_min_consistency.pl @@ -78,7 +78,7 @@ $primary->safe_psql('postgres', 'CHECKPOINT;'); $primary->safe_psql('postgres', 'UPDATE test1 SET a = a + 1;'); # Wait for last record to have been replayed on the standby. -$primary->wait_for_catchup($standby, 'replay', $primary->lsn('insert')); +$primary->wait_for_catchup($standby); # Fill in the standby's shared buffers with the data filled in # previously. @@ -99,7 +99,7 @@ my $relfilenode = $primary->safe_psql('postgres', "SELECT pg_relation_filepath('test1'::regclass);"); # Wait for last record to have been replayed on the standby. -$primary->wait_for_catchup($standby, 'replay', $primary->lsn('insert')); +$primary->wait_for_catchup($standby); # Issue a restart point on the standby now, which makes the checkpointer # update minRecoveryPoint. diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl index 606399bb6f..47cfbdd646 100644 --- a/src/test/recovery/t/019_replslot_limit.pl +++ b/src/test/recovery/t/019_replslot_limit.pl @@ -49,8 +49,7 @@ $node_standby->append_conf('postgresql.conf', "primary_slot_name = 'rep1'"); $node_standby->start; # Wait until standby has replayed enough data -my $start_lsn = $node_primary->lsn('write'); -$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn); +$node_primary->wait_for_catchup($node_standby); # Stop standby $node_standby->stop; @@ -84,8 +83,7 @@ is($result, "reserved|t", 'check that slot is working'); # The standby can reconnect to primary $node_standby->start; -$start_lsn = $node_primary->lsn('write'); -$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn); +$node_primary->wait_for_catchup($node_standby); $node_standby->stop; @@ -115,8 +113,7 @@ is($result, "reserved", # The standby can reconnect to primary $node_standby->start; -$start_lsn = $node_primary->lsn('write'); -$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn); +$node_primary->wait_for_catchup($node_standby); $node_standby->stop; # wal_keep_size overrides max_slot_wal_keep_size @@ -135,8 +132,7 @@ $result = $node_primary->safe_psql('postgres', # The standby can reconnect to primary $node_standby->start; -$start_lsn = $node_primary->lsn('write'); -$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn); +$node_primary->wait_for_catchup($node_standby); $node_standby->stop; # Advance WAL again without checkpoint, reducing remain by 6 MB. @@ -163,8 +159,7 @@ is($result, "unreserved|t", # The standby still can connect to primary before a checkpoint $node_standby->start; -$start_lsn = $node_primary->lsn('write'); -$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn); +$node_primary->wait_for_catchup($node_standby); $node_standby->stop; @@ -334,7 +329,7 @@ $node_standby3->init_from_backup($node_primary3, $backup_name, has_streaming => 1); $node_standby3->append_conf('postgresql.conf', "primary_slot_name = 'rep3'"); $node_standby3->start; -$node_primary3->wait_for_catchup($node_standby3->name, 'replay'); +$node_primary3->wait_for_catchup($node_standby3->name); my $senderpid = $node_primary3->safe_psql('postgres', "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'"); like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"); diff --git a/src/test/recovery/t/021_row_visibility.pl b/src/test/recovery/t/021_row_visibility.pl index 4a62e47722..1ceb032750 100644 --- a/src/test/recovery/t/021_row_visibility.pl +++ b/src/test/recovery/t/021_row_visibility.pl @@ -76,8 +76,7 @@ ok( send_query_and_wait( # $node_primary->psql('postgres', "INSERT INTO test_visibility VALUES ('first insert')"); -$node_primary->wait_for_catchup($node_standby, 'replay', - $node_primary->lsn('insert')); +$node_primary->wait_for_catchup($node_standby); ok( send_query_and_wait( \%psql_standby, @@ -98,8 +97,7 @@ UPDATE test_visibility SET data = 'first update' RETURNING data; 'UPDATE'); $node_primary->psql('postgres', "SELECT txid_current();"); # ensure WAL flush -$node_primary->wait_for_catchup($node_standby, 'replay', - $node_primary->lsn('insert')); +$node_primary->wait_for_catchup($node_standby); ok( send_query_and_wait( \%psql_standby, @@ -112,8 +110,7 @@ ok( send_query_and_wait( # ok(send_query_and_wait(\%psql_primary, q[COMMIT;], qr/^COMMIT$/m), 'COMMIT'); -$node_primary->wait_for_catchup($node_standby, 'replay', - $node_primary->lsn('insert')); +$node_primary->wait_for_catchup($node_standby); ok( send_query_and_wait( \%psql_standby, @@ -142,8 +139,7 @@ PREPARE TRANSACTION 'will_abort'; qr/^PREPARE TRANSACTION$/m), 'prepared will_abort'); -$node_primary->wait_for_catchup($node_standby, 'replay', - $node_primary->lsn('insert')); +$node_primary->wait_for_catchup($node_standby); ok( send_query_and_wait( \%psql_standby, @@ -154,8 +150,7 @@ ok( send_query_and_wait( # For some variation, finish prepared xacts via separate connections $node_primary->safe_psql('postgres', "COMMIT PREPARED 'will_commit';"); $node_primary->safe_psql('postgres', "ROLLBACK PREPARED 'will_abort';"); -$node_primary->wait_for_catchup($node_standby, 'replay', - $node_primary->lsn('insert')); +$node_primary->wait_for_catchup($node_standby); ok( send_query_and_wait( \%psql_standby, diff --git a/src/test/recovery/t/025_stuck_on_old_timeline.pl b/src/test/recovery/t/025_stuck_on_old_timeline.pl index 7997eecdd3..dc48f909c4 100644 --- a/src/test/recovery/t/025_stuck_on_old_timeline.pl +++ b/src/test/recovery/t/025_stuck_on_old_timeline.pl @@ -101,8 +101,7 @@ $node_cascade->start; $node_standby->safe_psql('postgres', "CREATE TABLE tab_int AS SELECT 1 AS a"); # Wait for the replication to catch up -$node_standby->wait_for_catchup($node_cascade, 'replay', - $node_standby->lsn('insert')); +$node_standby->wait_for_catchup($node_cascade); # Check that cascading standby has the new content my $result =
pgsql-hackers by date: