From c87f9d7eaf2720944d625cb529b6cc355fec7771 Mon Sep 17 00:00:00 2001 From: Jimmy Yih Date: Wed, 9 Aug 2023 16:50:04 -0700 Subject: [PATCH] Allow recovery to proceed when initial timeline history file is missing For WAL archive recovery, setting recovery_target_timeline to 'current' or 'latest' only outputs a WARNING if the initial timeline history file cannot be found/retrieved, and recovery then proceeds without any issues. However, setting recovery_target_timeline explicitly to the timeline id currently in the control file (similar to what 'current' does) will result in a FATAL when the initial timeline history file cannot be found/retrieved. Since 'current' and 'latest' work fine, we should also not FATAL when the timeline history file cannot be found/retrieved and recovery_target_timeline is explicitly set to the same timeline id as the one in the control file. For WAL streaming, the standby's WAL receiver will FATAL and loop trying to retrieve the initial timeline history file from the primary (or from the upstream standby in the case of cascading). However, that file doesn't seem to be required if the above WAL archive recovery claims are valid. To align with the same logic, we should also not FATAL when the WAL receiver cannot find/retrieve the initial timeline history file. 
--- src/backend/access/transam/xlogrecovery.c | 2 +- .../libpqwalreceiver/libpqwalreceiver.c | 26 +++- src/backend/replication/walreceiver.c | 12 +- src/include/replication/walreceiver.h | 7 +- ...andbys_with_no_initial_timeline_history.pl | 136 ++++++++++++++++++ 5 files changed, 171 insertions(+), 12 deletions(-) create mode 100644 src/test/recovery/t/038_standbys_with_no_initial_timeline_history.pl diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index becc2bda62..b223a82cfb 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1124,7 +1124,7 @@ validateRecoveryParameters(void) TimeLineID rtli = recoveryTargetTLIRequested; /* Timeline 1 does not have a history file, all else should */ - if (rtli != 1 && !existsTimeLineHistory(rtli)) + if (rtli != 1 && !existsTimeLineHistory(rtli) && rtli != recoveryTargetTLI) ereport(FATAL, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("recovery target timeline %u does not exist", diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c index 60d5c1fc40..ba12ed3911 100644 --- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -61,7 +61,7 @@ static char *libpqrcv_identify_system(WalReceiverConn *conn, static int libpqrcv_server_version(WalReceiverConn *conn); static void libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, TimeLineID tli, char **filename, - char **content, int *len); + char **content, int *len, bool missing_ok); static bool libpqrcv_startstreaming(WalReceiverConn *conn, const WalRcvStreamOptions *options); static void libpqrcv_endstreaming(WalReceiverConn *conn, @@ -603,7 +603,7 @@ libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli) static void libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, TimeLineID tli, char **filename, - 
char **content, int *len) + char **content, int *len, bool missing_ok) { PGresult *res; char cmd[64]; @@ -618,11 +618,23 @@ libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, if (PQresultStatus(res) != PGRES_TUPLES_OK) { PQclear(res); - ereport(ERROR, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("could not receive timeline history file from " - "the primary server: %s", - pchomp(PQerrorMessage(conn->streamConn))))); + + if (missing_ok) + { + *filename = NULL; + *content = NULL; + ereport(WARNING, + (errmsg("could not receive timeline history file from " + "the primary server: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + return; + } + else + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not receive timeline history file from " + "the primary server: %s", + pchomp(PQerrorMessage(conn->streamConn))))); } if (PQnfields(res) != 2 || PQntuples(res) != 1) { diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index feff709435..26368fe4f3 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -754,12 +754,22 @@ WalRcvFetchTimeLineHistoryFiles(TimeLineID first, TimeLineID last) char *content; int len; char expectedfname[MAXFNAMELEN]; + bool missing_ok; ereport(LOG, (errmsg("fetching timeline history file for timeline %u from primary server", tli))); - walrcv_readtimelinehistoryfile(wrconn, tli, &fname, &content, &len); + missing_ok = (tli == first); + walrcv_readtimelinehistoryfile(wrconn, tli, &fname, &content, &len, missing_ok); + + /* + * If the requested timeline id is the first one, we can overlook + * the timeline history file fetch error since it's not required + * to start the standby. 
+ */ + if (missing_ok && fname == NULL && content == NULL) + continue; /* * Check that the filename on the primary matches what we diff --git a/src/include/replication/walreceiver.h b/src/include/replication/walreceiver.h index 281626fa6f..197c82bb53 100644 --- a/src/include/replication/walreceiver.h +++ b/src/include/replication/walreceiver.h @@ -298,7 +298,8 @@ typedef void (*walrcv_readtimelinehistoryfile_fn) (WalReceiverConn *conn, TimeLineID tli, char **filename, char **content, - int *size); + int *size, + bool missing_ok); /* * walrcv_startstreaming_fn @@ -419,8 +420,8 @@ extern PGDLLIMPORT WalReceiverFunctionsType *WalReceiverFunctions; WalReceiverFunctions->walrcv_identify_system(conn, primary_tli) #define walrcv_server_version(conn) \ WalReceiverFunctions->walrcv_server_version(conn) -#define walrcv_readtimelinehistoryfile(conn, tli, filename, content, size) \ - WalReceiverFunctions->walrcv_readtimelinehistoryfile(conn, tli, filename, content, size) +#define walrcv_readtimelinehistoryfile(conn, tli, filename, content, size, missing_ok) \ + WalReceiverFunctions->walrcv_readtimelinehistoryfile(conn, tli, filename, content, size, missing_ok) #define walrcv_startstreaming(conn, options) \ WalReceiverFunctions->walrcv_startstreaming(conn, options) #define walrcv_endstreaming(conn, next_tli) \ diff --git a/src/test/recovery/t/038_standbys_with_no_initial_timeline_history.pl b/src/test/recovery/t/038_standbys_with_no_initial_timeline_history.pl new file mode 100644 index 0000000000..9f2b33ed18 --- /dev/null +++ b/src/test/recovery/t/038_standbys_with_no_initial_timeline_history.pl @@ -0,0 +1,136 @@ +# Test that a standby can complete WAL archive recovery, and that a +# cascading standby can stream from it, when the history file of their +# initial timeline (timeline 2) cannot be found or retrieved. 
+ +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +$ENV{PGDATABASE} = 'postgres'; + +# Initialize primary node +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1); +$node_primary->start; + +# Take a backup +my $backup_name = 'my_backup_1'; +$node_primary->backup($backup_name); + +# Create a standby that will be promoted onto timeline 2 +my $node_primary_tli2 = PostgreSQL::Test::Cluster->new('primary_tli2'); +$node_primary_tli2->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_primary_tli2->start; + +# Stop and remove the primary; it's not needed anymore +$node_primary->teardown_node; + +# Promote the standby using "pg_promote", switching it to timeline 2 +my $psql_out = ''; +$node_primary_tli2->psql( + 'postgres', + "SELECT pg_promote(wait_seconds => 300);", + stdout => \$psql_out); +is($psql_out, 't', "promotion of standby with pg_promote"); + +# Enable archiving on the promoted node. +$node_primary_tli2->enable_archiving; +$node_primary_tli2->restart; + +# Check that the timeline 2 history file was not archived after +# enabling WAL archiving since timeline history files are only +# archived at the moment of switching timelines and not any time +# after. +my $primary_tli2_archive = $node_primary_tli2->archive_dir; +my $primary_tli2_datadir = $node_primary_tli2->data_dir; +ok(-f "$primary_tli2_datadir/pg_wal/00000002.history", + 'timeline 2 history file was created'); +ok(! -f "$primary_tli2_datadir/pg_wal/archive_status/00000002.history.ready", + 'timeline 2 history file was not marked for WAL archiving'); +ok(! -f "$primary_tli2_datadir/pg_wal/archive_status/00000002.history.done", + 'timeline 2 history file was not archived'); +ok(! 
-f "$primary_tli2_archive/00000002.history", + 'timeline 2 history file does not exist in the archive'); + +# Take a backup of node_primary_tli2 and use -Xnone so that pg_wal will +# be empty and restore will retrieve the necessary WAL and timeline +# history file(s) from the archive. +$backup_name = 'my_backup_2'; +$node_primary_tli2->backup($backup_name, backup_options => ['-Xnone']); + +# Create simple WAL that will be archived and restored +$node_primary_tli2->safe_psql('postgres', "CREATE TABLE tab_int AS SELECT 8 AS a;"); + +# Create a restore point to later use as the recovery_target_name +my $recovery_name = "my_target"; +$node_primary_tli2->safe_psql('postgres', + "SELECT pg_create_restore_point('$recovery_name');"); + +# Find the next WAL segment to be archived +my $walfile_to_be_archived = $node_primary_tli2->safe_psql('postgres', + "SELECT pg_walfile_name(pg_current_wal_lsn());"); + +# Make the WAL segment eligible for archival +$node_primary_tli2->safe_psql('postgres', 'SELECT pg_switch_wal();'); + +# Wait until the WAL segment has been archived +my $archive_wait_query = + "SELECT '$walfile_to_be_archived' <= last_archived_wal FROM pg_stat_archiver;"; +$node_primary_tli2->poll_query_until('postgres', $archive_wait_query) + or die "Timed out while waiting for WAL segment to be archived"; +$node_primary_tli2->teardown_node; + +# Initialize a new standby node from the backup. This node will start +# off on timeline 2 according to the control file and will finish +# recovery onto the same timeline by explicitly setting +# recovery_target_timeline to '2'. We explicitly set the target +# timeline to show that it doesn't require the timeline history file +# and works the same as if we used 'current' or 'latest'. 
+my $node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->init_from_backup($node_primary_tli2, $backup_name, + has_restoring => 1, standby => 0); +$node_standby->append_conf('postgresql.conf', qq{ +recovery_target_timeline = '2' +recovery_target_action = 'pause' +recovery_target_name = '$recovery_name' +archive_mode = 'off' +primary_conninfo = '' +}); +$node_standby->start; + +# Check that the timeline history file was not retrieved +ok(! -f $node_standby->data_dir . "/pg_wal/00000002.history", + "00000002.history does not exist in the standby's pg_wal directory"); + +# Sanity check that the node is queryable +my $result_standby = + $node_standby->safe_psql('postgres', "SELECT timeline_id FROM pg_control_checkpoint();"); +is($result_standby, qq(2), 'check that the node is on timeline 2'); +$result_standby = + $node_standby->safe_psql('postgres', "SELECT * FROM tab_int;"); +is($result_standby, qq(8), 'check that the node did archive recovery'); + +# Set up a cascade standby node to validate that there are no issues +# since the WAL receiver will request all necessary timeline history +# files from the standby node's WAL sender. +my $node_cascade = PostgreSQL::Test::Cluster->new('cascade'); +$node_cascade->init_from_backup($node_primary_tli2, $backup_name, + standby => 1); +$node_cascade->enable_streaming($node_standby); +$node_cascade->start; + +# Wait for the replication to catch up +$node_standby->wait_for_catchup($node_cascade); + +# Sanity check that the cascade standby node came up and is queryable +my $result_cascade = + $node_cascade->safe_psql('postgres', "SELECT * FROM tab_int;"); +is($result_cascade, qq(8), 'check that the node received the streamed WAL data'); + +$node_standby->teardown_node; +$node_cascade->teardown_node; + +done_testing(); -- 2.24.3 (Apple Git-128)