From 9930c1856d1e8e2f027276d6599c1eff7866b9db Mon Sep 17 00:00:00 2001 From: Andrey Borodin Date: Fri, 20 Feb 2026 00:05:03 +0500 Subject: [PATCH] Fix archive recovery falling back to wrong-timeline WAL segment XLogFileReadAnyTLI iterates expectedTLEs newest-first and, when the correct timeline's segment is absent from the archive, falls back to an older timeline's segment for the same position. Past a switch point that older segment carries divergent WAL, so recovery silently applies wrong data. The correct invariant: for any segment, the owner is the newest timeline in expectedTLEs whose begin_seg <= segno. If that segment is absent, recovery must stop rather than fall back. Add a found_eligible flag: once the first eligible timeline is identified and its segment is not found, break out of the loop instead of continuing to older timelines. A targetBeginSeg pre-check is not used because it would only guard the final switch point. Add two TAP tests: 052 covers the basic two-timeline case; 053 covers a three-timeline chain where the intermediate switch-point segment is absent, which a targetBeginSeg-only check would not catch. 
--- src/backend/access/transam/xlogrecovery.c | 14 ++ .../052_timeline_switch_archive_divergence.pl | 92 ++++++++++++ .../t/053_timeline_switch_intermediate_tl.pl | 142 ++++++++++++++++++ 3 files changed, 248 insertions(+) create mode 100644 src/test/recovery/t/052_timeline_switch_archive_divergence.pl create mode 100644 src/test/recovery/t/053_timeline_switch_intermediate_tl.pl diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index c0c2744d45b..b4c4e87e0fb 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -4345,6 +4345,7 @@ XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source) ListCell *cell; int fd; List *tles; + bool found_eligible; /* * Loop looking for a suitable timeline ID: we might need to read any of @@ -4369,6 +4370,7 @@ XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source) else tles = readTimeLineHistory(recoveryTargetTLI); + found_eligible = false; foreach(cell, tles) { TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell); @@ -4401,6 +4403,18 @@ XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source) continue; } + /* + * This is the first (newest) timeline eligible for this segment. + * Older timelines that also pass the beginseg check have divergent + * WAL starting at their own switch point: once a child timeline + * branches off, the parent's WAL is no longer valid for the child's + * recovery path. If the correct timeline's segment isn't available, + * we must not silently fall back to a parent with wrong data. 
+ */ + if (found_eligible) + break; + found_eligible = true; + if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE) { fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true); diff --git a/src/test/recovery/t/052_timeline_switch_archive_divergence.pl b/src/test/recovery/t/052_timeline_switch_archive_divergence.pl new file mode 100644 index 00000000000..aef9569a046 --- /dev/null +++ b/src/test/recovery/t/052_timeline_switch_archive_divergence.pl @@ -0,0 +1,92 @@ + +# Copyright (c) 2021-2026, PostgreSQL Global Development Group + +# Test that archive recovery targeting a child timeline does not +# use a parent timeline's WAL segment for the segment containing the switch +# point when the child timeline's segment is absent from the archive. +# +# Setup: TL1 archives segments 1..3 (segment 3 has data past the switch point). +# Only the TL2 timeline history file is added to archive; TL2 segment 3 is not. +# Recovery is performed with recovery_target_timeline = '2'. Without the fix, +# recovery uses TL1 segment 3 even though the switch point is in TL2 segment 3. +# With the fix, recovery skips TL1 for that segment and correctly waits for TL2. + +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use File::Copy; + +# Initialize primary with WAL archiving. +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1, has_archiving => 1); +$node_primary->start; + +$node_primary->safe_psql('postgres', 'CREATE TABLE t (i int)'); +my $backup_name = 'my_backup'; +$node_primary->backup($backup_name); + +# Create a streaming standby so we can promote it. Disable archiving so +# it does not inherit the primary's archive_command and archive TL2 segments +# into the primary's archive (which would mask the bug by making TL2 seg 3 +# available when it should not be). 
+my $node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->init_from_backup($node_primary, $backup_name, has_streaming => 1); +$node_standby->append_conf('postgresql.conf', "archive_mode = off"); +$node_standby->start; +$node_primary->wait_for_catchup($node_standby); + +# Force a segment boundary: switch to segment 3, so the switch point will +# land inside segment 3 (both TL1 and TL2 will have a segment 3). +$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal()'); +$node_primary->safe_psql('postgres', 'INSERT INTO t VALUES (1)'); +$node_primary->wait_for_catchup($node_standby); + +# Promote standby to TL2. The timeline history file (00000002.history) is +# written to pg_wal immediately upon promotion. +$node_standby->promote; +$node_standby->poll_query_until('postgres', 'SELECT NOT pg_is_in_recovery()') + or die "Timed out waiting for promotion"; + +# Old primary writes to segment 3 and archives it. This segment overlaps +# the switch point but is on TL1 -- recovery must NOT use it for TL2. +$node_primary->safe_psql('postgres', 'INSERT INTO t VALUES (-1)'); +my $old_walfile = $node_primary->safe_psql('postgres', + 'SELECT pg_walfile_name(pg_current_wal_lsn())'); +$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal()'); +$node_primary->poll_query_until('postgres', + "SELECT last_archived_wal >= '$old_walfile' FROM pg_stat_archiver") + or die "Timed out waiting for old primary to archive"; + +# Add TL2 timeline history to the archive by copying it directly from the +# standby's pg_wal. This tells recovery that TL2 exists, but TL2 segment 3 +# is intentionally absent so recovery must not fall back to TL1 segment 3. +my $archive = $node_primary->archive_dir; +copy($node_standby->data_dir . '/pg_wal/00000002.history', + "$archive/00000002.history") + or die "Could not copy 00000002.history: $!"; + +# Stop both nodes. Recovery will run archive-only, no streaming source. 
+$node_primary->stop; +$node_standby->stop; + +# Create a recovery node using old primary's archive only (no streaming). +my $node_rec = PostgreSQL::Test::Cluster->new('recovering'); +$node_rec->init_from_backup($node_primary, $backup_name, has_restoring => 1); +$node_rec->enable_restoring($node_primary, 1); +$node_rec->append_conf('postgresql.conf', "recovery_target_timeline = '2'"); + +$node_rec->start; + +# Confirm the node is in recovery (this alone does not wait for any restore). +$node_rec->poll_query_until('postgres', 'SELECT pg_is_in_recovery()', 't') + or die "Node is not in recovery"; + +# With the fix: recovery skips TL1 for the switch-point segment and waits +# for TL2 (which is absent). Without the fix: it restores the TL1 copy. +my $log_content = slurp_file($node_rec->logfile); +unlike($log_content, qr/restored log file "\Q$old_walfile\E"/, + "archive recovery did not use TL1 segment at switch point ($old_walfile)"); + +done_testing(); diff --git a/src/test/recovery/t/053_timeline_switch_intermediate_tl.pl b/src/test/recovery/t/053_timeline_switch_intermediate_tl.pl new file mode 100644 index 00000000000..9801ebda2c4 --- /dev/null +++ b/src/test/recovery/t/053_timeline_switch_intermediate_tl.pl @@ -0,0 +1,142 @@ + +# Copyright (c) 2021-2026, PostgreSQL Global Development Group + +# Test that archive recovery with three timelines does not fall back to a +# grandparent TL segment when the intermediate TL segment at its own switch +# point is absent from the archive. +# +# Topology: TL1 -> TL2 (switch in seg N+1) -> TL3 (switch in seg N+2). +# The base backup is taken while the primary is still in segment N. +# +# Archive contains TL1 segments N, N+1, N+2 and the TL3 history file, but +# NOT TL2 segment N+1 (the segment containing the TL1->TL2 switch point). +# +# Recovery target is TL3. For segment N+1, TL2 is the first eligible timeline +# (its begin_seg == N+1). 
TL2 segment N+1 is absent, so a correct +# implementation must stop there and not fall back to TL1 segment N+1, which +# carries divergent WAL from the old primary after the switch. +# +# A fix that only guards the final timeline's switch point (checking +# targetBeginSeg = TL3.begin_seg = N+2) would still allow TL1 segment N+1 +# to be used because segno N+1 < targetBeginSeg N+2. The correct fix must +# stop at the first eligible timeline for each segment: once TL2 is identified +# as eligible for segment N+1 but not found, recovery must not try TL1. + +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use File::Copy; + +# Primary (TL1) with WAL archiving. +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1, has_archiving => 1); +$node_primary->start; +$node_primary->safe_psql('postgres', 'CREATE TABLE t (i int)'); + +# Take the base backup in segment N. Recovery will start from this backup +# and must replay segment N+1 where the TL2 switch point lands. +$node_primary->backup('primary_backup'); + +# Switch to segment N+1. This is where the TL1->TL2 switch point will land. +$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal()'); +$node_primary->safe_psql('postgres', 'INSERT INTO t VALUES (1)'); + +# Capture the TL1 file name for segment N+1. We will assert later that +# archive recovery did NOT restore this file (the TL1 version carries +# divergent WAL past the switch point). +my $tl2_switch_seg = $node_primary->safe_psql('postgres', + 'SELECT pg_walfile_name(pg_current_wal_lsn())'); + +# standby1 streams from primary. Disable archiving so it does not publish +# TL2 segments into the primary's archive, which would mask the bug. 
+my $node_standby1 = PostgreSQL::Test::Cluster->new('standby1'); +$node_standby1->init_from_backup($node_primary, 'primary_backup', + has_streaming => 1); +$node_standby1->append_conf('postgresql.conf', "archive_mode = off"); +$node_standby1->start; +$node_primary->wait_for_catchup($node_standby1); + +# Promote standby1 while the primary is writing in segment N+1. The TL1->TL2 +# switch point therefore falls inside segment N+1. +$node_standby1->promote; +$node_standby1->poll_query_until('postgres', 'SELECT NOT pg_is_in_recovery()') + or die "Timed out waiting for standby1 promotion to TL2"; + +# Old primary (TL1) continues writing divergent WAL in segment N+1 and then +# N+2 and archives them. These are the segments that recovery must not use. +$node_primary->safe_psql('postgres', 'INSERT INTO t VALUES (-1)'); +$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal()'); +$node_primary->safe_psql('postgres', 'INSERT INTO t VALUES (-2)'); +my $tl1_last_seg = $node_primary->safe_psql('postgres', + 'SELECT pg_walfile_name(pg_current_wal_lsn())'); +$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal()'); +$node_primary->poll_query_until('postgres', + "SELECT last_archived_wal >= '$tl1_last_seg' FROM pg_stat_archiver") + or die "Timed out waiting for primary to archive divergent TL1 segments"; + +# On TL2 (standby1 is now the primary), switch to segment N+2 so the TL2->TL3 +# switch point lands one segment further than the TL1->TL2 switch. +$node_standby1->safe_psql('postgres', 'SELECT pg_switch_wal()'); +$node_standby1->safe_psql('postgres', 'INSERT INTO t VALUES (2)'); + +# Take a backup from the TL2 primary for standby2. +$node_standby1->backup('standby1_backup'); + +# standby2 streams from standby1 (TL2). Same archiving restriction. 
+my $node_standby2 = PostgreSQL::Test::Cluster->new('standby2'); +$node_standby2->init_from_backup($node_standby1, 'standby1_backup', + has_streaming => 1); +$node_standby2->append_conf('postgresql.conf', "archive_mode = off"); +$node_standby2->start; +$node_standby1->wait_for_catchup($node_standby2); + +# Promote standby2 while TL2 is writing in segment N+2. The TL2->TL3 switch +# point therefore falls inside segment N+2. +$node_standby2->promote; +$node_standby2->poll_query_until('postgres', 'SELECT NOT pg_is_in_recovery()') + or die "Timed out waiting for standby2 promotion to TL3"; + +$node_primary->stop; +$node_standby1->stop; +$node_standby2->stop; + +# Copy the TL3 history file to the TL1 archive. PostgreSQL builds it with +# the full ancestry chain upon promotion: +# 1 (switch from TL1 to TL2 happened in segment N+1) +# 2 (switch from TL2 to TL3 happened in segment N+2) +# TL2 segment N+1 is intentionally absent from the archive: standby1 had +# archive_mode=off so it never wrote TL2 segments there. Only the TL1 +# version of segment N+1 is present. +my $archive = $node_primary->archive_dir; +copy($node_standby2->data_dir . '/pg_wal/00000003.history', + "$archive/00000003.history") + or die "Could not copy 00000003.history: $!"; + +# Build a recovery node from the TL1 base backup, replaying from the TL1 +# archive only (no streaming). +my $node_rec = PostgreSQL::Test::Cluster->new('recovering'); +$node_rec->init_from_backup($node_primary, 'primary_backup', has_restoring => 1); +$node_rec->enable_restoring($node_primary, 1); +$node_rec->append_conf('postgresql.conf', "recovery_target_timeline = '3'"); +$node_rec->start; + +$node_rec->poll_query_until('postgres', 'SELECT pg_is_in_recovery()', 't') + or die "Node is not in recovery"; + +# With the fix: for segment N+1, TL2 is the first eligible timeline (its +# begin_seg == N+1). TL2 segment N+1 is absent, so recovery stops and does +# not fall back to the TL1 version. 
+# +# With only a targetBeginSeg-style guard: targetBeginSeg == N+2 (TL3's +# begin), and segno N+1 < N+2, so the guard does not fire. Recovery falls +# through and silently applies TL1 segment N+1, which carries divergent data. +my $log_content = slurp_file($node_rec->logfile); +unlike( + $log_content, + qr/restored log file "\Q$tl2_switch_seg\E"/, + "archive recovery did not use TL1 segment at TL2 switch point ($tl2_switch_seg)" +); + +done_testing(); -- 2.51.2