From 80c9db24c4937e1782703c123ab67d74d8b20690 Mon Sep 17 00:00:00 2001 From: Nitin Jadhav Date: Fri, 6 Mar 2026 04:52:17 +0000 Subject: [PATCH 2/2] Add TAP tests for missing redo/checkpoint during backup recovery Add two recovery TAP tests to validate PostgreSQL behavior when WAL records required for startup are missing in the presence of a backup_label file. The first test covers the case where the checkpoint record referenced by backup_label is missing, and verifies that recovery fails with a clear FATAL error. The second test covers the case where the redo record referenced by the checkpoint is missing while a backup_label file is present, with redo and checkpoint records forced into different WAL segments using injection points. --- ...53_missing_checkpoint_with_backup_label.pl | 86 ++++++++++ .../t/054_missing_redo_with_backup_label.pl | 152 ++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 src/test/recovery/t/053_missing_checkpoint_with_backup_label.pl create mode 100644 src/test/recovery/t/054_missing_redo_with_backup_label.pl diff --git a/src/test/recovery/t/053_missing_checkpoint_with_backup_label.pl b/src/test/recovery/t/053_missing_checkpoint_with_backup_label.pl new file mode 100644 index 00000000000..7be070abfea --- /dev/null +++ b/src/test/recovery/t/053_missing_checkpoint_with_backup_label.pl @@ -0,0 +1,86 @@ +# Copyright (c) 2025-2026, PostgreSQL Global Development Group +# +# Verify crash recovery behavior when the WAL segment containing the +# checkpoint record referenced by backup_label is missing. +# +# Expected behavior: startup fails with FATAL and logs a message about +# not being able to locate a valid checkpoint record. 
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $node = PostgreSQL::Test::Cluster->new('testnode');
+$node->init(allows_streaming => 1);
+$node->append_conf('postgresql.conf', 'wal_level = replica');
+$node->start;
+
+# Generate WAL and force a checkpoint
+$node->safe_psql('postgres',
+	q{CREATE TABLE t(a int); INSERT INTO t VALUES (1),(2),(3);});
+$node->safe_psql('postgres', 'CHECKPOINT');
+
+# Take a physical base backup (creates backup_label)
+my $backupname = 'fs_bkp';
+$node->backup($backupname);
+
+# Restore the backup with no recovery configuration, so that startup runs
+# crash recovery driven only by backup_label.
+my $reco = PostgreSQL::Test::Cluster->new('recovery_from_backup_ckpt');
+$reco->init_from_backup($node, $backupname);
+
+# Assert backup_label exists
+my $backup_label_path = $reco->data_dir . '/backup_label';
+ok(-e $backup_label_path, 'backup_label exists');
+
+# Determine the WAL file containing the checkpoint record.  The "(file ...)"
+# annotation in backup_label belongs to the START WAL LOCATION line, i.e.
+# the redo segment, and the CHECKPOINT LOCATION line only carries an LSN.
+# Map that LSN to a segment name on the primary, which is still running.
+my $backup_label_contents = slurp_file($backup_label_path);
+
+my ($checkpoint_lsn) =
+  $backup_label_contents =~
+  /^CHECKPOINT LOCATION:\s+([0-9A-F]+\/[0-9A-F]+)/m;
+ok(defined $checkpoint_lsn, 'extracted checkpoint LSN from backup_label')
+  or die "no CHECKPOINT LOCATION line in $backup_label_path";
+
+my $checkpoint_walfile =
+  $node->safe_psql('postgres', "SELECT pg_walfile_name('$checkpoint_lsn')");
+like($checkpoint_walfile, qr/^[0-9A-F]{24}$/,
+	"checkpoint record is in WAL file $checkpoint_walfile");
+
+# Remove the WAL segment containing the checkpoint record
+my $pgwal = $reco->data_dir . '/pg_wal';
+ok(-d $pgwal, 'pg_wal directory exists');
+
+my $target = "$pgwal/$checkpoint_walfile";
+ok(
+	-e $target,
+	"checkpoint WAL segment exists before removal: $target"
+) or die "Expected WAL segment $target not found";
+
+unlink($target)
+  or die "unlink $target failed: $!";
+
+# Start the server and confirm that recovery has failed, as expected.
+command_fails(
+	[
+		'pg_ctl',
+		'--pgdata' => $reco->data_dir,
+		'--log' => $reco->logfile,
+		'start',
+	],
+	'startup fails when checkpoint WAL is missing with backup_label present'
+);
+
+my $log = slurp_file($reco->logfile);
+like(
+	$log,
+	qr/(?:FATAL|PANIC): .*could not locate required checkpoint record/i,
+	'server log reports missing checkpoint record'
+);
+
+done_testing();
\ No newline at end of file
diff --git a/src/test/recovery/t/054_missing_redo_with_backup_label.pl b/src/test/recovery/t/054_missing_redo_with_backup_label.pl
new file mode 100644
index 00000000000..f2d3352c46f
--- /dev/null
+++ b/src/test/recovery/t/054_missing_redo_with_backup_label.pl
@@ -0,0 +1,152 @@
+# Copyright (c) 2025-2026, PostgreSQL Global Development Group
+#
+# Verify recovery behavior when a WAL segment containing the redo record is
+# missing, with a checkpoint record located in a different segment, in the
+# presence of a backup_label file.
+#
+# Expected behavior: startup fails with FATAL and logs a message about not
+# being able to find the redo location referenced by the checkpoint record.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+if ($ENV{enable_injection_points} ne 'yes')
+{
+	plan skip_all => 'Injection points not supported by this build';
+}
+
+my $node = PostgreSQL::Test::Cluster->new('testnode');
+$node->init;
+$node->append_conf('postgresql.conf', 'log_checkpoints = on');
+$node->start;
+
+# Check if the extension injection_points is available.
+if (!$node->check_extension('injection_points'))
+{
+	plan skip_all => 'Extension injection_points not installed';
+}
+$node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
+
+# Note that this uses two injection points based on waits, not one. This
+# may look strange, but this works as a workaround to enforce all memory
+# allocations to happen outside the critical section of the checkpoint
+# required for this test.
+# First, "create-checkpoint-initial" is run outside the critical
+# section, and is used as a way to initialize the shared memory required
+# for the wait machinery with its DSM registry.
+# Then, "create-checkpoint-run" is loaded outside the critical section of
+# a checkpoint to allocate any memory required by the library load, and
+# its callback is run inside the critical section.
+$node->safe_psql('postgres',
+	q{SELECT injection_points_attach('create-checkpoint-initial', 'wait')});
+$node->safe_psql('postgres',
+	q{SELECT injection_points_attach('create-checkpoint-run', 'wait')});
+
+# Start a psql session to run the checkpoint in the background and make
+# the test wait on the injection point so the checkpoint stops just after
+# it starts.
+my $checkpoint = $node->background_psql('postgres');
+$checkpoint->query_until(
+	qr/starting_checkpoint/,
+	q(\echo starting_checkpoint
+checkpoint;
+));
+
+# Wait for the initial point to finish, the checkpointer is still
+# outside its critical section. Then release to reach the second
+# point.
+$node->wait_for_event('checkpointer', 'create-checkpoint-initial');
+$node->safe_psql('postgres',
+	q{SELECT injection_points_wakeup('create-checkpoint-initial')});
+
+# Wait until the checkpoint has reached the second injection point.
+# We are now in the middle of a checkpoint running, after the redo
+# record has been logged.
+$node->wait_for_event('checkpointer', 'create-checkpoint-run');
+
+# Switch WAL segment to ensure redo and checkpoint records are in different
+# segments.
+$node->safe_psql('postgres', 'SELECT pg_switch_wal()');
+
+# Continue checkpoint and wait for completion.
+my $log_offset = -s $node->logfile;
+$node->safe_psql('postgres',
+	q{SELECT injection_points_wakeup('create-checkpoint-run')});
+$node->wait_for_log(qr/checkpoint complete/, $log_offset);
+
+$checkpoint->quit;
+
+# Retrieve the WAL file names for the redo record and checkpoint record.
+my $redo_lsn = $node->safe_psql('postgres',
+	q{SELECT redo_lsn FROM pg_control_checkpoint()});
+my $checkpoint_lsn = $node->safe_psql('postgres',
+	q{SELECT checkpoint_lsn FROM pg_control_checkpoint()});
+my $redo_walfile_name =
+  $node->safe_psql('postgres', "SELECT pg_walfile_name('$redo_lsn')");
+my $checkpoint_walfile_name =
+  $node->safe_psql('postgres', "SELECT pg_walfile_name('$checkpoint_lsn')");
+
+# Redo record and checkpoint record should be on different segments.
+isnt($redo_walfile_name, $checkpoint_walfile_name,
+	'redo and checkpoint records on different segments');
+
+# Stop and take a cold filesystem backup of the stopped server.
+$node->stop('immediate');
+my $backupname = 'cold_bkp';
+$node->backup_fs_cold($backupname);
+
+# Restore cold backup into a new node, with no recovery configuration.
+my $reco = PostgreSQL::Test::Cluster->new('reco_with_backup_label');
+$reco->init_from_backup($node, $backupname);
+
+# Manually create backup_label in restored cluster to force backup recovery.
+my $backup_label_path = $reco->data_dir . '/backup_label';
+
+# Extract timeline from WAL filename (first 8 hex digits).
+my $tli = hex(substr($checkpoint_walfile_name, 0, 8));
+
+open(my $bl, '>', $backup_label_path)
+  or die "could not create backup_label: $!";
+print $bl "START WAL LOCATION: $redo_lsn (file $redo_walfile_name)\n";
+print $bl "CHECKPOINT LOCATION: $checkpoint_lsn\n";
+print $bl "BACKUP METHOD: test\n";
+print $bl "BACKUP FROM: primary\n";
+print $bl "START TIMELINE: $tli\n";
+print $bl "CHECKPOINT TIMELINE: $tli\n";
+print $bl "LABEL: redo missing with backup_label\n";
+close($bl)
+  or die "could not write backup_label: $!";
+
+ok(-e $backup_label_path, 'backup_label exists before startup');
+
+# Remove the WAL segment containing the redo record.
+my $redo_path = $reco->data_dir . "/pg_wal/$redo_walfile_name";
+my $ckpt_path = $reco->data_dir . "/pg_wal/$checkpoint_walfile_name";
+
+ok(-e $ckpt_path, "checkpoint WAL segment exists: $ckpt_path");
+ok(-e $redo_path, "redo WAL segment exists before removal: $redo_path")
+  or die "Expected WAL segment $redo_path not found";
+
+unlink($redo_path)
+  or die "could not remove redo WAL file: $!";
+
+# Use run_log instead of node->start because this test expects that
+# the server ends with an error during recovery.
+run_log(
+	[
+		'pg_ctl',
+		'--pgdata' => $reco->data_dir,
+		'--log' => $reco->logfile,
+		'start',
+	]);
+
+# Confirm that recovery has failed, as expected.
+my $logfile = slurp_file($reco->logfile());
+like($logfile,
+	qr/FATAL: .* could not find redo location .* referenced by checkpoint record at .*/,
+	"ends with FATAL because it could not find redo location");
+
+done_testing();
\ No newline at end of file
-- 
2.43.0