Re: standby recovery fails (tablespace related) (tentative patch and discussion) - Mailing list pgsql-hackers

From Kyotaro Horiguchi
Subject Re: standby recovery fails (tablespace related) (tentative patch and discussion)
Date
Msg-id 20220120.171904.1102923296456789299.horikyota.ntt@gmail.com
Whole thread Raw
In response to Re: standby recovery fails (tablespace related) (tentative patch and discussion)  (Kyotaro Horiguchi <horikyota.ntt@gmail.com>)
Responses Re: standby recovery fails (tablespace related) (tentative patch and discussion)
List pgsql-hackers
At Thu, 20 Jan 2022 15:07:22 +0900 (JST), Kyotaro Horiguchi <horikyota.ntt@gmail.com> wrote in 
> This version works for Unixen but still doesn't for Windows. I'm
> searching for a fix for Windows.

And this version works for Windows.  Maybe I've took a wrong version
to post. dir_readlink manipulated target file (junction) name in the
wrong way.

CI now likes this version for all platforms.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
From 0423d2b9aae0620c07b522632a8074ecd8ffef64 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Date: Thu, 11 Nov 2021 20:42:00 +0900
Subject: [PATCH v15 1/3] Add tablespace support to TAP framework

TAP framework doesn't support nodes that have tablespaces.  Especially
backup and initialization from backups failed if the source node has
tablespaces.  This commit provides simple way to create tablespace
directories and allows backup routines to handle tablespaces.
---
 src/bin/pg_basebackup/t/010_pg_basebackup.pl |   2 +-
 src/test/perl/PostgreSQL/Test/Cluster.pm     | 264 ++++++++++++++++++-
 src/test/perl/PostgreSQL/Test/Utils.pm       |  42 +++
 3 files changed, 305 insertions(+), 3 deletions(-)

diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index f0243f28d4..c139b5e000 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -257,7 +257,7 @@ $node->safe_psql('postgres',
     "CREATE TABLESPACE tblspc1 LOCATION '$realTsDir';");
 $node->safe_psql('postgres',
         "CREATE TABLE test1 (a int) TABLESPACE tblspc1;"
-      . "INSERT INTO test1 VALUES (1234);");
+                 . "INSERT INTO test1 VALUES (1234);");
 $node->backup('tarbackup2', backup_options => ['-Ft']);
 # empty test1, just so that it's different from the to-be-restored data
 $node->safe_psql('postgres', "TRUNCATE TABLE test1;");
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index b7d4c24553..d433ccf610 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -298,6 +298,64 @@ sub archive_dir
 
 =pod
 
+=item $node->tablespace_storage([, nocreate])
+
+Diretory to store tablespace directories.
+If nocreate is true, returns undef if not yet created.
+
+=cut
+
+sub tablespace_storage
+{
+    my ($self, $nocreate) = @_;
+
+    if (!defined $self->{_tsproot})
+    {
+        # tablespace is not used, return undef if nocreate is specified.
+        return undef if ($nocreate);
+
+        # create and remember the tablespae root directotry.
+        $self->{_tsproot} = PostgreSQL::Test::Utils::tempdir_short();
+    }
+
+    return $self->{_tsproot};
+}
+
+=pod
+
+=item $node->tablespaces()
+
+Returns a hash from tablespace OID to tablespace directory name.  For
+example, an oid 16384 pointing to /tmp/jWAhkT_fs0/ts1 is stored as
+$hash{16384} = "ts1".
+
+=cut
+
+sub tablespaces
+{
+    my ($self) = @_;
+    my $pg_tblspc = $self->data_dir . '/pg_tblspc';
+    my %ret;
+
+    # return undef if no tablespace is used
+    return undef if (!defined $self->tablespace_storage(1));
+
+    # collect tablespace entries in pg_tblspc directory
+    opendir(my $dir, $pg_tblspc);
+    while (my $oid = readdir($dir))
+    {
+        next if ($oid !~ /^([0-9]+)$/);
+        my $linkpath = "$pg_tblspc/$oid";
+        my $tsppath = PostgreSQL::Test::Utils::dir_readlink($linkpath);
+        $ret{$oid} = File::Basename::basename($tsppath);
+    }
+    closedir($dir);
+
+    return %ret;
+}
+
+=pod
+
 =item $node->backup_dir()
 
 The output path for backups taken with $node->backup()
@@ -313,6 +371,77 @@ sub backup_dir
 
 =pod
 
+=item $node->backup_tablespace_storage_path(backup_name)
+
+Returns tablespace location path for backup_name.
+Retuns the parent directory if backup_name is not given.
+
+=cut
+
+sub backup_tablespace_storage_path
+{
+    my ($self, $backup_name) = @_;
+    my $dir = $self->backup_dir . '/__tsps';
+
+    $dir .= "/$backup_name" if (defined $backup_name);
+
+    return $dir;
+}
+
+=pod
+
+=item $node->backup_create_tablespace_storage(backup_name)
+
+Create tablespace location directory for backup_name if not yet.
+Create the parent tablespace storage that holds all location
+directories if backup_name is not supplied.
+
+=cut
+
+sub backup_create_tablespace_storage
+{
+    my ($self, $backup_name) = @_;
+    my $dir = $self->backup_tablespace_storage_path($backup_name);
+
+    File::Path::make_path $dir if (! -d $dir);
+}
+
+=pod
+
+=item $node->backup_tablespaces(backup_name)
+
+Returns a reference to hash from tablespace OID to tablespace
+directory name of tablespace directory that the specified backup has.
+For example, an oid 16384 pointing to ../tsps/backup1/ts1 is stored as
+$hash{16384} = "ts1".
+
+=cut
+
+sub backup_tablespaces
+{
+    my ($self, $backup_name) = @_;
+    my $pg_tblspc = $self->backup_dir . '/' . $backup_name . '/pg_tblspc';
+    my %ret;
+
+    #return undef if this backup holds no tablespaces
+    return undef if (! -d $self->backup_tablespace_storage_path($backup_name));
+
+    # scan pg_tblspc directory of the backup
+    opendir(my $dir, $pg_tblspc);
+    while (my $oid = readdir($dir))
+    {
+        next if ($oid !~ /^([0-9]+)$/);
+        my $linkpath = "$pg_tblspc/$oid";
+        my $tsppath = PostgreSQL::Test::Utils::dir_readlink($linkpath);
+        $ret{$oid} = File::Basename::basename($tsppath);
+    }
+    closedir($dir);
+
+    return \%ret;
+}
+
+=pod
+
 =item $node->install_path()
 
 The configured install path (if any) for the node.
@@ -345,6 +474,7 @@ sub info
     print $fh "Data directory: " . $self->data_dir . "\n";
     print $fh "Backup directory: " . $self->backup_dir . "\n";
     print $fh "Archive directory: " . $self->archive_dir . "\n";
+    print $fh "Tablespace directory: " . $self->tablespace_storage . "\n";
     print $fh "Connection string: " . $self->connstr . "\n";
     print $fh "Log file: " . $self->logfile . "\n";
     print $fh "Install Path: ", $self->{_install_path} . "\n"
@@ -575,6 +705,43 @@ sub adjust_conf
 
 =pod
 
+=item $node->new_tablespace(name)
+
+Create a tablespace directory with the name then returns the path.
+
+=cut
+
+sub new_tablespace
+{
+    my ($self, $name) = @_;
+
+    my $path = $self->tablespace_storage . '/' . $name;
+
+    die "tablespace \"$name\" already exists" if (!mkdir($path));
+
+    return $path;
+}
+
+=pod
+
+=item $node->tablespace_dir(name)
+
+Return the path of the existing tablespace with the name.
+
+=cut
+
+sub tablespace_dir
+{
+    my ($self, $name) = @_;
+
+    my $path = $self->tablespace_storage . '/' . $name;
+    return undef if (!-d $path);
+
+    return $path;
+}
+
+=pod
+
 =item $node->backup(backup_name)
 
 Create a hot backup with B<pg_basebackup> in subdirectory B<backup_name> of
@@ -594,9 +761,24 @@ sub backup
     my ($self, $backup_name, %params) = @_;
     my $backup_path = $self->backup_dir . '/' . $backup_name;
     my $name        = $self->name;
+    my @tsp_maps;
 
     local %ENV = $self->_get_env();
 
+    # Build tablespace mappings.  We once let pg_basebackup copy
+    # tablespaces into temporary tablespace storage with a short name
+    # so that we can work on pathnames that fit our tar format which
+    # pg_basebackup depends on.
+    my $map_src_root = $self->tablespace_storage(1);
+    my $backup_tmptsp_root = PostgreSQL::Test::Utils::tempdir_short();
+    my %tsps = $self->tablespaces();
+    foreach my $tspname (values %tsps)
+    {
+        my $src = "$map_src_root/$tspname";
+        my $dst = "$backup_tmptsp_root/$tspname";
+        push(@tsp_maps, "--tablespace-mapping=$src=$dst");
+    }
+
     print "# Taking pg_basebackup $backup_name from node \"$name\"\n";
     PostgreSQL::Test::Utils::system_or_bail(
         'pg_basebackup', '-D',
@@ -604,7 +786,33 @@ sub backup
         $self->host,     '-p',
         $self->port,     '--checkpoint',
         'fast',          '--no-sync',
+        @tsp_maps,
         @{ $params{backup_options} });
+
+    # Move the tablespaces from temporary storage into backup
+    # directory, unless the backup is in tar mode.
+    if (%tsps && ! -f "$backup_path/base.tar")
+    {
+        $self->backup_create_tablespace_storage();
+        PostgreSQL::Test::RecursiveCopy::copypath(
+            $backup_tmptsp_root,
+            $self->backup_tablespace_storage_path($backup_name));
+        # delete the temporary directory right away
+        rmtree $backup_tmptsp_root;
+
+        # Fix tablespace symlinks.  This is not necessarily required
+        # in backups but keep them consistent.
+        my $linkdst_root = "$backup_path/pg_tblspc";
+        my $linksrc_root = $self->backup_tablespace_storage_path($backup_name);
+        foreach my $oid (keys %tsps)
+        {
+            my $tspdst = "$linkdst_root/$oid";
+            my $tspsrc = "$linksrc_root/" . $tsps{$oid};
+            unlink $tspdst;
+            PostgreSQL::Test::Utils::dir_symlink($tspsrc, $tspdst);
+        }
+    }
+
     print "# Backup finished\n";
     return;
 }
@@ -666,11 +874,32 @@ sub _backup_fs
     PostgreSQL::Test::RecursiveCopy::copypath(
         $self->data_dir,
         $backup_path,
+        # Skipping some files and tablespace symlinks
         filterfn => sub {
             my $src = shift;
-            return ($src ne 'log' and $src ne 'postmaster.pid');
+            return ($src ne 'log' and $src ne 'postmaster.pid' and
+                    $src !~ m!^pg_tblspc/[0-9]+$!);
         });
 
+    # Copy tablespaces if any
+    my %tsps = $self->tablespaces();
+    if (%tsps)
+    {
+        $self->backup_create_tablespace_storage();
+        PostgreSQL::Test::RecursiveCopy::copypath(
+            $self->tablespace_storage,
+            $self->backup_tablespace_storage_path($backup_name));
+
+        my $linkdst_root = $backup_path . '/pg_tblspc';
+        my $linksrc_root = $self->backup_tablespace_storage_path($backup_name);
+        foreach my $oid (keys %tsps)
+        {
+            my $tspdst = "$linkdst_root/$oid";
+            my $tspsrc = "$linksrc_root/" . $tsps{$oid};
+            PostgreSQL::Test::Utils::dir_symlink($tspsrc, $tspdst);
+        }
+    }
+
     if ($hot)
     {
 
@@ -754,7 +983,38 @@ sub init_from_backup
     else
     {
         rmdir($data_path);
-        PostgreSQL::Test::RecursiveCopy::copypath($backup_path, $data_path);
+        PostgreSQL::Test::RecursiveCopy::copypath(
+            $backup_path,
+            $data_path,
+            # Skipping tablespace symlinks
+            filterfn => sub {
+                my $src = shift;
+                return ($src !~ m!^pg_tblspc/[0-9]+$!);
+            });
+    }
+
+    # Copy tablespaces if any
+    my $tsps = $root_node->backup_tablespaces($backup_name);
+
+    if ($tsps)
+    {
+        my $tsp_src = $root_node->backup_tablespace_storage_path($backup_name);
+        my $tsp_dst = $self->tablespace_storage();
+        my $linksrc_root = $data_path . '/pg_tblspc';
+
+        # copypath() rejects to copy into existing directory.
+        # Copy individual directories in the storage.
+        foreach my $oid (keys %{$tsps})
+        {
+            my $tsp = ${$tsps}{$oid};
+            my $tspsrc = "$tsp_src/$tsp";
+            my $tspdst = "$tsp_dst/$tsp";
+            PostgreSQL::Test::RecursiveCopy::copypath($tspsrc, $tspdst);
+
+            # Create tablespace symlink for this tablespace
+            my $linkdst = "$linksrc_root/$oid";
+            PostgreSQL::Test::Utils::dir_symlink($tspdst, $linkdst);
+        }
     }
     chmod(0700, $data_path);
 
diff --git a/src/test/perl/PostgreSQL/Test/Utils.pm b/src/test/perl/PostgreSQL/Test/Utils.pm
index 50be10fb5a..e0e5956e9b 100644
--- a/src/test/perl/PostgreSQL/Test/Utils.pm
+++ b/src/test/perl/PostgreSQL/Test/Utils.pm
@@ -724,6 +724,48 @@ sub dir_symlink
 
 =pod
 
+=item dir_readlink(name)
+
+Portably read a symlink for a directory. On Windows this reads a junction
+point. Elsewhere it just calls perl's builtin readlink.
+
+=cut
+
+sub dir_readlink
+{
+    my $name = shift;
+    if ($windows_os)
+    {
+        $name = perl2host($name);
+        $name =~ s,/,\\,g;
+        # Split the path into parent directory and link name
+        die "invalid path spec: $name" if ($name !~ m!^(.*)\\([^\\]+)\\?$!);
+        my ($dir, $fname) = ($1, $2);
+        my $cmd = qq{cmd /c "dir /A:L $dir"};
+        if ($Config{osname} eq 'msys')
+        {
+            # need some indirection on msys
+            $cmd = qq{echo '$cmd' | \$COMSPEC /Q};
+        }
+
+        my $result;
+        foreach my $l (split /[\r\n]+/, `$cmd`)
+        {
+            $result = $1 if ($l =~ m/<JUNCTION>\W+$fname \[(.*)\]/)
+        }
+        die "junction $name not found" if (!defined $result);
+
+        $name =~ s,\\,/,g;
+        return $result;
+    }
+    else
+    {
+        return readlink $name;
+    }
+}
+
+=pod
+
 =back
 
 =head1 Test::More-LIKE METHODS
-- 
2.27.0

From e2772ce12fac4b552f26ad5c1c694766d3429170 Mon Sep 17 00:00:00 2001
From: "apraveen@pivotal.io" <apraveen@pivotal.io>
Date: Thu, 11 Nov 2021 20:46:17 +0900
Subject: [PATCH v15 2/3] Tests to replay create database operation on standby

The tests demonstrate that standby fails to replay a create database
WAL record during crash recovery, if one or more of underlying
directories are missing from the file system.  This can happen if a
drop tablespace or drop database WAL record has been replayed in
archive recovery, before a crash.  And then the create database record
happens to be replayed again during crash recovery.  The failures
indicate bugs that need to be fixed.

The first test, TEST 4, performs several DDL operations resulting in a
database directory being removed, along with a few create database
operations.  It expects crash recovery to succeed because for each
missing directory encountered during create database replay, a matching
drop tablespace or drop database WAL record is found later.

Second test, TEST 5, validates that a standby rightfully aborts replay
during archive recovery, if a missing directory is encountered when
replaying create database WAL record.

These tests have been proposed and implemented in various ways by
Alexandra Wang, Anastasia Lubennikova, Kyotaro Horiguchi, Paul Guo and me.
---
 src/test/recovery/t/011_crash_recovery.pl | 107 +++++++++++++++++++++-
 1 file changed, 106 insertions(+), 1 deletion(-)

diff --git a/src/test/recovery/t/011_crash_recovery.pl b/src/test/recovery/t/011_crash_recovery.pl
index 3892aba3e5..421cf52dfe 100644
--- a/src/test/recovery/t/011_crash_recovery.pl
+++ b/src/test/recovery/t/011_crash_recovery.pl
@@ -11,7 +11,7 @@ use PostgreSQL::Test::Utils;
 use Test::More;
 use Config;
 
-plan tests => 3;
+plan tests => 5;
 
 my $node = PostgreSQL::Test::Cluster->new('primary');
 $node->init(allows_streaming => 1);
@@ -62,3 +62,108 @@ is($node->safe_psql('postgres', qq[SELECT pg_xact_status('$xid');]),
 
 $stdin .= "\\q\n";
 $tx->finish;    # wait for psql to quit gracefully
+
+my $node_primary = PostgreSQL::Test::Cluster->new('primary2');
+$node_primary->init(allows_streaming => 1);
+$node_primary->start;
+my $dropme_ts_primary1 = $node_primary->new_tablespace('dropme_ts1');
+my $dropme_ts_primary2 = $node_primary->new_tablespace('dropme_ts2');
+my $soruce_ts_primary = $node_primary->new_tablespace('source_ts');
+my $target_ts_primary = $node_primary->new_tablespace('target_ts');
+
+$node_primary->psql('postgres',
+qq[
+    CREATE TABLESPACE dropme_ts1 LOCATION '$dropme_ts_primary1';
+    CREATE TABLESPACE dropme_ts2 LOCATION '$dropme_ts_primary2';
+    CREATE TABLESPACE source_ts  LOCATION '$soruce_ts_primary';
+    CREATE TABLESPACE target_ts  LOCATION '$target_ts_primary';
+    CREATE DATABASE template_db IS_TEMPLATE = true;
+]);
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+my $node_standby = PostgreSQL::Test::Cluster->new('standby2');
+$node_standby->init_from_backup($node_primary, $backup_name, has_streaming => 1);
+$node_standby->start;
+
+# Make sure connection is made
+$node_primary->poll_query_until(
+    'postgres', 'SELECT count(*) = 1 FROM pg_stat_replication');
+
+$node_standby->safe_psql('postgres', 'CHECKPOINT');
+
+# Do immediate shutdown just after a sequence of CREAT DATABASE / DROP
+# DATABASE / DROP TABLESPACE. This causes CREATE DATABASE WAL records
+# to be applied to already-removed directories.
+$node_primary->safe_psql('postgres',
+                        q[CREATE DATABASE dropme_db1 WITH TABLESPACE dropme_ts1;
+                          CREATE DATABASE dropme_db2 WITH TABLESPACE dropme_ts2;
+                          CREATE DATABASE moveme_db TABLESPACE source_ts;
+                          ALTER DATABASE moveme_db SET TABLESPACE target_ts;
+                          CREATE DATABASE newdb TEMPLATE template_db;
+                          ALTER DATABASE template_db IS_TEMPLATE = false;
+                          DROP DATABASE dropme_db1;
+                          DROP DATABASE dropme_db2; DROP TABLESPACE dropme_ts2;
+                          DROP TABLESPACE source_ts;
+                          DROP DATABASE template_db;]);
+
+$node_primary->wait_for_catchup($node_standby, 'replay',
+                               $node_primary->lsn('replay'));
+$node_standby->stop('immediate');
+
+# Should restart ignoring directory creation error.
+is($node_standby->start(fail_ok => 1), 1);
+
+
+# TEST 5
+#
+# Ensure that a missing tablespace directory during create database
+# replay immediately causes panic if the standby has already reached
+# consistent state (archive recovery is in progress).
+
+$node_primary = PostgreSQL::Test::Cluster->new('primary3');
+$node_primary->init(allows_streaming => 1);
+$node_primary->start;
+
+# Create tablespace
+my $ts_primary = $node_primary->new_tablespace('dropme_ts1');
+$node_primary->safe_psql('postgres',
+                         "CREATE TABLESPACE ts1 LOCATION '$ts_primary'");
+$node_primary->safe_psql('postgres', "CREATE DATABASE db1 TABLESPACE ts1");
+
+# Take backup
+$backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+$node_standby = PostgreSQL::Test::Cluster->new('standby3');
+$node_standby->init_from_backup($node_primary, $backup_name, has_streaming => 1);
+$node_standby->start;
+
+# Make sure standby reached consistency and starts accepting connections
+$node_standby->poll_query_until('postgres', 'SELECT 1', '1');
+
+# Remove standby tablespace directory so it will be missing when
+# replay resumes.
+File::Path::rmtree($node_standby->tablespace_dir('dropme_ts1'));
+
+# Create a database in the tablespace and a table in default tablespace
+$node_primary->safe_psql('postgres',
+                        q[CREATE TABLE should_not_replay_insertion(a int);
+                          CREATE DATABASE db2 WITH TABLESPACE ts1;
+                          INSERT INTO should_not_replay_insertion VALUES (1);]);
+
+# Standby should fail and should not silently skip replaying the wal
+if ($node_primary->poll_query_until(
+        'postgres',
+        'SELECT count(*) = 0 FROM pg_stat_replication',
+        't') == 1)
+{
+    pass('standby failed as expected');
+    # We know that the standby has failed.  Setting its pid to
+    # undefined avoids error when PostgreNode module tries to stop the
+    # standby node as part of tear_down sequence.
+    $node_standby->{_pid} = undef;
+}
+else
+{
+    fail('standby did not fail within 5 seconds');
+}
-- 
2.27.0

From ad9266ffdf4fdfc850218d6bb558127a2753f05c Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvherre@alvh.no-ip.org>
Date: Thu, 9 Jan 2020 17:54:40 -0300
Subject: [PATCH v15 3/3] Fix replay of create database records on standby

Crash recovery on standby may encounter missing directories when
replaying create database WAL records.  Prior to this patch, the
standby would fail to recover in such a case.  However, the
directories could be legitimately missing.  Consider a sequence of WAL
records as follows:

    CREATE DATABASE
    DROP DATABASE
    DROP TABLESPACE

If, after replaying the last WAL record and removing the tablespace
directory, the standby crashes and has to replay the create database
record again, the crash recovery must be able to move on.

This patch adds mechanism similar to invalid page hash table, to track
missing directories during crash recovery.  If all the missing
directory references are matched with corresponding drop records at
the end of crash recovery, the standby can safely enter archive
recovery.

Bug identified by Paul Guo.

Authored by Paul Guo, Kyotaro Horiguchi and Asim R P.
---
 src/backend/access/transam/xlog.c      |   6 +
 src/backend/access/transam/xlogutils.c | 145 +++++++++++++++++++++++++
 src/backend/commands/dbcommands.c      |  55 ++++++++++
 src/backend/commands/tablespace.c      |   5 +
 src/include/access/xlogutils.h         |   4 +
 5 files changed, 215 insertions(+)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index c9d4cbf3ff..ec279c6158 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8314,6 +8314,12 @@ CheckRecoveryConsistency(void)
          */
         XLogCheckInvalidPages();
 
+        /*
+         * Check if the XLOG sequence contained any unresolved references to
+         * missing directories.
+         */
+        XLogCheckMissingDirs();
+
         reachedConsistency = true;
         ereport(LOG,
                 (errmsg("consistent recovery state reached at %X/%X",
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 90e1c48390..cd00e0f01e 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -79,6 +79,151 @@ typedef struct xl_invalid_page
 
 static HTAB *invalid_page_tab = NULL;
 
+/*
+ * If a create database WAL record is being replayed more than once during
+ * crash recovery on a standby, it is possible that either the tablespace
+ * directory or the template database directory is missing.  This happens when
+ * the directories are removed by replay of subsequent drop records.  Note
+ * that this problem happens only on standby and not on master.  On master, a
+ * checkpoint is created at the end of create database operation. On standby,
+ * however, such a strategy (creating restart points during replay) is not
+ * viable because it will slow down WAL replay.
+ *
+ * The alternative is to track references to each missing directory
+ * encountered when performing crash recovery in the following hash table.
+ * Similar to invalid page table above, the expectation is that each missing
+ * directory entry should be matched with a drop database or drop tablespace
+ * WAL record by the end of crash recovery.
+ */
+typedef struct xl_missing_dir_key
+{
+    Oid spcNode;
+    Oid dbNode;
+} xl_missing_dir_key;
+
+typedef struct xl_missing_dir
+{
+    xl_missing_dir_key key;
+    char path[MAXPGPATH];
+} xl_missing_dir;
+
+static HTAB *missing_dir_tab = NULL;
+
+void
+XLogReportMissingDir(Oid spcNode, Oid dbNode, char *path)
+{
+    xl_missing_dir_key key;
+    bool found;
+    xl_missing_dir *entry;
+
+    /*
+     * Database OID may be invalid but tablespace OID must be valid.  If
+     * dbNode is InvalidOid, we are logging a missing tablespace directory,
+     * otherwise we are logging a missing database directory.
+     */
+    Assert(OidIsValid(spcNode));
+
+    if (missing_dir_tab == NULL)
+    {
+        /* create hash table when first needed */
+        HASHCTL        ctl;
+
+        memset(&ctl, 0, sizeof(ctl));
+        ctl.keysize = sizeof(xl_missing_dir_key);
+        ctl.entrysize = sizeof(xl_missing_dir);
+
+        missing_dir_tab = hash_create("XLOG missing directory table",
+                                       100,
+                                       &ctl,
+                                       HASH_ELEM | HASH_BLOBS);
+    }
+
+    key.spcNode = spcNode;
+    key.dbNode = dbNode;
+
+    entry = hash_search(missing_dir_tab, &key, HASH_ENTER, &found);
+
+    if (found)
+    {
+        if (dbNode == InvalidOid)
+            elog(DEBUG2, "missing directory %s (tablespace %d) already exists: %s",
+                 path, spcNode, entry->path);
+        else
+            elog(DEBUG2, "missing directory %s (tablespace %d database %d) already exists: %s",
+                 path, spcNode, dbNode, entry->path);
+    }
+    else
+    {
+        strlcpy(entry->path, path, sizeof(entry->path));
+        if (dbNode == InvalidOid)
+            elog(DEBUG2, "logged missing dir %s (tablespace %d)",
+                 path, spcNode);
+        else
+            elog(DEBUG2, "logged missing dir %s (tablespace %d database %d)",
+                 path, spcNode, dbNode);
+    }
+}
+
+void
+XLogForgetMissingDir(Oid spcNode, Oid dbNode)
+{
+    xl_missing_dir_key key;
+
+    key.spcNode = spcNode;
+    key.dbNode = dbNode;
+
+    /* Database OID may be invalid but tablespace OID must be valid. */
+    Assert(OidIsValid(spcNode));
+
+    if (missing_dir_tab == NULL)
+        return;
+
+    if (hash_search(missing_dir_tab, &key, HASH_REMOVE, NULL) != NULL)
+    {
+        if (dbNode == InvalidOid)
+        {
+            elog(DEBUG2, "forgot missing dir (tablespace %d)", spcNode);
+        }
+        else
+        {
+            char *path = GetDatabasePath(dbNode, spcNode);
+
+            elog(DEBUG2, "forgot missing dir %s (tablespace %d database %d)",
+                 path, spcNode, dbNode);
+            pfree(path);
+        }
+    }
+}
+
+/*
+ * This is called at the end of crash recovery, before entering archive
+ * recovery on a standby.  PANIC if the hash table is not empty.
+ */
+void
+XLogCheckMissingDirs(void)
+{
+    HASH_SEQ_STATUS status;
+    xl_missing_dir *hentry;
+    bool        foundone = false;
+
+    if (missing_dir_tab == NULL)
+        return;                    /* nothing to do */
+
+    hash_seq_init(&status, missing_dir_tab);
+
+    while ((hentry = (xl_missing_dir *) hash_seq_search(&status)) != NULL)
+    {
+        elog(WARNING, "missing directory \"%s\" tablespace %d database %d",
+             hentry->path, hentry->key.spcNode, hentry->key.dbNode);
+        foundone = true;
+    }
+
+    if (foundone)
+        elog(PANIC, "WAL contains references to missing directories");
+
+    hash_destroy(missing_dir_tab);
+    missing_dir_tab = NULL;
+}
 
 /* Report a reference to an invalid page */
 static void
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 509d1a3e92..02b080e4ef 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -2143,7 +2143,9 @@ dbase_redo(XLogReaderState *record)
         xl_dbase_create_rec *xlrec = (xl_dbase_create_rec *) XLogRecGetData(record);
         char       *src_path;
         char       *dst_path;
+        char       *parent_path;
         struct stat st;
+        bool        skip = false;
 
         src_path = GetDatabasePath(xlrec->src_db_id, xlrec->src_tablespace_id);
         dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id);
@@ -2161,6 +2163,55 @@ dbase_redo(XLogReaderState *record)
                         (errmsg("some useless files may be left behind in old database directory \"%s\"",
                                 dst_path)));
         }
+        else if (!reachedConsistency)
+        {
+            /*
+             * It is possible that drop tablespace record appearing later in
+             * the WAL as already been replayed.  That means we are replaying
+             * the create database record second time, as part of crash
+             * recovery.  In that case, the tablespace directory has already
+             * been removed and the create database operation cannot be
+             * replayed.  We should skip the replay but remember the missing
+             * tablespace directory, to be matched with a drop tablespace
+             * record later.
+             */
+            parent_path = pstrdup(dst_path);
+            get_parent_directory(parent_path);
+            if (!(stat(parent_path, &st) == 0 && S_ISDIR(st.st_mode)))
+            {
+                XLogReportMissingDir(xlrec->tablespace_id, InvalidOid, parent_path);
+                skip = true;
+                ereport(WARNING,
+                        (errmsg("skipping create database WAL record"),
+                         errdetail("Target tablespace \"%s\" not found. We "
+                                   "expect to encounter a WAL record that "
+                                   "removes this directory before reaching "
+                                   "consistent state.", parent_path)));
+            }
+            pfree(parent_path);
+        }
+
+        /*
+         * Source directory may be missing.  E.g. the template database used
+         * for creating this database may have been dropped, due to reasons
+         * noted above.  Moving a database from one tablespace may also be a
+         * partner in the crime.
+         */
+        if (!(stat(src_path, &st) == 0 && S_ISDIR(st.st_mode)) &&
+            !reachedConsistency)
+        {
+            XLogReportMissingDir(xlrec->src_tablespace_id, xlrec->src_db_id, src_path);
+            skip = true;
+            ereport(WARNING,
+                    (errmsg("skipping create database WAL record"),
+                     errdetail("Source database \"%s\" not found. We expect "
+                               "to encounter a WAL record that removes this "
+                               "directory before reaching consistent state.",
+                               src_path)));
+        }
+
+        if (skip)
+            return;
 
         /*
          * Force dirty buffers out to disk, to ensure source database is
@@ -2218,6 +2269,10 @@ dbase_redo(XLogReaderState *record)
                 ereport(WARNING,
                         (errmsg("some useless files may be left behind in old database directory \"%s\"",
                                 dst_path)));
+
+            if (!reachedConsistency)
+                XLogForgetMissingDir(xlrec->tablespace_ids[i], xlrec->db_id);
+
             pfree(dst_path);
         }
 
diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index b2ccf5e06e..b2975a0bd2 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -1565,6 +1565,11 @@ tblspc_redo(XLogReaderState *record)
     {
         xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);
 
+        if (!reachedConsistency)
+            XLogForgetMissingDir(xlrec->ts_id, InvalidOid);
+
+        XLogFlush(record->EndRecPtr);
+
         /*
          * If we issued a WAL record for a drop tablespace it implies that
          * there were no files in it at all when the DROP was done. That means
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
index 64708949db..5d9c20cae7 100644
--- a/src/include/access/xlogutils.h
+++ b/src/include/access/xlogutils.h
@@ -65,6 +65,10 @@ extern void XLogDropDatabase(Oid dbid);
 extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
                                  BlockNumber nblocks);
 
+extern void XLogReportMissingDir(Oid spcNode, Oid dbNode, char *path);
+extern void XLogForgetMissingDir(Oid spcNode, Oid dbNode);
+extern void XLogCheckMissingDirs(void);
+
 /* Result codes for XLogReadBufferForRedo[Extended] */
 typedef enum
 {
-- 
2.27.0


pgsql-hackers by date:

Previous
From: "tanghy.fnst@fujitsu.com"
Date:
Subject: RE: Support tab completion for upper character inputs in psql
Next
From: "kuroda.hayato@fujitsu.com"
Date:
Subject: RE: [Proposal] Add foreign-server health checks infrastructure