Thread: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

crash-recovery replay of CREATE TABLESPACE is broken in HEAD

From
Tom Lane
Date:
I managed to crash the executor in the tablespace.sql test while working
on a 9.1 patch, and discovered that the postmaster fails to recover
from that.  The end of postmaster.log looks like

LOG:  all server processes terminated; reinitializing
LOG:  database system was interrupted; last known up at 2010-07-11 19:30:07 EDT
LOG:  database system was not properly shut down; automatic recovery in progress
LOG:  consistent recovery state reached at 0/EE26F30
LOG:  redo starts at 0/EE26F30
FATAL:  directory "/home/postgres/pgsql/src/test/regress/testtablespace/PG_9.1_201004261" already in use as a
tablespace
CONTEXT:  xlog redo create ts: 127158 "/home/postgres/pgsql/src/test/regress/testtablespace"
LOG:  startup process (PID 13914) exited with exit code 1
LOG:  aborting startup due to startup process failure

It looks to me like those well-intentioned recent changes in this area
broke the crash-recovery case.  Not good.
        regards, tom lane


Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

From
Bruce Momjian
Date:
Tom Lane wrote:
> I managed to crash the executor in the tablespace.sql test while working
> on a 9.1 patch, and discovered that the postmaster fails to recover
> from that.  The end of postmaster.log looks like
>
> LOG:  all server processes terminated; reinitializing
> LOG:  database system was interrupted; last known up at 2010-07-11 19:30:07 EDT
> LOG:  database system was not properly shut down; automatic recovery in progress
> LOG:  consistent recovery state reached at 0/EE26F30
> LOG:  redo starts at 0/EE26F30
> FATAL:  directory "/home/postgres/pgsql/src/test/regress/testtablespace/PG_9.1_201004261" already in use as a
tablespace
> CONTEXT:  xlog redo create ts: 127158 "/home/postgres/pgsql/src/test/regress/testtablespace"
> LOG:  startup process (PID 13914) exited with exit code 1
> LOG:  aborting startup due to startup process failure
>
> It looks to me like those well-intentioned recent changes in this area
> broke the crash-recovery case.  Not good.

Sorry for the delay.  I didn't realize this was my code that was broken
until Heikki told me via IM.

The bug is that we can't replay mkdir()/symlink() and assume those will
always succeed.  I looked at the createdb redo code and it basically
drops the directory before creating it.

The tablespace directory/symlink setup is more complex, so I just wrote
the attached patch to trigger a redo-'delete' tablespace operation
before the create tablespace redo operation.

Ignoring mkdir/symlink creation failure is not an option because the
symlink might point to some wrong location or something.

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

  + None of us is going to be here forever. +
Index: src/backend/commands/tablespace.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/commands/tablespace.c,v
retrieving revision 1.77
diff -c -c -r1.77 tablespace.c
*** src/backend/commands/tablespace.c    18 Jul 2010 04:47:46 -0000    1.77
--- src/backend/commands/tablespace.c    18 Jul 2010 05:17:23 -0000
***************
*** 1355,1368 ****
      /* Backup blocks are not used in tblspc records */
      Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

!     if (info == XLOG_TBLSPC_CREATE)
!     {
!         xl_tblspc_create_rec *xlrec = (xl_tblspc_create_rec *) XLogRecGetData(record);
!         char       *location = xlrec->ts_path;
!
!         create_tablespace_directories(location, xlrec->ts_id);
!     }
!     else if (info == XLOG_TBLSPC_DROP)
      {
          xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);

--- 1355,1365 ----
      /* Backup blocks are not used in tblspc records */
      Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

!     /*
!      *    If we are creating a tablespace during recovery, it is unclear
!      *    what state it is in, so potentially remove it before creating it.
!      */
!     if (info == XLOG_TBLSPC_DROP || info == XLOG_TBLSPC_CREATE)
      {
          xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);

***************
*** 1395,1400 ****
--- 1392,1407 ----
      }
      else
          elog(PANIC, "tblspc_redo: unknown op code %u", info);
+
+     /* Now create the tablespace we perhaps just removed. */
+     if (info == XLOG_TBLSPC_CREATE)
+     {
+         xl_tblspc_create_rec *xlrec = (xl_tblspc_create_rec *) XLogRecGetData(record);
+         char       *location = xlrec->ts_path;
+
+         create_tablespace_directories(location, xlrec->ts_id);
+     }
+
  }

  void

Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

From
Heikki Linnakangas
Date:
On 18/07/10 08:22, Bruce Momjian wrote:
> The bug is that we can't replay mkdir()/symlink() and assume those will
> always succeed.  I looked at the createdb redo code and it basically
> drops the directory before creating it.
>
> The tablespace directory/symlink setup is more complex, so I just wrote
> the attached patch to trigger a redo-'delete' tablespace operation
> before the create tablespace redo operation.

Redoing a drop talespace assumes the tablespace directory is empty, 
which it necessarily isn't when redoing a create tablespace command:

postgres=# CREATE TABLESPACE t LOCATION '/tmp/t';
CREATE TABLESPACE
postgres=#  CREATE TABLE tfoo (id int4) TABLESPACE t;
CREATE TABLE
postgres=# \q
$ killall -9 postmaster
$ bin/postmaster -D data
LOG:  database system was interrupted; last known up at 2010-07-18 
08:48:32 EEST
LOG:  database system was not properly shut down; automatic recovery in 
progress
LOG:  consistent recovery state reached at 0/5E889C
LOG:  redo starts at 0/5E889C
FATAL:  tablespace 16402 is not empty
CONTEXT:  xlog redo create ts: 16402 "/tmp/t"
LOG:  startup process (PID 5987) exited with exit code 1
LOG:  aborting startup due to startup process failure

Also, casting the xl_tblspc_create_rec as xl_tblspc_drop_rec is a bit 
questionable. It works because both structs begin with the tablespace 
Oid, but it doesn't look right, and can break in hard-to-notice ways in 
the future if the structure of those structs change in the future.

> Ignoring mkdir/symlink creation failure is not an option because the
> symlink might point to some wrong location or something.

Maybe you should check that it points to the right location? Or drop and 
recreate the symlink, and ignore failure at mkdir.

--   Heikki Linnakangas  EnterpriseDB   http://www.enterprisedb.com


Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

From
Tom Lane
Date:
Heikki Linnakangas <heikki.linnakangas@enterprisedb.com> writes:
> Maybe you should check that it points to the right location? Or drop and 
> recreate the symlink, and ignore failure at mkdir.

More specifically, ignore EEXIST failure when replaying mkdir.  Anything
else is still a problem.
        regards, tom lane


Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

From
Bruce Momjian
Date:
Tom Lane wrote:
> Heikki Linnakangas <heikki.linnakangas@enterprisedb.com> writes:
> > Maybe you should check that it points to the right location? Or drop and
> > recreate the symlink, and ignore failure at mkdir.
>
> More specifically, ignore EEXIST failure when replaying mkdir.  Anything
> else is still a problem.

Thanks for the help.  I tried to find somewhere else in our recovery
code that was similar but didn't find anything.

The attached patch does as suggested.  I added the recovery code to the
create tablespace function so I didn't have to duplicate all the code
that computes the path names.

Attached.

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

  + None of us is going to be here forever. +
Index: src/backend/commands/tablespace.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/commands/tablespace.c,v
retrieving revision 1.77
diff -c -c -r1.77 tablespace.c
*** src/backend/commands/tablespace.c    18 Jul 2010 04:47:46 -0000    1.77
--- src/backend/commands/tablespace.c    18 Jul 2010 15:53:34 -0000
***************
*** 568,578 ****
       */
      if (mkdir(location_with_version_dir, S_IRWXU) < 0)
      {
          if (errno == EEXIST)
!             ereport(ERROR,
!                     (errcode(ERRCODE_OBJECT_IN_USE),
!                      errmsg("directory \"%s\" already in use as a tablespace",
!                             location_with_version_dir)));
          else
              ereport(ERROR,
                      (errcode_for_file_access(),
--- 568,582 ----
       */
      if (mkdir(location_with_version_dir, S_IRWXU) < 0)
      {
+         /* In recovery, directory might already exists, which is OK */
          if (errno == EEXIST)
!         {
!             if (!InRecovery)
!                 ereport(ERROR,
!                         (errcode(ERRCODE_OBJECT_IN_USE),
!                          errmsg("directory \"%s\" already in use as a tablespace",
!                                 location_with_version_dir)));
!         }
          else
              ereport(ERROR,
                      (errcode_for_file_access(),
***************
*** 580,585 ****
--- 584,599 ----
                              location_with_version_dir)));
      }

+     /* Remove old symlink in recovery, in case it points to the wrong place */
+     if (InRecovery)
+     {
+         if (unlink(linkloc) < 0 && errno != ENOENT)
+             ereport(ERROR,
+                     (errcode_for_file_access(),
+                      errmsg("could not remove symbolic link \"%s\": %m",
+                             linkloc)));
+     }
+
      /*
       * Create the symlink under PGDATA
       */

Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

From
Bruce Momjian
Date:
Bruce Momjian wrote:
> Tom Lane wrote:
> > Heikki Linnakangas <heikki.linnakangas@enterprisedb.com> writes:
> > > Maybe you should check that it points to the right location? Or drop and 
> > > recreate the symlink, and ignore failure at mkdir.
> > 
> > More specifically, ignore EEXIST failure when replaying mkdir.  Anything
> > else is still a problem.
> 
> Thanks for the help.  I tried to find somewhere else in our recovery
> code that was similar but didn't find anything.
> 
> The attached patch does as suggested.  I added the recovery code to the
> create tablespace function so I didn't have to duplicate all the code
> that computes the path names.
> 
> Attached.

Uh, another question.  Looking at the createdb recovery, I see:
       /*        * Our theory for replaying a CREATE is to forcibly drop the target        * subdirectory if present,
thenre-copy the source data. This may be        * more work than needed, but it is simple to implement.        */
if(stat(dst_path, &st) == 0 && S_ISDIR(st.st_mode))       {           if (!rmtree(dst_path, true))
ereport(WARNING,                      (errmsg("some useless files may be left behind in old database directory \"%s\"",
                             dst_path)));       }
 

Should I be using rmtree() on the mkdir target?

Also, the original tablespace recovery code did not drop the symlink
first.  I assume that was not a bug only because we don't support moving
tablespaces:

-               /* Create the symlink if not already present */
-               linkloc = (char *) palloc(OIDCHARS + OIDCHARS + 1);
-               sprintf(linkloc, "pg_tblspc/%u", xlrec->ts_id);
-
-               if (symlink(location, linkloc) < 0)
-               {
-                       if (errno != EEXIST)
-                               ereport(ERROR,
-                                               (errcode_for_file_access(),
-                                                errmsg("could not create symbolic link \"%s\": %m",
-                                                               linkloc)));
-               }

Still, it seems logical to unlink it before creating it.

--  Bruce Momjian  <bruce@momjian.us>        http://momjian.us EnterpriseDB
http://enterprisedb.com
 + None of us is going to be here forever. +


Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

From
Bruce Momjian
Date:
Bruce Momjian wrote:
> > The attached patch does as suggested.  I added the recovery code to the
> > create tablespace function so I didn't have to duplicate all the code
> > that computes the path names.
> >
> > Attached.
>
> Uh, another question.  Looking at the createdb recovery, I see:
>
>         /*
>          * Our theory for replaying a CREATE is to forcibly drop the target
>          * subdirectory if present, then re-copy the source data. This may be
>          * more work than needed, but it is simple to implement.
>          */
>         if (stat(dst_path, &st) == 0 && S_ISDIR(st.st_mode))
>         {
>             if (!rmtree(dst_path, true))
>                 ereport(WARNING,
>                         (errmsg("some useless files may be left behind in old database directory \"%s\"",
>                                 dst_path)));
>         }
>
> Should I be using rmtree() on the mkdir target?
>
> Also, the original tablespace recovery code did not drop the symlink
> first.  I assume that was not a bug only because we don't support moving
> tablespaces:

For consistency with CREATE DATABASE recovery and for reliablity, I
coded the rmtree() call instead.  Patch attached.

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

  + None of us is going to be here forever. +
Index: src/backend/commands/tablespace.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/commands/tablespace.c,v
retrieving revision 1.77
diff -c -c -r1.77 tablespace.c
*** src/backend/commands/tablespace.c    18 Jul 2010 04:47:46 -0000    1.77
--- src/backend/commands/tablespace.c    19 Jul 2010 04:59:03 -0000
***************
*** 538,543 ****
--- 538,544 ----
      char       *linkloc = palloc(OIDCHARS + OIDCHARS + 1);
      char       *location_with_version_dir = palloc(strlen(location) + 1 +
                                     strlen(TABLESPACE_VERSION_DIRECTORY) + 1);
+     struct stat st;

      sprintf(linkloc, "pg_tblspc/%u", tablespaceoid);
      sprintf(location_with_version_dir, "%s/%s", location,
***************
*** 562,567 ****
--- 563,584 ----
                           location)));
      }

+     if (InRecovery)
+     {
+         /*
+          * Our theory for replaying a CREATE is to forcibly drop the target
+          * subdirectory if present, and then recreate it. This may be
+          * more work than needed, but it is simple to implement.
+          */
+         if (stat(location_with_version_dir, &st) == 0 && S_ISDIR(st.st_mode))
+         {
+             if (!rmtree(location_with_version_dir, true))
+                 ereport(WARNING,
+                         (errmsg("some useless files may be left behind in old database directory \"%s\"",
+                                 location_with_version_dir)));
+         }
+     }
+
      /*
       * The creation of the version directory prevents more than one tablespace
       * in a single location.
***************
*** 580,585 ****
--- 597,612 ----
                              location_with_version_dir)));
      }

+     /* Remove old symlink in recovery, in case it points to the wrong place */
+     if (InRecovery)
+     {
+         if (unlink(linkloc) < 0 && errno != ENOENT)
+             ereport(ERROR,
+                     (errcode_for_file_access(),
+                      errmsg("could not remove symbolic link \"%s\": %m",
+                             linkloc)));
+     }
+
      /*
       * Create the symlink under PGDATA
       */

Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

From
Bruce Momjian
Date:
Bruce Momjian wrote:
> Bruce Momjian wrote:
> > > The attached patch does as suggested.  I added the recovery code to the
> > > create tablespace function so I didn't have to duplicate all the code
> > > that computes the path names.
> > >
> > > Attached.
> >
> > Uh, another question.  Looking at the createdb recovery, I see:
> >
> >         /*
> >          * Our theory for replaying a CREATE is to forcibly drop the target
> >          * subdirectory if present, then re-copy the source data. This may be
> >          * more work than needed, but it is simple to implement.
> >          */
> >         if (stat(dst_path, &st) == 0 && S_ISDIR(st.st_mode))
> >         {
> >             if (!rmtree(dst_path, true))
> >                 ereport(WARNING,
> >                         (errmsg("some useless files may be left behind in old database directory \"%s\"",
> >                                 dst_path)));
> >         }
> >
> > Should I be using rmtree() on the mkdir target?
> >
> > Also, the original tablespace recovery code did not drop the symlink
> > first.  I assume that was not a bug only because we don't support moving
> > tablespaces:
>
> For consistency with CREATE DATABASE recovery and for reliablity, I
> coded the rmtree() call instead.  Patch attached.

Attached patch applied to HEAD and 9.0.   9.0 open item moved to
completed.

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

  + None of us is going to be here forever. +
Index: src/backend/commands/dbcommands.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/commands/dbcommands.c,v
retrieving revision 1.235
diff -c -c -r1.235 dbcommands.c
*** src/backend/commands/dbcommands.c    26 Feb 2010 02:00:38 -0000    1.235
--- src/backend/commands/dbcommands.c    20 Jul 2010 18:11:06 -0000
***************
*** 1908,1913 ****
--- 1908,1914 ----
          if (stat(dst_path, &st) == 0 && S_ISDIR(st.st_mode))
          {
              if (!rmtree(dst_path, true))
+                 /* If this failed, copydir() below is going to error. */
                  ereport(WARNING,
                          (errmsg("some useless files may be left behind in old database directory \"%s\"",
                                  dst_path)));
Index: src/backend/commands/tablespace.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/commands/tablespace.c,v
retrieving revision 1.77
diff -c -c -r1.77 tablespace.c
*** src/backend/commands/tablespace.c    18 Jul 2010 04:47:46 -0000    1.77
--- src/backend/commands/tablespace.c    20 Jul 2010 18:11:07 -0000
***************
*** 562,567 ****
--- 562,586 ----
                           location)));
      }

+     if (InRecovery)
+     {
+         struct stat st;
+
+         /*
+          * Our theory for replaying a CREATE is to forcibly drop the target
+          * subdirectory if present, and then recreate it. This may be
+          * more work than needed, but it is simple to implement.
+          */
+         if (stat(location_with_version_dir, &st) == 0 && S_ISDIR(st.st_mode))
+         {
+             if (!rmtree(location_with_version_dir, true))
+                 /* If this failed, mkdir() below is going to error. */
+                 ereport(WARNING,
+                         (errmsg("some useless files may be left behind in old database directory \"%s\"",
+                                 location_with_version_dir)));
+         }
+     }
+
      /*
       * The creation of the version directory prevents more than one tablespace
       * in a single location.
***************
*** 580,585 ****
--- 599,614 ----
                              location_with_version_dir)));
      }

+     /* Remove old symlink in recovery, in case it points to the wrong place */
+     if (InRecovery)
+     {
+         if (unlink(linkloc) < 0 && errno != ENOENT)
+             ereport(ERROR,
+                     (errcode_for_file_access(),
+                      errmsg("could not remove symbolic link \"%s\": %m",
+                             linkloc)));
+     }
+
      /*
       * Create the symlink under PGDATA
       */