Re: Report: race conditions in WAL replay routines - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Report: race conditions in WAL replay routines
Date
Msg-id 23714.1328483673@sss.pgh.pa.us
Whole thread Raw
In response to Re: Report: race conditions in WAL replay routines  (Simon Riggs <simon@2ndQuadrant.com>)
Responses Re: Report: race conditions in WAL replay routines
List pgsql-hackers
Simon Riggs <simon@2ndQuadrant.com> writes:
> Please post the patch rather than fixing directly. There's some subtle
> stuff there and it would be best to discuss first.

And here's a proposed patch for not throwing ERROR during replay of DROP
TABLESPACE.  I had originally thought this would be a one-liner
s/ERROR/LOG/, but on inspection destroy_tablespace_directories() really
needs to be changed too, so that it doesn't throw error for unremovable
directories.

            regards, tom lane

diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index 8ed6d06bb2370a54349f2a86ee13e08a381eacf2..ed0c9ea3ee5a372c6926f41fc0fc93ed7d80d6cd 100644
*** a/src/backend/commands/tablespace.c
--- b/src/backend/commands/tablespace.c
*************** create_tablespace_directories(const char
*** 626,634 ****
  /*
   * destroy_tablespace_directories
   *
!  * Attempt to remove filesystem infrastructure
   *
!  * 'redo' indicates we are redoing a drop from XLOG; okay if nothing there
   *
   * Returns TRUE if successful, FALSE if some subdirectory is not empty
   */
--- 626,638 ----
  /*
   * destroy_tablespace_directories
   *
!  * Attempt to remove filesystem infrastructure for the tablespace.
   *
!  * 'redo' indicates we are redoing a drop from XLOG; in that case we should
!  * not throw an ERROR for problems, just LOG them.  The worst consequence of
!  * not removing files here would be failure to release some disk space, which
!  * does not justify throwing an error that would require manual intervention
!  * to get the database running again.
   *
   * Returns TRUE if successful, FALSE if some subdirectory is not empty
   */
*************** destroy_tablespace_directories(Oid table
*** 681,686 ****
--- 685,700 ----
              pfree(linkloc_with_version_dir);
              return true;
          }
+         else if (redo)
+         {
+             /* in redo, just log other types of error */
+             ereport(LOG,
+                     (errcode_for_file_access(),
+                      errmsg("could not open directory \"%s\": %m",
+                             linkloc_with_version_dir)));
+             pfree(linkloc_with_version_dir);
+             return false;
+         }
          /* else let ReadDir report the error */
      }

*************** destroy_tablespace_directories(Oid table
*** 704,710 ****

          /* remove empty directory */
          if (rmdir(subfile) < 0)
!             ereport(ERROR,
                      (errcode_for_file_access(),
                       errmsg("could not remove directory \"%s\": %m",
                              subfile)));
--- 718,724 ----

          /* remove empty directory */
          if (rmdir(subfile) < 0)
!             ereport(redo ? LOG : ERROR,
                      (errcode_for_file_access(),
                       errmsg("could not remove directory \"%s\": %m",
                              subfile)));
*************** destroy_tablespace_directories(Oid table
*** 716,738 ****

      /* remove version directory */
      if (rmdir(linkloc_with_version_dir) < 0)
!         ereport(ERROR,
                  (errcode_for_file_access(),
                   errmsg("could not remove directory \"%s\": %m",
                          linkloc_with_version_dir)));

      /*
       * Try to remove the symlink.  We must however deal with the possibility
       * that it's a directory instead of a symlink --- this could happen during
       * WAL replay (see TablespaceCreateDbspace), and it is also the case on
       * Windows where junction points lstat() as directories.
       */
      linkloc = pstrdup(linkloc_with_version_dir);
      get_parent_directory(linkloc);
      if (lstat(linkloc, &st) == 0 && S_ISDIR(st.st_mode))
      {
          if (rmdir(linkloc) < 0)
!             ereport(ERROR,
                      (errcode_for_file_access(),
                       errmsg("could not remove directory \"%s\": %m",
                              linkloc)));
--- 730,759 ----

      /* remove version directory */
      if (rmdir(linkloc_with_version_dir) < 0)
!     {
!         ereport(redo ? LOG : ERROR,
                  (errcode_for_file_access(),
                   errmsg("could not remove directory \"%s\": %m",
                          linkloc_with_version_dir)));
+         pfree(linkloc_with_version_dir);
+         return false;
+     }

      /*
       * Try to remove the symlink.  We must however deal with the possibility
       * that it's a directory instead of a symlink --- this could happen during
       * WAL replay (see TablespaceCreateDbspace), and it is also the case on
       * Windows where junction points lstat() as directories.
+      *
+      * Note: in the redo case, we'll return true if this final step fails;
+      * there's no point in retrying it.
       */
      linkloc = pstrdup(linkloc_with_version_dir);
      get_parent_directory(linkloc);
      if (lstat(linkloc, &st) == 0 && S_ISDIR(st.st_mode))
      {
          if (rmdir(linkloc) < 0)
!             ereport(redo ? LOG : ERROR,
                      (errcode_for_file_access(),
                       errmsg("could not remove directory \"%s\": %m",
                              linkloc)));
*************** destroy_tablespace_directories(Oid table
*** 740,746 ****
      else
      {
          if (unlink(linkloc) < 0)
!             ereport(ERROR,
                      (errcode_for_file_access(),
                       errmsg("could not remove symbolic link \"%s\": %m",
                              linkloc)));
--- 761,767 ----
      else
      {
          if (unlink(linkloc) < 0)
!             ereport(redo ? LOG : ERROR,
                      (errcode_for_file_access(),
                       errmsg("could not remove symbolic link \"%s\": %m",
                              linkloc)));
*************** tblspc_redo(XLogRecPtr lsn, XLogRecord *
*** 1451,1464 ****
          xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);

          /*
!          * If we issued a WAL record for a drop tablespace it is because there
!          * were no files in it at all. That means that no permanent objects
!          * can exist in it at this point.
           *
           * It is possible for standby users to be using this tablespace as a
           * location for their temporary files, so if we fail to remove all
           * files then do conflict processing and try again, if currently
           * enabled.
           */
          if (!destroy_tablespace_directories(xlrec->ts_id, true))
          {
--- 1472,1490 ----
          xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);

          /*
!          * If we issued a WAL record for a drop tablespace it implies that
!          * there were no files in it at all when the DROP was done. That means
!          * that no permanent objects can exist in it at this point.
           *
           * It is possible for standby users to be using this tablespace as a
           * location for their temporary files, so if we fail to remove all
           * files then do conflict processing and try again, if currently
           * enabled.
+          *
+          * Other possible reasons for failure include bollixed file permissions
+          * on a standby server when they were okay on the primary, etc etc.
+          * There's not much we can do about that, so just remove what we can
+          * and press on.
           */
          if (!destroy_tablespace_directories(xlrec->ts_id, true))
          {
*************** tblspc_redo(XLogRecPtr lsn, XLogRecord *
*** 1466,1480 ****

              /*
               * If we did recovery processing then hopefully the backends who
!              * wrote temp files should have cleaned up and exited by now. So
!              * lets recheck before we throw an error. If !process_conflicts
!              * then this will just fail again.
               */
              if (!destroy_tablespace_directories(xlrec->ts_id, true))
!                 ereport(ERROR,
                          (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                           errmsg("tablespace %u is not empty",
!                                 xlrec->ts_id)));
          }
      }
      else
--- 1492,1509 ----

              /*
               * If we did recovery processing then hopefully the backends who
!              * wrote temp files should have cleaned up and exited by now.  So
!              * retry before complaining.  If we fail again, this is just a LOG
!              * condition, because it's not worth throwing an ERROR for (as
!              * that would crash the database and require manual intervention
!              * before we could get past this WAL record on restart).
               */
              if (!destroy_tablespace_directories(xlrec->ts_id, true))
!                 ereport(LOG,
                          (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                           errmsg("tablespace %u is not empty",
!                                 xlrec->ts_id),
!                          errdetail("Continuing even though some files could not be removed.")));
          }
      }
      else

pgsql-hackers by date:

Previous
From: Noah Misch
Date:
Subject: Re: initdb and fsync
Next
From: Shigeru Hanada
Date:
Subject: Re: pgsql_fdw, FDW for PostgreSQL server