Re: [HACKERS] Why copy_relation_data only use wal whenWALarchivingis enabled - Mailing list pgsql-patches

From Heikki Linnakangas
Subject Re: [HACKERS] Why copy_relation_data only use wal whenWALarchivingis enabled
Date
Msg-id 4717CC72.6060001@enterprisedb.com
Whole thread Raw
Responses Re: [HACKERS] Why copy_relation_data only use wal whenWALarchivingis enabled
List pgsql-patches
Tom Lane wrote:
> Heikki Linnakangas <heikki@enterprisedb.com> writes:
>> The best I can think of is to rename the obsolete file to
>> <relfilenode>.stale, when it's scheduled for deletion at next
>> checkpoint, and check for .stale-suffixed files in GetNewRelFileNode,
>> and delete them immediately in DropTableSpace.
>
> This is getting too Rube Goldbergian for my tastes.  What if we just
> make DROP TABLESPACE force a checkpoint before proceeding?

Patch attached.

The scenario we're preventing is still possible if for some reason the
latest checkpoint record is damaged, and we start recovery from the
previous checkpoint record. I think the probability of that happening,
together with the OID wrap-around and hitting the relfilenode of a
recently deleted file with a new one, is low enough to not worry about.
If we cared, we could fix it by letting the files to linger for two
checkpoint cycles instead of one.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.286
diff -c -r1.286 xlog.c
*** src/backend/access/transam/xlog.c    12 Oct 2007 19:39:59 -0000    1.286
--- src/backend/access/transam/xlog.c    18 Oct 2007 20:16:56 -0000
***************
*** 45,50 ****
--- 45,51 ----
  #include "storage/fd.h"
  #include "storage/pmsignal.h"
  #include "storage/procarray.h"
+ #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/builtins.h"
  #include "utils/pg_locale.h"
***************
*** 5668,5673 ****
--- 5669,5680 ----
      checkPoint.time = time(NULL);

      /*
+      * Let the md storage manager to prepare for checkpoint before
+      * we determine the REDO pointer.
+      */
+     mdcheckpointbegin();
+
+     /*
       * We must hold WALInsertLock while examining insert state to determine
       * the checkpoint REDO pointer.
       */
***************
*** 5887,5892 ****
--- 5894,5904 ----
      END_CRIT_SECTION();

      /*
+      * Let the md storage manager to do its post-checkpoint work.
+      */
+     mdcheckpointend();
+
+     /*
       * Delete old log files (those no longer needed even for previous
       * checkpoint).
       */
Index: src/backend/commands/tablespace.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/tablespace.c,v
retrieving revision 1.49
diff -c -r1.49 tablespace.c
*** src/backend/commands/tablespace.c    1 Aug 2007 22:45:08 -0000    1.49
--- src/backend/commands/tablespace.c    18 Oct 2007 20:31:53 -0000
***************
*** 460,472 ****
      LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);

      /*
!      * Try to remove the physical infrastructure
       */
      if (!remove_tablespace_directories(tablespaceoid, false))
!         ereport(ERROR,
!                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
!                  errmsg("tablespace \"%s\" is not empty",
!                         tablespacename)));

      /* Record the filesystem change in XLOG */
      {
--- 460,488 ----
      LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);

      /*
!      * Try to remove the physical infrastructure.
       */
      if (!remove_tablespace_directories(tablespaceoid, false))
!     {
!         /*
!          * There can be lingering empty files in the directories, left behind
!          * by for example DROP TABLE, that have been scheduled for deletion
!          * at next checkpoint (see comments in mdunlink() for details). We
!          * could just delete them immediately, but we can't tell them apart
!          * from important data files that we mustn't delete. So instead, we
!          * force a checkpoint which will clean out any lingering files, and
!          * try again.
!          */
!         RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
!         if (!remove_tablespace_directories(tablespaceoid, false))
!         {
!             /* Still not empty, the files must be important then */
!             ereport(ERROR,
!                     (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
!                      errmsg("tablespace \"%s\" is not empty",
!                             tablespacename)));
!         }
!     }

      /* Record the filesystem change in XLOG */
      {
Index: src/backend/storage/smgr/md.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/smgr/md.c,v
retrieving revision 1.129
diff -c -r1.129 md.c
*** src/backend/storage/smgr/md.c    3 Jul 2007 14:51:24 -0000    1.129
--- src/backend/storage/smgr/md.c    18 Oct 2007 21:11:43 -0000
***************
*** 34,39 ****
--- 34,40 ----
  /* special values for the segno arg to RememberFsyncRequest */
  #define FORGET_RELATION_FSYNC    (InvalidBlockNumber)
  #define FORGET_DATABASE_FSYNC    (InvalidBlockNumber-1)
+ #define UNLINK_RELATION_REQUEST    (InvalidBlockNumber-2)

  /*
   * On Windows, we have to interpret EACCES as possibly meaning the same as
***************
*** 113,118 ****
--- 114,123 ----
   * table remembers the pending operations.    We use a hash table mostly as
   * a convenient way of eliminating duplicate requests.
   *
+  * We use a similar mechanism to remember no-longer-needed files that can
+  * be deleted after next checkpoint, but we use a linked list instead of hash
+  * table, because we don't expect there to be any duplicates.
+  *
   * (Regular backends do not track pending operations locally, but forward
   * them to the bgwriter.)
   */
***************
*** 131,139 ****
--- 136,152 ----
      CycleCtr    cycle_ctr;        /* mdsync_cycle_ctr when request was made */
  } PendingOperationEntry;

+ typedef struct
+ {
+     RelFileNode rnode;            /* the dead relation to delete */
+     CycleCtr cycle_ctr;            /* mdunlink_cycle_ctr when request was made */
+ } PendingUnlinkEntry;
+
  static HTAB *pendingOpsTable = NULL;
+ static List *pendingUnlinks = NIL;

  static CycleCtr mdsync_cycle_ctr = 0;
+ static CycleCtr mdunlink_cycle_ctr = 0;


  typedef enum                    /* behavior for mdopen & _mdfd_getseg */
***************
*** 146,151 ****
--- 159,165 ----
  /* local routines */
  static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
  static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
+ static void register_unlink(RelFileNode rnode);
  static MdfdVec *_fdvec_alloc(void);

  #ifndef LET_OS_MANAGE_FILESIZE
***************
*** 188,193 ****
--- 202,208 ----
                                        100L,
                                        &hash_ctl,
                                     HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+         pendingUnlinks = NIL;
      }
  }

***************
*** 257,267 ****
--- 272,300 ----
   * If isRedo is true, it's okay for the relation to be already gone.
   * Also, any failure should be reported as WARNING not ERROR, because
   * we are usually not in a transaction anymore when this is called.
+  *
+  * If isRedo is false, the relation is actually just truncated to release
+  * the space, and left to linger as an empty file until the next checkpoint.
+  * This is to avoid reusing the same relfilenode for a new relation file,
+  * until the commit record containing the deletion has been flushed out.
+  *
+  * The scenario we're trying to protect from is this:
+  * After crash, we replay the commit WAL record of a transaction that
+  * deleted a relation, like DROP TABLE. But before the crash, after deleting
+  * the old relation, we created a new one, and it happened to get the same
+  * relfilenode as the deleted relation (OID must've wrapped around for
+  * that to happen). Now replaying the deletion of the old relation
+  * deletes the new one instead, because they had the same relfilenode.
+  * Normally the new relation would be recreated later in the WAL replay, but
+  * if we relied on fsyncing the file after populating it,  like CLUSTER and
+  * CREATE INDEX do, for example, the contents of the file would be lost
+  * forever.
   */
  void
  mdunlink(RelFileNode rnode, bool isRedo)
  {
      char       *path;
+     int ret;

      /*
       * We have to clean out any pending fsync requests for the doomed relation,
***************
*** 271,278 ****

      path = relpath(rnode);

!     /* Delete the first segment, or only segment if not doing segmenting */
!     if (unlink(path) < 0)
      {
          if (!isRedo || errno != ENOENT)
              ereport(WARNING,
--- 304,318 ----

      path = relpath(rnode);

!     /*
!      * Delete or truncate the first segment, or only segment if not doing
!      * segmenting
!      */
!     if (!isRedo)
!         ret = truncate(path, 0);
!     else
!         ret = unlink(path);
!     if (ret < 0)
      {
          if (!isRedo || errno != ENOENT)
              ereport(WARNING,
***************
*** 316,321 ****
--- 356,364 ----
  #endif

      pfree(path);
+
+     if (!isRedo)
+         register_unlink(rnode);
  }

  /*
***************
*** 1096,1114 ****
      }
  }

  /*
   * RememberFsyncRequest() -- callback from bgwriter side of fsync request
   *
!  * We stuff the fsync request into the local hash table for execution
   * during the bgwriter's next checkpoint.
   *
   * The range of possible segment numbers is way less than the range of
   * BlockNumber, so we can reserve high values of segno for special purposes.
!  * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
!  * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
!  * a whole database.  (These are a tad slow because the hash table has to be
!  * searched linearly, but it doesn't seem worth rethinking the table structure
!  * for them.)
   */
  void
  RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
--- 1139,1268 ----
      }
  }

+
+ /*
+  * register_unlink() -- Schedule a relation to be deleted after next checkpoint
+  */
+ static void
+ register_unlink(RelFileNode rnode)
+ {
+     if (pendingOpsTable)
+     {
+         /* standalone backend or startup process: fsync state is local */
+         RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
+     }
+     else if (IsUnderPostmaster)
+     {
+         /*
+          * Notify the bgwriter about it.  If we fail to queue the revoke
+          * message, we have to sleep and try again ... ugly, but hopefully
+          * won't happen often.
+          *
+          * XXX should we just leave the file orphaned instead?
+          */
+         while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
+             pg_usleep(10000L);    /* 10 msec seems a good number */
+     }
+ }
+
+ /*
+  * mdcheckpointbegin() -- Do pre-checkpoint work
+  *
+  * To distinguish unlink requests that arrived before this checkpoint
+  * started and those that arrived during the checkpoint, we use a cycle
+  * counter similar to the one we use for fsync requests. That cycle
+  * counter is incremented here.
+  *
+  * This must called *before* RedoRecPtr is determined.
+  */
+ void
+ mdcheckpointbegin()
+ {
+     PendingUnlinkEntry *entry;
+     ListCell *cell;
+
+     /*
+      * Just in case the prior checkpoint failed, stamp all entries in
+      * the list with the current cycle counter. Anything that's in the
+      * list at the start of checkpoint can surely be deleted after the
+      * checkpoint is finished, regardless of when the request was made.
+      */
+     foreach(cell, pendingUnlinks)
+     {
+         entry = lfirst(cell);
+         entry->cycle_ctr = mdunlink_cycle_ctr;
+     }
+
+     /*
+      * Any unlink requests arriving after this point will be assigned the
+      * next cycle counter, and won't be unlinked until next checkpoint.
+      */
+     mdunlink_cycle_ctr++;
+ }
+
+ /*
+  * mdcheckpointend() -- Do post-checkpoint work
+  *
+  * Remove any lingering files that can now be safely removed.
+  */
+ void
+ mdcheckpointend()
+ {
+     while(pendingUnlinks != NIL)
+     {
+         PendingUnlinkEntry *entry = linitial(pendingUnlinks);
+         char *path;
+
+         /*
+          * New entries are appended to the end, so if the entry is new
+          * we've reached the end of old entries.
+          */
+         if (entry->cycle_ctr == mdsync_cycle_ctr)
+             break;
+
+         /* Else assert we haven't missed it */
+         Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
+
+         /* Unlink the file */
+         path = relpath(entry->rnode);
+         if (unlink(path) < 0)
+         {
+             /*
+              * ENOENT shouldn't happen either, but it doesn't really matter
+              * because we would've deleted it now anyway.
+              */
+             if (errno != ENOENT)
+                 ereport(WARNING,
+                         (errcode_for_file_access(),
+                          errmsg("could not remove relation %u/%u/%u: %m",
+                                 entry->rnode.spcNode,
+                                 entry->rnode.dbNode,
+                                 entry->rnode.relNode)));
+         }
+         pfree(path);
+
+         pendingUnlinks = list_delete_first(pendingUnlinks);
+     }
+ }
+
  /*
   * RememberFsyncRequest() -- callback from bgwriter side of fsync request
   *
!  * We stuff normal fsync request into the local hash table for execution
   * during the bgwriter's next checkpoint.
   *
   * The range of possible segment numbers is way less than the range of
   * BlockNumber, so we can reserve high values of segno for special purposes.
!  * We define three:
!  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
!  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
!  * - UNLINK_REQUEST is a request to delete a lingering file after next
!  *   checkpoint. These are stuffed into a linked list separate from
!  *   the normal fsync requests.
!  *
!  * (Handling the FORGET_* requests is a tad slow because the hash table has to
!  * be searched linearly, but it doesn't seem worth rethinking the table
!  * structure for them.)
   */
  void
  RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
***************
*** 1147,1152 ****
--- 1301,1322 ----
              }
          }
      }
+     else if (segno == UNLINK_RELATION_REQUEST)
+     {
+         /* Unlink request: put it in the linked list */
+         MemoryContext oldcxt;
+         PendingUnlinkEntry *entry;
+
+         oldcxt = MemoryContextSwitchTo(MdCxt);
+
+         entry = palloc(sizeof(PendingUnlinkEntry));
+         memcpy(&entry->rnode, &rnode, sizeof(RelFileNode));
+         entry->cycle_ctr = mdunlink_cycle_ctr;
+
+         pendingUnlinks = lappend(pendingUnlinks, entry);
+
+         MemoryContextSwitchTo(oldcxt);
+     }
      else
      {
          /* Normal case: enter a request to fsync this segment */
Index: src/include/storage/smgr.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/smgr.h,v
retrieving revision 1.59
diff -c -r1.59 smgr.h
*** src/include/storage/smgr.h    5 Sep 2007 18:10:48 -0000    1.59
--- src/include/storage/smgr.h    18 Oct 2007 09:52:43 -0000
***************
*** 110,115 ****
--- 110,118 ----
  extern void ForgetRelationFsyncRequests(RelFileNode rnode);
  extern void ForgetDatabaseFsyncRequests(Oid dbid);

+ extern void mdcheckpointbegin(void);
+ extern void mdcheckpointend(void);
+
  /* smgrtype.c */
  extern Datum smgrout(PG_FUNCTION_ARGS);
  extern Datum smgrin(PG_FUNCTION_ARGS);

pgsql-patches by date:

Previous
From: "Marko Kreen"
Date:
Subject: Re: [RFC] extended txid docs
Next
From: "Marko Kreen"
Date:
Subject: txid cleanup