Re: [HACKERS] Why copy_relation_data only use wal whenWALarchivingis enabled - Mailing list pgsql-patches

From Heikki Linnakangas
Subject Re: [HACKERS] Why copy_relation_data only use wal whenWALarchivingis enabled
Date
Msg-id 471A0D42.1020507@enterprisedb.com
Whole thread Raw
In response to Re: [HACKERS] Why copy_relation_data only use wal whenWALarchivingis enabled  ("Heikki Linnakangas" <heikki@enterprisedb.com>)
Responses Re: [HACKERS] Why copy_relation_data only use wal whenWALarchivingis enabled
List pgsql-patches
Here's an updated version of the patch. There was a bogus assertion in
the previous one, comparing against mdsync_cycle_ctr instead of
mdunlink_cycle_ctr.

Heikki Linnakangas wrote:
> Tom Lane wrote:
>> Heikki Linnakangas <heikki@enterprisedb.com> writes:
>>> The best I can think of is to rename the obsolete file to
>>> <relfilenode>.stale, when it's scheduled for deletion at next
>>> checkpoint, and check for .stale-suffixed files in GetNewRelFileNode,
>>> and delete them immediately in DropTableSpace.
>> This is getting too Rube Goldbergian for my tastes.  What if we just
>> make DROP TABLESPACE force a checkpoint before proceeding?
>
> Patch attached.
>
> The scenario we're preventing is still possible if for some reason the
> latest checkpoint record is damaged, and we start recovery from the
> previous checkpoint record. I think the probability of that happening,
> together with the OID wrap-around and hitting the relfilenode of a
> recently deleted file with a new one, is low enough to not worry about.
> If we cared, we could fix it by letting the files to linger for two
> checkpoint cycles instead of one.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.286
diff -c -r1.286 xlog.c
*** src/backend/access/transam/xlog.c    12 Oct 2007 19:39:59 -0000    1.286
--- src/backend/access/transam/xlog.c    18 Oct 2007 20:16:56 -0000
***************
*** 45,50 ****
--- 45,51 ----
  #include "storage/fd.h"
  #include "storage/pmsignal.h"
  #include "storage/procarray.h"
+ #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/builtins.h"
  #include "utils/pg_locale.h"
***************
*** 5668,5673 ****
--- 5669,5680 ----
      checkPoint.time = time(NULL);

      /*
+      * Let the md storage manager to prepare for checkpoint before
+      * we determine the REDO pointer.
+      */
+     mdcheckpointbegin();
+
+     /*
       * We must hold WALInsertLock while examining insert state to determine
       * the checkpoint REDO pointer.
       */
***************
*** 5887,5892 ****
--- 5894,5904 ----
      END_CRIT_SECTION();

      /*
+      * Let the md storage manager to do its post-checkpoint work.
+      */
+     mdcheckpointend();
+
+     /*
       * Delete old log files (those no longer needed even for previous
       * checkpoint).
       */
Index: src/backend/commands/tablespace.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/tablespace.c,v
retrieving revision 1.49
diff -c -r1.49 tablespace.c
*** src/backend/commands/tablespace.c    1 Aug 2007 22:45:08 -0000    1.49
--- src/backend/commands/tablespace.c    18 Oct 2007 20:31:53 -0000
***************
*** 460,472 ****
      LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);

      /*
!      * Try to remove the physical infrastructure
       */
      if (!remove_tablespace_directories(tablespaceoid, false))
!         ereport(ERROR,
!                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
!                  errmsg("tablespace \"%s\" is not empty",
!                         tablespacename)));

      /* Record the filesystem change in XLOG */
      {
--- 460,488 ----
      LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);

      /*
!      * Try to remove the physical infrastructure.
       */
      if (!remove_tablespace_directories(tablespaceoid, false))
!     {
!         /*
!          * There can be lingering empty files in the directories, left behind
!          * by for example DROP TABLE, that have been scheduled for deletion
!          * at next checkpoint (see comments in mdunlink() for details). We
!          * could just delete them immediately, but we can't tell them apart
!          * from important data files that we mustn't delete. So instead, we
!          * force a checkpoint which will clean out any lingering files, and
!          * try again.
!          */
!         RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
!         if (!remove_tablespace_directories(tablespaceoid, false))
!         {
!             /* Still not empty, the files must be important then */
!             ereport(ERROR,
!                     (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
!                      errmsg("tablespace \"%s\" is not empty",
!                             tablespacename)));
!         }
!     }

      /* Record the filesystem change in XLOG */
      {
Index: src/backend/storage/smgr/md.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/smgr/md.c,v
retrieving revision 1.129
diff -c -r1.129 md.c
*** src/backend/storage/smgr/md.c    3 Jul 2007 14:51:24 -0000    1.129
--- src/backend/storage/smgr/md.c    20 Oct 2007 14:10:08 -0000
***************
*** 34,39 ****
--- 34,40 ----
  /* special values for the segno arg to RememberFsyncRequest */
  #define FORGET_RELATION_FSYNC    (InvalidBlockNumber)
  #define FORGET_DATABASE_FSYNC    (InvalidBlockNumber-1)
+ #define UNLINK_RELATION_REQUEST    (InvalidBlockNumber-2)

  /*
   * On Windows, we have to interpret EACCES as possibly meaning the same as
***************
*** 113,118 ****
--- 114,123 ----
   * table remembers the pending operations.    We use a hash table mostly as
   * a convenient way of eliminating duplicate requests.
   *
+  * We use a similar mechanism to remember no-longer-needed files that can
+  * be deleted after next checkpoint, but we use a linked list instead of hash
+  * table, because we don't expect there to be any duplicates.
+  *
   * (Regular backends do not track pending operations locally, but forward
   * them to the bgwriter.)
   */
***************
*** 131,139 ****
--- 136,152 ----
      CycleCtr    cycle_ctr;        /* mdsync_cycle_ctr when request was made */
  } PendingOperationEntry;

+ typedef struct
+ {
+     RelFileNode rnode;            /* the dead relation to delete */
+     CycleCtr cycle_ctr;            /* mdunlink_cycle_ctr when request was made */
+ } PendingUnlinkEntry;
+
  static HTAB *pendingOpsTable = NULL;
+ static List *pendingUnlinks = NIL;

  static CycleCtr mdsync_cycle_ctr = 0;
+ static CycleCtr mdunlink_cycle_ctr = 0;


  typedef enum                    /* behavior for mdopen & _mdfd_getseg */
***************
*** 146,151 ****
--- 159,165 ----
  /* local routines */
  static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
  static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
+ static void register_unlink(RelFileNode rnode);
  static MdfdVec *_fdvec_alloc(void);

  #ifndef LET_OS_MANAGE_FILESIZE
***************
*** 188,193 ****
--- 202,208 ----
                                        100L,
                                        &hash_ctl,
                                     HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+         pendingUnlinks = NIL;
      }
  }

***************
*** 257,267 ****
--- 272,300 ----
   * If isRedo is true, it's okay for the relation to be already gone.
   * Also, any failure should be reported as WARNING not ERROR, because
   * we are usually not in a transaction anymore when this is called.
+  *
+  * If isRedo is false, the relation is actually just truncated to release
+  * the space, and left to linger as an empty file until the next checkpoint.
+  * This is to avoid reusing the same relfilenode for a new relation file,
+  * until the commit record containing the deletion has been flushed out.
+  *
+  * The scenario we're trying to protect from is this:
+  * After crash, we replay the commit WAL record of a transaction that
+  * deleted a relation, like DROP TABLE. But before the crash, after deleting
+  * the old relation, we created a new one, and it happened to get the same
+  * relfilenode as the deleted relation (OID must've wrapped around for
+  * that to happen). Now replaying the deletion of the old relation
+  * deletes the new one instead, because they had the same relfilenode.
+  * Normally the new relation would be recreated later in the WAL replay, but
+  * if we relied on fsyncing the file after populating it,  like CLUSTER and
+  * CREATE INDEX do, for example, the contents of the file would be lost
+  * forever.
   */
  void
  mdunlink(RelFileNode rnode, bool isRedo)
  {
      char       *path;
+     int ret;

      /*
       * We have to clean out any pending fsync requests for the doomed relation,
***************
*** 271,278 ****

      path = relpath(rnode);

!     /* Delete the first segment, or only segment if not doing segmenting */
!     if (unlink(path) < 0)
      {
          if (!isRedo || errno != ENOENT)
              ereport(WARNING,
--- 304,318 ----

      path = relpath(rnode);

!     /*
!      * Delete or truncate the first segment, or only segment if not doing
!      * segmenting
!      */
!     if (!isRedo)
!         ret = truncate(path, 0);
!     else
!         ret = unlink(path);
!     if (ret < 0)
      {
          if (!isRedo || errno != ENOENT)
              ereport(WARNING,
***************
*** 316,321 ****
--- 356,364 ----
  #endif

      pfree(path);
+
+     if (!isRedo)
+         register_unlink(rnode);
  }

  /*
***************
*** 1096,1114 ****
      }
  }

  /*
   * RememberFsyncRequest() -- callback from bgwriter side of fsync request
   *
!  * We stuff the fsync request into the local hash table for execution
   * during the bgwriter's next checkpoint.
   *
   * The range of possible segment numbers is way less than the range of
   * BlockNumber, so we can reserve high values of segno for special purposes.
!  * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
!  * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
!  * a whole database.  (These are a tad slow because the hash table has to be
!  * searched linearly, but it doesn't seem worth rethinking the table structure
!  * for them.)
   */
  void
  RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
--- 1139,1268 ----
      }
  }

+
+ /*
+  * register_unlink() -- Schedule a relation to be deleted after next checkpoint
+  */
+ static void
+ register_unlink(RelFileNode rnode)
+ {
+     if (pendingOpsTable)
+     {
+         /* standalone backend or startup process: fsync state is local */
+         RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
+     }
+     else if (IsUnderPostmaster)
+     {
+         /*
+          * Notify the bgwriter about it.  If we fail to queue the revoke
+          * message, we have to sleep and try again ... ugly, but hopefully
+          * won't happen often.
+          *
+          * XXX should we just leave the file orphaned instead?
+          */
+         while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
+             pg_usleep(10000L);    /* 10 msec seems a good number */
+     }
+ }
+
+ /*
+  * mdcheckpointbegin() -- Do pre-checkpoint work
+  *
+  * To distinguish unlink requests that arrived before this checkpoint
+  * started and those that arrived during the checkpoint, we use a cycle
+  * counter similar to the one we use for fsync requests. That cycle
+  * counter is incremented here.
+  *
+  * This must called *before* RedoRecPtr is determined.
+  */
+ void
+ mdcheckpointbegin()
+ {
+     PendingUnlinkEntry *entry;
+     ListCell *cell;
+
+     /*
+      * Just in case the prior checkpoint failed, stamp all entries in
+      * the list with the current cycle counter. Anything that's in the
+      * list at the start of checkpoint can surely be deleted after the
+      * checkpoint is finished, regardless of when the request was made.
+      */
+     foreach(cell, pendingUnlinks)
+     {
+         entry = lfirst(cell);
+         entry->cycle_ctr = mdunlink_cycle_ctr;
+     }
+
+     /*
+      * Any unlink requests arriving after this point will be assigned the
+      * next cycle counter, and won't be unlinked until next checkpoint.
+      */
+     mdunlink_cycle_ctr++;
+ }
+
+ /*
+  * mdcheckpointend() -- Do post-checkpoint work
+  *
+  * Remove any lingering files that can now be safely removed.
+  */
+ void
+ mdcheckpointend()
+ {
+     while(pendingUnlinks != NIL)
+     {
+         PendingUnlinkEntry *entry = linitial(pendingUnlinks);
+         char *path;
+
+         /*
+          * New entries are appended to the end, so if the entry is new
+          * we've reached the end of old entries.
+          */
+         if (entry->cycle_ctr == mdsync_cycle_ctr)
+             break;
+
+         /* Else assert we haven't missed it */
+         Assert((CycleCtr) (entry->cycle_ctr + 1) == mdunlink_cycle_ctr);
+
+         /* Unlink the file */
+         path = relpath(entry->rnode);
+         if (unlink(path) < 0)
+         {
+             /*
+              * ENOENT shouldn't happen either, but it doesn't really matter
+              * because we would've deleted it now anyway.
+              */
+             if (errno != ENOENT)
+                 ereport(WARNING,
+                         (errcode_for_file_access(),
+                          errmsg("could not remove relation %u/%u/%u: %m",
+                                 entry->rnode.spcNode,
+                                 entry->rnode.dbNode,
+                                 entry->rnode.relNode)));
+         }
+         pfree(path);
+
+         pendingUnlinks = list_delete_first(pendingUnlinks);
+     }
+ }
+
  /*
   * RememberFsyncRequest() -- callback from bgwriter side of fsync request
   *
!  * We stuff normal fsync request into the local hash table for execution
   * during the bgwriter's next checkpoint.
   *
   * The range of possible segment numbers is way less than the range of
   * BlockNumber, so we can reserve high values of segno for special purposes.
!  * We define three:
!  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
!  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
!  * - UNLINK_REQUEST is a request to delete a lingering file after next
!  *   checkpoint. These are stuffed into a linked list separate from
!  *   the normal fsync requests.
!  *
!  * (Handling the FORGET_* requests is a tad slow because the hash table has to
!  * be searched linearly, but it doesn't seem worth rethinking the table
!  * structure for them.)
   */
  void
  RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
***************
*** 1147,1152 ****
--- 1301,1322 ----
              }
          }
      }
+     else if (segno == UNLINK_RELATION_REQUEST)
+     {
+         /* Unlink request: put it in the linked list */
+         MemoryContext oldcxt;
+         PendingUnlinkEntry *entry;
+
+         oldcxt = MemoryContextSwitchTo(MdCxt);
+
+         entry = palloc(sizeof(PendingUnlinkEntry));
+         memcpy(&entry->rnode, &rnode, sizeof(RelFileNode));
+         entry->cycle_ctr = mdunlink_cycle_ctr;
+
+         pendingUnlinks = lappend(pendingUnlinks, entry);
+
+         MemoryContextSwitchTo(oldcxt);
+     }
      else
      {
          /* Normal case: enter a request to fsync this segment */
Index: src/include/storage/smgr.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/smgr.h,v
retrieving revision 1.59
diff -c -r1.59 smgr.h
*** src/include/storage/smgr.h    5 Sep 2007 18:10:48 -0000    1.59
--- src/include/storage/smgr.h    18 Oct 2007 09:52:43 -0000
***************
*** 110,115 ****
--- 110,118 ----
  extern void ForgetRelationFsyncRequests(RelFileNode rnode);
  extern void ForgetDatabaseFsyncRequests(Oid dbid);

+ extern void mdcheckpointbegin(void);
+ extern void mdcheckpointend(void);
+
  /* smgrtype.c */
  extern Datum smgrout(PG_FUNCTION_ARGS);
  extern Datum smgrin(PG_FUNCTION_ARGS);

pgsql-patches by date:

Previous
From: "Marko Kreen"
Date:
Subject: txid cleanup
Next
From: Greg Sabino Mullane
Date:
Subject: Better psql tab-completion support for schemas and tables