Re: Idea for improving buildfarm robustness - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Idea for improving buildfarm robustness
Date
Msg-id 6133.1443562178@sss.pgh.pa.us
Whole thread Raw
In response to Re: Idea for improving buildfarm robustness  (Josh Berkus <josh@agliodbs.com>)
Responses Re: Idea for improving buildfarm robustness
List pgsql-hackers
Josh Berkus <josh@agliodbs.com> writes:
> Give me source with the change, and I'll put it on a cheap, low-bandwith
> AWS instance and hammer the heck out of it.  That should raise any false
> positives we can expect.

Here's a draft patch against HEAD (looks like it will work on 9.5 or
9.4 without modifications, too).

            regards, tom lane

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index baa43b2..52c9acd 100644
*** a/src/backend/postmaster/postmaster.c
--- b/src/backend/postmaster/postmaster.c
*************** static void CloseServerPorts(int status,
*** 373,378 ****
--- 373,379 ----
  static void unlink_external_pid_file(int status, Datum arg);
  static void getInstallationPaths(const char *argv0);
  static void checkDataDir(void);
+ static bool recheckDataDir(void);
  static Port *ConnCreate(int serverFd);
  static void ConnFree(Port *port);
  static void reset_shared(int port);
*************** checkDataDir(void)
*** 1490,1495 ****
--- 1491,1539 ----
  }

  /*
+  * Revalidate the data directory; return TRUE if OK, FALSE if not
+  *
+  * We don't try to check everything that checkDataDir() does.  Ideally, we'd
+  * return FALSE *only* if the data directory has been deleted.  As a proxy
+  * for that that matches a condition that checkDataDir() checked, verify that
+  * pg_control still exists.  Because the postmaster will quit if we return
+  * FALSE, do not do so if there is any doubt or possibly-transient failure.
+  * Better to wait till we're sure.
+  *
+  * Unlike checkDataDir(), we assume we've chdir'd into the data directory.
+  */
+ static bool
+ recheckDataDir(void)
+ {
+     const char *path = "global/pg_control";
+     FILE       *fp;
+
+     fp = AllocateFile(path, PG_BINARY_R);
+     if (fp != NULL)
+     {
+         FreeFile(fp);
+         return true;
+     }
+
+     /*
+      * There are many foreseeable false-positive error conditions, for example
+      * EINTR or ENFILE should not cause us to fail.  For safety, fail only on
+      * enumerated clearly-something-is-wrong conditions.
+      */
+     switch (errno)
+     {
+         case ENOENT:
+         case ENOTDIR:
+             elog(LOG, "could not open file \"%s\": %m", path);
+             return false;
+         default:
+             elog(LOG, "could not open file \"%s\": %m; continuing anyway",
+                  path);
+             return true;
+     }
+ }
+
+ /*
   * Determine how long should we let ServerLoop sleep.
   *
   * In normal conditions we wait at most one minute, to ensure that the other
*************** ServerLoop(void)
*** 1602,1610 ****
      fd_set        readmask;
      int            nSockets;
      time_t        now,
                  last_touch_time;

!     last_touch_time = time(NULL);

      nSockets = initMasks(&readmask);

--- 1646,1655 ----
      fd_set        readmask;
      int            nSockets;
      time_t        now,
+                 last_dir_recheck_time,
                  last_touch_time;

!     last_dir_recheck_time = last_touch_time = time(NULL);

      nSockets = initMasks(&readmask);

*************** ServerLoop(void)
*** 1754,1772 ****
          if (StartWorkerNeeded || HaveCrashedWorker)
              maybe_start_bgworker();

-         /*
-          * Touch Unix socket and lock files every 58 minutes, to ensure that
-          * they are not removed by overzealous /tmp-cleaning tasks.  We assume
-          * no one runs cleaners with cutoff times of less than an hour ...
-          */
-         now = time(NULL);
-         if (now - last_touch_time >= 58 * SECS_PER_MINUTE)
-         {
-             TouchSocketFiles();
-             TouchSocketLockFiles();
-             last_touch_time = now;
-         }
-
  #ifdef HAVE_PTHREAD_IS_THREADED_NP

          /*
--- 1799,1804 ----
*************** ServerLoop(void)
*** 1793,1798 ****
--- 1825,1868 ----
              /* reset flag so we don't SIGKILL again */
              AbortStartTime = 0;
          }
+
+         /*
+          * Lastly, check to see if it's time to do some things that we don't
+          * want to do every single time through the loop, because they're a
+          * bit expensive.  Note that there's up to a minute of slop in when
+          * these tasks will be performed, since DetermineSleepTime() will let
+          * us sleep at most that long.
+          */
+         now = time(NULL);
+
+         /*
+          * Once a minute, verify that $PGDATA hasn't been removed.  If it has,
+          * we want to commit hara-kiri.  This avoids having postmasters and
+          * child processes hanging around after their database is gone, and
+          * maybe causing problems if a new database cluster is created in the
+          * same place.
+          */
+         if (now - last_dir_recheck_time >= 1 * SECS_PER_MINUTE)
+         {
+             if (!recheckDataDir())
+             {
+                 elog(LOG, "shutting down because data directory is gone");
+                 kill(MyProcPid, SIGQUIT);
+             }
+             last_dir_recheck_time = now;
+         }
+
+         /*
+          * Touch Unix socket and lock files every 58 minutes, to ensure that
+          * they are not removed by overzealous /tmp-cleaning tasks.  We assume
+          * no one runs cleaners with cutoff times of less than an hour ...
+          */
+         if (now - last_touch_time >= 58 * SECS_PER_MINUTE)
+         {
+             TouchSocketFiles();
+             TouchSocketLockFiles();
+             last_touch_time = now;
+         }
      }
  }


pgsql-hackers by date:

Previous
From: Adam Brightwell
Date:
Subject: Re: Arguable RLS security bug, EvalPlanQual() paranoia
Next
From: Josh Elsasser
Date:
Subject: Add pg_basebackup single tar output format