Re: pg_ctl/pg_rewind tests vs. slow AIX buildfarm members - Mailing list pgsql-hackers

From Tom Lane
Subject Re: pg_ctl/pg_rewind tests vs. slow AIX buildfarm members
Date
Msg-id 31376.1441308666@sss.pgh.pa.us
Whole thread Raw
In response to Re: pg_ctl/pg_rewind tests vs. slow AIX buildfarm members  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: pg_ctl/pg_rewind tests vs. slow AIX buildfarm members  (Andrew Dunstan <andrew@dunslane.net>)
Re: pg_ctl/pg_rewind tests vs. slow AIX buildfarm members  (Noah Misch <noah@leadboat.com>)
Re: pg_ctl/pg_rewind tests vs. slow AIX buildfarm members  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-hackers
I wrote:
> Andres Freund <andres@anarazel.de> writes:
>> I'don't like adding a couple seconds of test runtime for the benefit of
>> very slow platforms.

> Me either.  This is the first time I've seen an indication that the
> start_postmaster change mentioned in the comment is actually important
> for production use, rather than just being cleanup.  I think we ought
> to just fix it.  I'm willing to take care of the Unix side if someone
> will explain how to change the Windows side.

Attached is a draft patch for this.  I think it's fine for Unix (unless
someone wants to object to relying on "/bin/sh -c"), but I have no idea
whether it works for Windows.  The main risk is that if CMD.EXE runs
the postmaster as a subprocess rather than overlaying itself a la shell
"exec", the PID we'd get back would apply only to CMD.EXE not to the
eventual postmaster.  If so, I do not know how to fix that, or whether
it's fixable at all.

Note that this makes the test case in question fail reliably, which is
reasonable behavior IMO so I just changed the test.

If this does (or can be made to) work on Windows, I'd propose applying
it back to 9.2, where the current coding came in.

            regards, tom lane

diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c
index 6a36d29..f0025d7 100644
*** a/src/bin/pg_ctl/pg_ctl.c
--- b/src/bin/pg_ctl/pg_ctl.c
***************
*** 28,33 ****
--- 28,34 ----
  #include <time.h>
  #include <sys/types.h>
  #include <sys/stat.h>
+ #include <sys/wait.h>
  #include <unistd.h>

  #ifdef HAVE_SYS_RESOURCE_H
*************** static int    CreateRestrictedProcess(char
*** 153,162 ****
  static pgpid_t get_pgpid(bool is_status_request);
  static char **readfile(const char *path);
  static void free_readfile(char **optlines);
! static int    start_postmaster(void);
  static void read_post_opts(void);

! static PGPing test_postmaster_connection(bool);
  static bool postmaster_is_alive(pid_t pid);

  #if defined(HAVE_GETRLIMIT) && defined(RLIMIT_CORE)
--- 154,163 ----
  static pgpid_t get_pgpid(bool is_status_request);
  static char **readfile(const char *path);
  static void free_readfile(char **optlines);
! static pgpid_t start_postmaster(void);
  static void read_post_opts(void);

! static PGPing test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint);
  static bool postmaster_is_alive(pid_t pid);

  #if defined(HAVE_GETRLIMIT) && defined(RLIMIT_CORE)
*************** free_readfile(char **optlines)
*** 419,454 ****
   * start/test/stop routines
   */

! static int
  start_postmaster(void)
  {
      char        cmd[MAXPGPATH];

  #ifndef WIN32

      /*
       * Since there might be quotes to handle here, it is easier simply to pass
       * everything to a shell to process them.
-      *
-      * XXX it would be better to fork and exec so that we would know the child
-      * postmaster's PID directly; then test_postmaster_connection could use
-      * the PID without having to rely on reading it back from the pidfile.
       */
      if (log_file != NULL)
!         snprintf(cmd, MAXPGPATH, "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &",
                   exec_path, pgdata_opt, post_opts,
                   DEVNULL, log_file);
      else
!         snprintf(cmd, MAXPGPATH, "\"%s\" %s%s < \"%s\" 2>&1 &",
                   exec_path, pgdata_opt, post_opts, DEVNULL);

!     return system(cmd);
  #else                            /* WIN32 */

      /*
!      * On win32 we don't use system(). So we don't need to use & (which would
!      * be START /B on win32). However, we still call the shell (CMD.EXE) with
!      * it to handle redirection etc.
       */
      PROCESS_INFORMATION pi;

--- 420,479 ----
   * start/test/stop routines
   */

! static pgpid_t
  start_postmaster(void)
  {
      char        cmd[MAXPGPATH];

  #ifndef WIN32
+     pgpid_t        pm_pid;
+
+     /* Flush stdio channels just before fork, to avoid double-output problems */
+     fflush(stdout);
+     fflush(stderr);
+
+     pm_pid = fork();
+     if (pm_pid < 0)
+     {
+         /* fork failed */
+         write_stderr(_("%s: could not start server: %s\n"),
+                      progname, strerror(errno));
+         exit(1);
+     }
+     if (pm_pid > 0)
+     {
+         /* fork succeeded, in parent */
+         return pm_pid;
+     }
+
+     /* fork succeeded, in child */

      /*
       * Since there might be quotes to handle here, it is easier simply to pass
       * everything to a shell to process them.
       */
      if (log_file != NULL)
!         snprintf(cmd, MAXPGPATH, "exec \"%s\" %s%s < \"%s\" >> \"%s\" 2>&1",
                   exec_path, pgdata_opt, post_opts,
                   DEVNULL, log_file);
      else
!         snprintf(cmd, MAXPGPATH, "exec \"%s\" %s%s < \"%s\" 2>&1",
                   exec_path, pgdata_opt, post_opts, DEVNULL);

!     (void) execl("/bin/sh", "/bin/sh", "-c", cmd, (char *) NULL);
!
!     /* exec failed */
!     write_stderr(_("%s: could not start server: %s\n"),
!                  progname, strerror(errno));
!     exit(1);
!
!     return 0;                    /* keep dumb compilers quiet */
!
  #else                            /* WIN32 */

      /*
!      * As with the Unix case, it's easiest to use the shell (CMD.EXE) to
!      * handle redirection etc.
       */
      PROCESS_INFORMATION pi;

*************** start_postmaster(void)
*** 460,469 ****
                   exec_path, pgdata_opt, post_opts, DEVNULL);

      if (!CreateRestrictedProcess(cmd, &pi, false))
!         return GetLastError();
      CloseHandle(pi.hProcess);
      CloseHandle(pi.hThread);
!     return 0;
  #endif   /* WIN32 */
  }

--- 485,498 ----
                   exec_path, pgdata_opt, post_opts, DEVNULL);

      if (!CreateRestrictedProcess(cmd, &pi, false))
!     {
!         write_stderr(_("%s: could not start server: error code %lu\n"),
!                      progname, (unsigned long) GetLastError());
!         exit(1);
!     }
      CloseHandle(pi.hProcess);
      CloseHandle(pi.hThread);
!     return pi.dwProcessId;
  #endif   /* WIN32 */
  }

*************** start_postmaster(void)
*** 476,486 ****
   * manager checkpoint, it's got nothing to do with database checkpoints!!
   */
  static PGPing
! test_postmaster_connection(bool do_checkpoint)
  {
      PGPing        ret = PQPING_NO_RESPONSE;
-     bool        found_stale_pidfile = false;
-     pgpid_t        pm_pid = 0;
      char        connstr[MAXPGPATH * 2 + 256];
      int            i;

--- 505,513 ----
   * manager checkpoint, it's got nothing to do with database checkpoints!!
   */
  static PGPing
! test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint)
  {
      PGPing        ret = PQPING_NO_RESPONSE;
      char        connstr[MAXPGPATH * 2 + 256];
      int            i;

*************** test_postmaster_connection(bool do_check
*** 535,563 ****
                           optlines[5] != NULL)
                  {
                      /* File is complete enough for us, parse it */
!                     long        pmpid;
                      time_t        pmstart;

                      /*
!                      * Make sanity checks.  If it's for a standalone backend
!                      * (negative PID), or the recorded start time is before
!                      * pg_ctl started, then either we are looking at the wrong
!                      * data directory, or this is a pre-existing pidfile that
!                      * hasn't (yet?) been overwritten by our child postmaster.
!                      * Allow 2 seconds slop for possible cross-process clock
!                      * skew.
                       */
                      pmpid = atol(optlines[LOCK_FILE_LINE_PID - 1]);
                      pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]);
!                     if (pmpid <= 0 || pmstart < start_time - 2)
!                     {
!                         /*
!                          * Set flag to report stale pidfile if it doesn't get
!                          * overwritten before we give up waiting.
!                          */
!                         found_stale_pidfile = true;
!                     }
!                     else
                      {
                          /*
                           * OK, seems to be a valid pidfile from our child.
--- 562,581 ----
                           optlines[5] != NULL)
                  {
                      /* File is complete enough for us, parse it */
!                     pgpid_t        pmpid;
                      time_t        pmstart;

                      /*
!                      * Make sanity checks.  If it's for the wrong PID, or the
!                      * recorded start time is before pg_ctl started, then
!                      * either we are looking at the wrong data directory, or
!                      * this is a pre-existing pidfile that hasn't (yet?) been
!                      * overwritten by our child postmaster.  Allow 2 seconds
!                      * slop for possible cross-process clock skew.
                       */
                      pmpid = atol(optlines[LOCK_FILE_LINE_PID - 1]);
                      pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]);
!                     if (pmpid == pm_pid && pmstart >= start_time - 2)
                      {
                          /*
                           * OK, seems to be a valid pidfile from our child.
*************** test_postmaster_connection(bool do_check
*** 567,575 ****
                          char       *hostaddr;
                          char        host_str[MAXPGPATH];

-                         found_stale_pidfile = false;
-                         pm_pid = (pgpid_t) pmpid;
-
                          /*
                           * Extract port number and host string to use. Prefer
                           * using Unix socket if available.
--- 585,590 ----
*************** test_postmaster_connection(bool do_check
*** 635,675 ****
          }

          /*
!          * The postmaster should create postmaster.pid very soon after being
!          * started.  If it's not there after we've waited 5 or more seconds,
!          * assume startup failed and give up waiting.  (Note this covers both
!          * cases where the pidfile was never created, and where it was created
!          * and then removed during postmaster exit.)  Also, if there *is* a
!          * file there but it appears stale, issue a suitable warning and give
!          * up waiting.
           */
!         if (i >= 5)
          {
!             struct stat statbuf;
!
!             if (stat(pid_file, &statbuf) != 0)
!             {
!                 if (errno != ENOENT)
!                     write_stderr(_("\n%s: could not stat file \"%s\": %s\n"),
!                                  progname, pid_file, strerror(errno));
!                 return PQPING_NO_RESPONSE;
!             }

!             if (found_stale_pidfile)
!             {
!                 write_stderr(_("\n%s: this data directory appears to be running a pre-existing postmaster\n"),
!                              progname);
!                 return PQPING_NO_RESPONSE;
!             }
          }

          /*
!          * If we've been able to identify the child postmaster's PID, check
!          * the process is still alive.  This covers cases where the postmaster
!          * successfully created the pidfile but then crashed without removing
!          * it.
           */
!         if (pm_pid > 0 && !postmaster_is_alive((pid_t) pm_pid))
              return PQPING_NO_RESPONSE;

          /* No response, or startup still in process; wait */
--- 650,671 ----
          }

          /*
!          * Reap child process if it died; if it remains in zombie state then
!          * our kill-based test will think it's still running.
           */
! #ifndef WIN32
          {
!             int            exitstatus;

!             (void) waitpid((pid_t) pm_pid, &exitstatus, WNOHANG);
          }
+ #endif

          /*
!          * Check whether the child postmaster process is still alive.  This
!          * lets us exit early if the postmaster fails during startup.
           */
!         if (!postmaster_is_alive((pid_t) pm_pid))
              return PQPING_NO_RESPONSE;

          /* No response, or startup still in process; wait */
*************** static void
*** 836,842 ****
  do_start(void)
  {
      pgpid_t        old_pid = 0;
!     int            exitcode;

      if (ctl_command != RESTART_COMMAND)
      {
--- 832,838 ----
  do_start(void)
  {
      pgpid_t        old_pid = 0;
!     pgpid_t        pm_pid;

      if (ctl_command != RESTART_COMMAND)
      {
*************** do_start(void)
*** 876,894 ****
      }
  #endif

!     exitcode = start_postmaster();
!     if (exitcode != 0)
!     {
!         write_stderr(_("%s: could not start server: exit code was %d\n"),
!                      progname, exitcode);
!         exit(1);
!     }

      if (do_wait)
      {
          print_msg(_("waiting for server to start..."));

!         switch (test_postmaster_connection(false))
          {
              case PQPING_OK:
                  print_msg(_(" done\n"));
--- 872,884 ----
      }
  #endif

!     pm_pid = start_postmaster();

      if (do_wait)
      {
          print_msg(_("waiting for server to start..."));

!         switch (test_postmaster_connection(pm_pid, false))
          {
              case PQPING_OK:
                  print_msg(_(" done\n"));
*************** pgwin32_ServiceMain(DWORD argc, LPTSTR *
*** 1585,1591 ****
      if (do_wait)
      {
          write_eventlog(EVENTLOG_INFORMATION_TYPE, _("Waiting for server startup...\n"));
!         if (test_postmaster_connection(true) != PQPING_OK)
          {
              write_eventlog(EVENTLOG_ERROR_TYPE, _("Timed out waiting for server startup\n"));
              pgwin32_SetServiceStatus(SERVICE_STOPPED);
--- 1575,1581 ----
      if (do_wait)
      {
          write_eventlog(EVENTLOG_INFORMATION_TYPE, _("Waiting for server startup...\n"));
!         if (test_postmaster_connection(postmasterPID, true) != PQPING_OK)
          {
              write_eventlog(EVENTLOG_ERROR_TYPE, _("Timed out waiting for server startup\n"));
              pgwin32_SetServiceStatus(SERVICE_STOPPED);
*************** pgwin32_ServiceMain(DWORD argc, LPTSTR *
*** 1606,1613 ****
              {
                  /*
                   * status.dwCheckPoint can be incremented by
!                  * test_postmaster_connection(true), so it might not start
!                  * from 0.
                   */
                  int            maxShutdownCheckPoint = status.dwCheckPoint + 12;;

--- 1596,1602 ----
              {
                  /*
                   * status.dwCheckPoint can be incremented by
!                  * test_postmaster_connection(), so it might not start from 0.
                   */
                  int            maxShutdownCheckPoint = status.dwCheckPoint + 12;;

diff --git a/src/bin/pg_ctl/t/001_start_stop.pl b/src/bin/pg_ctl/t/001_start_stop.pl
index dae47a8..2f93aa0 100644
*** a/src/bin/pg_ctl/t/001_start_stop.pl
--- b/src/bin/pg_ctl/t/001_start_stop.pl
*************** else
*** 32,39 ****
  close CONF;
  command_ok([ 'pg_ctl', 'start', '-D', "$tempdir/data", '-w' ],
      'pg_ctl start -w');
! command_ok([ 'pg_ctl', 'start', '-D', "$tempdir/data", '-w' ],
!     'second pg_ctl start succeeds');
  command_ok([ 'pg_ctl', 'stop', '-D', "$tempdir/data", '-w', '-m', 'fast' ],
      'pg_ctl stop -w');
  command_fails([ 'pg_ctl', 'stop', '-D', "$tempdir/data", '-w', '-m', 'fast' ],
--- 32,39 ----
  close CONF;
  command_ok([ 'pg_ctl', 'start', '-D', "$tempdir/data", '-w' ],
      'pg_ctl start -w');
! command_fails([ 'pg_ctl', 'start', '-D', "$tempdir/data", '-w' ],
!     'second pg_ctl start fails');
  command_ok([ 'pg_ctl', 'stop', '-D', "$tempdir/data", '-w', '-m', 'fast' ],
      'pg_ctl stop -w');
  command_fails([ 'pg_ctl', 'stop', '-D', "$tempdir/data", '-w', '-m', 'fast' ],

pgsql-hackers by date:

Previous
From: Greg Stark
Date:
Subject: PG_CATCH used without PG_RETHROW
Next
From: Tom Lane
Date:
Subject: Re: PG_CATCH used without PG_RETHROW