Fix "pg_ctl start -w" to test child process status directly.

author Tom Lane <tgl@sss.pgh.pa.us>

Mon, 12 Oct 2015 22:30:36 +0000 (18:30 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Mon, 12 Oct 2015 22:30:36 +0000 (18:30 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Mon, 12 Oct 2015 22:30:36 +0000 (18:30 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Mon, 12 Oct 2015 22:30:36 +0000 (18:30 -0400)
diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c

index ba189e7288416ed032c858a0c4ac046210b337ac..dacdfef323c07b5d5c984f021e4f3318be1383ce 100644 (file)
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -28,6 +28,7 @@
  #include <time.h>
  #include <sys/types.h>
  #include <sys/stat.h>
+#include <sys/wait.h>
  #include <unistd.h>
  
  #ifdef HAVE_SYS_RESOURCE_H
@@ -153,10 +154,10 @@ static int        CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo,
  static pgpid_t get_pgpid(bool is_status_request);
  static char **readfile(const char *path);
  static void free_readfile(char **optlines);
-static int     start_postmaster(void);
+static pgpid_t start_postmaster(void);
  static void read_post_opts(void);
  
-static PGPing test_postmaster_connection(bool);
+static PGPing test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint);
  static bool postmaster_is_alive(pid_t pid);
  
  #if defined(HAVE_GETRLIMIT) && defined(RLIMIT_CORE)
@@ -419,36 +420,73 @@ free_readfile(char **optlines)
   * start/test/stop routines
   */
  
-static int
+/*
+ * Start the postmaster and return its PID.
+ *
+ * Currently, on Windows what we return is the PID of the shell process
+ * that launched the postmaster (and, we trust, is waiting for it to exit).
+ * So the PID is usable for "is the postmaster still running" checks,
+ * but cannot be compared directly to postmaster.pid.
+ *
+ * On Windows, we also save aside a handle to the shell process in
+ * "postmasterProcess", which the caller should close when done with it.
+ */
+static pgpid_t
  start_postmaster(void)
  {
         char            cmd[MAXPGPATH];
  
  #ifndef WIN32
+       pgpid_t         pm_pid;
+
+       /* Flush stdio channels just before fork, to avoid double-output problems */
+       fflush(stdout);
+       fflush(stderr);
+
+       pm_pid = fork();
+       if (pm_pid < 0)
+       {
+               /* fork failed */
+               write_stderr(_("%s: could not start server: %s\n"),
+                                        progname, strerror(errno));
+               exit(1);
+       }
+       if (pm_pid > 0)
+       {
+               /* fork succeeded, in parent */
+               return pm_pid;
+       }
+
+       /* fork succeeded, in child */
  
         /*
          * Since there might be quotes to handle here, it is easier simply to pass
-        * everything to a shell to process them.
-        *
-        * XXX it would be better to fork and exec so that we would know the child
-        * postmaster's PID directly; then test_postmaster_connection could use
-        * the PID without having to rely on reading it back from the pidfile.
+        * everything to a shell to process them.  Use exec so that the postmaster
+        * has the same PID as the current child process.
          */
         if (log_file != NULL)
-               snprintf(cmd, MAXPGPATH, "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &",
+               snprintf(cmd, MAXPGPATH, "exec \"%s\" %s%s < \"%s\" >> \"%s\" 2>&1",
                                  exec_path, pgdata_opt, post_opts,
                                  DEVNULL, log_file);
         else
-               snprintf(cmd, MAXPGPATH, "\"%s\" %s%s < \"%s\" 2>&1 &",
+               snprintf(cmd, MAXPGPATH, "exec \"%s\" %s%s < \"%s\" 2>&1",
                                  exec_path, pgdata_opt, post_opts, DEVNULL);
  
-       return system(cmd);
+       (void) execl("/bin/sh", "/bin/sh", "-c", cmd, (char *) NULL);
+
+       /* exec failed */
+       write_stderr(_("%s: could not start server: %s\n"),
+                                progname, strerror(errno));
+       exit(1);
+
+       return 0;                                       /* keep dumb compilers quiet */
+
  #else                                                  /* WIN32 */
  
         /*
-        * On win32 we don't use system(). So we don't need to use & (which would
-        * be START /B on win32). However, we still call the shell (CMD.EXE) with
-        * it to handle redirection etc.
+        * As with the Unix case, it's easiest to use the shell (CMD.EXE) to
+        * handle redirection etc.  Unfortunately CMD.EXE lacks any equivalent of
+        * "exec", so we don't get to find out the postmaster's PID immediately.
          */
         PROCESS_INFORMATION pi;
  
@@ -460,10 +498,15 @@ start_postmaster(void)
                                  exec_path, pgdata_opt, post_opts, DEVNULL);
  
         if (!CreateRestrictedProcess(cmd, &pi, false))
-               return GetLastError();
-       CloseHandle(pi.hProcess);
+       {
+               write_stderr(_("%s: could not start server: error code %lu\n"),
+                                        progname, (unsigned long) GetLastError());
+               exit(1);
+       }
+       /* Don't close command process handle here; caller must do so */
+       postmasterProcess = pi.hProcess;
         CloseHandle(pi.hThread);
-       return 0;
+       return pi.dwProcessId;          /* Shell's PID, not postmaster's! */
  #endif   /* WIN32 */
  }
  
@@ -472,15 +515,21 @@ start_postmaster(void)
  /*
   * Find the pgport and try a connection
   *
+ * On Unix, pm_pid is the PID of the just-launched postmaster.  On Windows,
+ * it may be the PID of an ancestor shell process, so we can't check the
+ * contents of postmaster.pid quite as carefully.
+ *
+ * On Windows, the static variable postmasterProcess is an implicit argument
+ * to this routine; it contains a handle to the postmaster process or an
+ * ancestor shell process thereof.
+ *
   * Note that the checkpoint parameter enables a Windows service control
   * manager checkpoint, it's got nothing to do with database checkpoints!!
   */
  static PGPing
-test_postmaster_connection(bool do_checkpoint)
+test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint)
  {
         PGPing          ret = PQPING_NO_RESPONSE;
-       bool            found_stale_pidfile = false;
-       pgpid_t         pm_pid = 0;
         char            connstr[MAXPGPATH * 2 + 256];
         int                     i;
  
@@ -535,29 +584,27 @@ test_postmaster_connection(bool do_checkpoint)
                                                  optlines[5] != NULL)
                                 {
                                         /* File is complete enough for us, parse it */
-                                       long            pmpid;
+                                       pgpid_t         pmpid;
                                         time_t          pmstart;
  
                                         /*
-                                        * Make sanity checks.  If it's for a standalone backend
-                                        * (negative PID), or the recorded start time is before
-                                        * pg_ctl started, then either we are looking at the wrong
-                                        * data directory, or this is a pre-existing pidfile that
-                                        * hasn't (yet?) been overwritten by our child postmaster.
-                                        * Allow 2 seconds slop for possible cross-process clock
-                                        * skew.
+                                        * Make sanity checks.  If it's for the wrong PID, or the
+                                        * recorded start time is before pg_ctl started, then
+                                        * either we are looking at the wrong data directory, or
+                                        * this is a pre-existing pidfile that hasn't (yet?) been
+                                        * overwritten by our child postmaster.  Allow 2 seconds
+                                        * slop for possible cross-process clock skew.
                                          */
                                         pmpid = atol(optlines[LOCK_FILE_LINE_PID - 1]);
                                         pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]);
-                                       if (pmpid <= 0 || pmstart < start_time - 2)
-                                       {
-                                               /*
-                                                * Set flag to report stale pidfile if it doesn't get
-                                                * overwritten before we give up waiting.
-                                                */
-                                               found_stale_pidfile = true;
-                                       }
-                                       else
+                                       if (pmstart >= start_time - 2 &&
+#ifndef WIN32
+                                               pmpid == pm_pid
+#else
+                                       /* Windows can only reject standalone-backend PIDs */
+                                               pmpid > 0
+#endif
+                                               )
                                         {
                                                 /*
                                                  * OK, seems to be a valid pidfile from our child.
@@ -567,9 +614,6 @@ test_postmaster_connection(bool do_checkpoint)
                                                 char       *hostaddr;
                                                 char            host_str[MAXPGPATH];
  
-                                               found_stale_pidfile = false;
-                                               pm_pid = (pgpid_t) pmpid;
-
                                                 /*
                                                  * Extract port number and host string to use. Prefer
                                                  * using Unix socket if available.
@@ -635,42 +679,23 @@ test_postmaster_connection(bool do_checkpoint)
                 }
  
                 /*
-                * The postmaster should create postmaster.pid very soon after being
-                * started.  If it's not there after we've waited 5 or more seconds,
-                * assume startup failed and give up waiting.  (Note this covers both
-                * cases where the pidfile was never created, and where it was created
-                * and then removed during postmaster exit.)  Also, if there *is* a
-                * file there but it appears stale, issue a suitable warning and give
-                * up waiting.
+                * Check whether the child postmaster process is still alive.  This
+                * lets us exit early if the postmaster fails during startup.
+                *
+                * On Windows, we may be checking the postmaster's parent shell, but
+                * that's fine for this purpose.
                  */
-               if (i >= 5)
+#ifndef WIN32
                 {
-                       struct stat statbuf;
+                       int                     exitstatus;
  
-                       if (stat(pid_file, &statbuf) != 0)
-                       {
-                               if (errno != ENOENT)
-                                       write_stderr(_("\n%s: could not stat file \"%s\": %s\n"),
-                                                                progname, pid_file, strerror(errno));
-                               return PQPING_NO_RESPONSE;
-                       }
-
-                       if (found_stale_pidfile)
-                       {
-                               write_stderr(_("\n%s: this data directory appears to be running a pre-existing postmaster\n"),
-                                                        progname);
+                       if (waitpid((pid_t) pm_pid, &exitstatus, WNOHANG) == (pid_t) pm_pid)
                                 return PQPING_NO_RESPONSE;
-                       }
                 }
-
-               /*
-                * If we've been able to identify the child postmaster's PID, check
-                * the process is still alive.  This covers cases where the postmaster
-                * successfully created the pidfile but then crashed without removing
-                * it.
-                */
-               if (pm_pid > 0 && !postmaster_is_alive((pid_t) pm_pid))
+#else
+               if (WaitForSingleObject(postmasterProcess, 0) == WAIT_OBJECT_0)
                         return PQPING_NO_RESPONSE;
+#endif
  
                 /* No response, or startup still in process; wait */
  #if defined(WIN32)
@@ -836,7 +861,7 @@ static void
  do_start(void)
  {
         pgpid_t         old_pid = 0;
-       int                     exitcode;
+       pgpid_t         pm_pid;
  
         if (ctl_command != RESTART_COMMAND)
         {
@@ -876,19 +901,13 @@ do_start(void)
         }
  #endif
  
-       exitcode = start_postmaster();
-       if (exitcode != 0)
-       {
-               write_stderr(_("%s: could not start server: exit code was %d\n"),
-                                        progname, exitcode);
-               exit(1);
-       }
+       pm_pid = start_postmaster();
  
         if (do_wait)
         {
                 print_msg(_("waiting for server to start..."));
  
-               switch (test_postmaster_connection(false))
+               switch (test_postmaster_connection(pm_pid, false))
                 {
                         case PQPING_OK:
                                 print_msg(_(" done\n"));
@@ -914,6 +933,12 @@ do_start(void)
         }
         else
                 print_msg(_("server starting\n"));
+
+#ifdef WIN32
+       /* Now we don't need the handle to the shell process anymore */
+       CloseHandle(postmasterProcess);
+       postmasterProcess = INVALID_HANDLE_VALUE;
+#endif
  }
  
  
@@ -1585,7 +1610,7 @@ pgwin32_ServiceMain(DWORD argc, LPTSTR *argv)
         if (do_wait)
         {
                 write_eventlog(EVENTLOG_INFORMATION_TYPE, _("Waiting for server startup...\n"));
-               if (test_postmaster_connection(true) != PQPING_OK)
+               if (test_postmaster_connection(postmasterPID, true) != PQPING_OK)
                 {
                         write_eventlog(EVENTLOG_ERROR_TYPE, _("Timed out waiting for server startup\n"));
                         pgwin32_SetServiceStatus(SERVICE_STOPPED);
@@ -1606,10 +1631,9 @@ pgwin32_ServiceMain(DWORD argc, LPTSTR *argv)
                         {
                                 /*
                                  * status.dwCheckPoint can be incremented by
-                                * test_postmaster_connection(true), so it might not start
-                                * from 0.
+                                * test_postmaster_connection(), so it might not start from 0.
                                  */
-                               int                     maxShutdownCheckPoint = status.dwCheckPoint + 12;;
+                               int                     maxShutdownCheckPoint = status.dwCheckPoint + 12;
  
                                 kill(postmasterPID, SIGINT);
  
diff --git a/src/bin/pg_ctl/t/001_start_stop.pl b/src/bin/pg_ctl/t/001_start_stop.pl

index 177a16f7afe80e83133ed2bd5b91adb88fef860d..f57abcef1dc150c135c1fb187a3fb6faf030585f 100644 (file)
--- a/src/bin/pg_ctl/t/001_start_stop.pl
+++ b/src/bin/pg_ctl/t/001_start_stop.pl
@@ -34,8 +34,13 @@ else
  close CONF;
  command_ok([ 'pg_ctl', 'start', '-D', "$tempdir/data", '-w' ],
         'pg_ctl start -w');
-command_ok([ 'pg_ctl', 'start', '-D', "$tempdir/data", '-w' ],
-       'second pg_ctl start succeeds');
+# sleep here is because Windows builds can't check postmaster.pid exactly,
+# so they may mistake a pre-existing postmaster.pid for one created by the
+# postmaster they start.  Waiting more than the 2 seconds slop time allowed
+# by test_postmaster_connection prevents that mistake.
+sleep 3 if ($windows_os);
+command_fails([ 'pg_ctl', 'start', '-D', "$tempdir/data", '-w' ],
+       'second pg_ctl start -w fails');
  command_ok([ 'pg_ctl', 'stop', '-D', "$tempdir/data", '-w', '-m', 'fast' ],
         'pg_ctl stop -w');
  command_fails([ 'pg_ctl', 'stop', '-D', "$tempdir/data", '-w', '-m', 'fast' ],
author	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 12 Oct 2015 22:30:36 +0000 (18:30 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 12 Oct 2015 22:30:36 +0000 (18:30 -0400)
src/bin/pg_ctl/pg_ctl.c		patch \| blob \| history
src/bin/pg_ctl/t/001_start_stop.pl		patch \| blob \| history