Fix postmaster's handling of a startup-process crash.
author     Tom Lane <tgl@sss.pgh.pa.us>
           Thu, 9 Jul 2015 17:22:22 +0000 (13:22 -0400)
committer  Tom Lane <tgl@sss.pgh.pa.us>
           Thu, 9 Jul 2015 17:22:22 +0000 (13:22 -0400)
Ordinarily, a failure (unexpected exit status) of the startup subprocess
should be considered fatal, so the postmaster should just close up shop
and quit.  However, if we sent the startup process a SIGQUIT or SIGKILL
signal, the failure is hardly "unexpected", and we should attempt restart;
this is necessary for recovery from ordinary backend crashes in hot-standby
scenarios.  I attempted to implement the latter rule with a two-line patch
in commit 442231d7f71764b8c628044e7ce2225f9aa43b67, but it now emerges that
that patch was a few bricks shy of a load: it failed to distinguish the
case of a signaled startup process from the case where the new startup
process crashes before reaching database consistency.  That resulted in
infinitely respawning a new startup process only to have it crash again.

To handle this properly, we really must track whether we have sent the
*current* startup process a kill signal.  Rather than add yet another
ad-hoc boolean to the postmaster's state, I chose to unify this with the
existing RecoveryError flag into an enum tracking the startup process's
state.  That seems more consistent with the postmaster's general state
machine design.

Back-patch to 9.0, like the previous patch.
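
For readers skimming the patch, the heart of the change is the replacement of
the RecoveryError boolean with a small state enum, plus the check in reaper()
that distinguishes a startup process we signaled from one that crashed on its
own.  The following is a distilled sketch that paraphrases the diff below; it
is not a verbatim excerpt and omits surrounding postmaster code:

    /* Startup process's status (replaces the old RecoveryError boolean) */
    typedef enum
    {
        STARTUP_NOT_RUNNING,
        STARTUP_RUNNING,
        STARTUP_SIGNALED,       /* we sent it a SIGQUIT or SIGKILL */
        STARTUP_CRASHED
    } StartupStatusEnum;

    static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;

    /* In reaper(), when the startup process exits unexpectedly: */
    if (!EXIT_STATUS_0(exitstatus))
    {
        if (StartupStatus == STARTUP_SIGNALED)
            StartupStatus = STARTUP_NOT_RUNNING;    /* we killed it; restart is OK */
        else
            StartupStatus = STARTUP_CRASHED;        /* genuine crash; do not respawn */
        HandleChildCrash(pid, exitstatus, _("startup process"));
    }

    /* In PostmasterStateMachine(), only a genuine crash blocks reinitialization: */
    if (pmState == PM_NO_CHILDREN &&
        (StartupStatus == STARTUP_CRASHED || !restart_after_crash))
        ExitPostmaster(1);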

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index df8037b498d489648c15c1ffde8b35fd33d7c460..1bb3138a03ab0514e265edb2d792397aa9ac38cc 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -249,6 +249,17 @@ static pid_t StartupPID = 0,
                        PgStatPID = 0,
                        SysLoggerPID = 0;
 
+/* Startup process's status */
+typedef enum
+{
+       STARTUP_NOT_RUNNING,
+       STARTUP_RUNNING,
+       STARTUP_SIGNALED,                       /* we sent it a SIGQUIT or SIGKILL */
+       STARTUP_CRASHED
+} StartupStatusEnum;
+
+static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;
+
 /* Startup/shutdown state */
 #define                        NoShutdown              0
 #define                        SmartShutdown   1
@@ -258,7 +269,6 @@ static pid_t StartupPID = 0,
 static int     Shutdown = NoShutdown;
 
 static bool FatalError = false; /* T if recovering from backend crash */
-static bool RecoveryError = false;             /* T if WAL recovery failed */
 
 /*
  * We use a simple state machine to control startup, shutdown, and
@@ -301,8 +311,6 @@ static bool RecoveryError = false;          /* T if WAL recovery failed */
  * states, nor in PM_SHUTDOWN states (because we don't enter those states
  * when trying to recover from a crash).  It can be true in PM_STARTUP state,
  * because we don't clear it until we've successfully started WAL redo.
- * Similarly, RecoveryError means that we have crashed during recovery, and
- * should not try to restart.
  */
 typedef enum
 {
@@ -1246,6 +1254,7 @@ PostmasterMain(int argc, char *argv[])
         */
        StartupPID = StartupDataBase();
        Assert(StartupPID != 0);
+       StartupStatus = STARTUP_RUNNING;
        pmState = PM_STARTUP;
 
        /* Some workers may be scheduled to start now */
@@ -1666,7 +1675,7 @@ ServerLoop(void)
 
                /* If we have lost the archiver, try to start a new one. */
                if (PgArchPID == 0 && PgArchStartupAllowed())
-                               PgArchPID = pgarch_start();
+                       PgArchPID = pgarch_start();
 
                /* If we need to signal the autovacuum launcher, do so now */
                if (avlauncher_needs_signal)
@@ -2591,6 +2600,7 @@ reaper(SIGNAL_ARGS)
                        if (Shutdown > NoShutdown &&
                                (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus)))
                        {
+                               StartupStatus = STARTUP_NOT_RUNNING;
                                pmState = PM_WAIT_BACKENDS;
                                /* PostmasterStateMachine logic does the rest */
                                continue;
@@ -2600,6 +2610,7 @@ reaper(SIGNAL_ARGS)
                        {
                                ereport(LOG,
                                                (errmsg("shutdown at recovery target")));
+                               StartupStatus = STARTUP_NOT_RUNNING;
                                Shutdown = SmartShutdown;
                                TerminateChildren(SIGTERM);
                                pmState = PM_WAIT_BACKENDS;
@@ -2624,16 +2635,18 @@ reaper(SIGNAL_ARGS)
                        /*
                         * After PM_STARTUP, any unexpected exit (including FATAL exit) of
                         * the startup process is catastrophic, so kill other children,
-                        * and set RecoveryError so we don't try to reinitialize after
-                        * they're gone.  Exception: if FatalError is already set, that
-                        * implies we previously sent the startup process a SIGQUIT, so
+                        * and set StartupStatus so we don't try to reinitialize after
+                        * they're gone.  Exception: if StartupStatus is STARTUP_SIGNALED,
+                        * then we previously sent the startup process a SIGQUIT; so
                         * that's probably the reason it died, and we do want to try to
                         * restart in that case.
                         */
                        if (!EXIT_STATUS_0(exitstatus))
                        {
-                               if (!FatalError)
-                                       RecoveryError = true;
+                               if (StartupStatus == STARTUP_SIGNALED)
+                                       StartupStatus = STARTUP_NOT_RUNNING;
+                               else
+                                       StartupStatus = STARTUP_CRASHED;
                                HandleChildCrash(pid, exitstatus,
                                                                 _("startup process"));
                                continue;
@@ -2642,6 +2655,7 @@ reaper(SIGNAL_ARGS)
                        /*
                         * Startup succeeded, commence normal operations
                         */
+                       StartupStatus = STARTUP_NOT_RUNNING;
                        FatalError = false;
                        Assert(AbortStartTime == 0);
                        ReachedNormalRunning = true;
@@ -2962,7 +2976,7 @@ CleanupBackgroundWorker(int pid,
                ReportBackgroundWorkerPID(rw);  /* report child death */
 
                LogChildExit(EXIT_STATUS_0(exitstatus) ? DEBUG1 : LOG,
-                                                           namebuf, pid, exitstatus);
+                                        namebuf, pid, exitstatus);
 
                return true;
        }
@@ -3190,7 +3204,10 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 
        /* Take care of the startup process too */
        if (pid == StartupPID)
+       {
                StartupPID = 0;
+               StartupStatus = STARTUP_CRASHED;
+       }
        else if (StartupPID != 0 && take_action)
        {
                ereport(DEBUG2,
@@ -3198,6 +3215,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
                                                                 (SendStop ? "SIGSTOP" : "SIGQUIT"),
                                                                 (int) StartupPID)));
                signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
+               StartupStatus = STARTUP_SIGNALED;
        }
 
        /* Take care of the bgwriter too */
@@ -3589,13 +3607,14 @@ PostmasterStateMachine(void)
        }
 
        /*
-        * If recovery failed, or the user does not want an automatic restart
-        * after backend crashes, wait for all non-syslogger children to exit, and
-        * then exit postmaster. We don't try to reinitialize when recovery fails,
-        * because more than likely it will just fail again and we will keep
-        * trying forever.
+        * If the startup process failed, or the user does not want an automatic
+        * restart after backend crashes, wait for all non-syslogger children to
+        * exit, and then exit postmaster.  We don't try to reinitialize when the
+        * startup process fails, because more than likely it will just fail again
+        * and we will keep trying forever.
         */
-       if (pmState == PM_NO_CHILDREN && (RecoveryError || !restart_after_crash))
+       if (pmState == PM_NO_CHILDREN &&
+               (StartupStatus == STARTUP_CRASHED || !restart_after_crash))
                ExitPostmaster(1);
 
        /*
@@ -3615,6 +3634,7 @@ PostmasterStateMachine(void)
 
                StartupPID = StartupDataBase();
                Assert(StartupPID != 0);
+               StartupStatus = STARTUP_RUNNING;
                pmState = PM_STARTUP;
                /* crash recovery started, reset SIGKILL flag */
                AbortStartTime = 0;
@@ -3746,7 +3766,11 @@ TerminateChildren(int signal)
 {
        SignalChildren(signal);
        if (StartupPID != 0)
+       {
                signal_child(StartupPID, signal);
+               if (signal == SIGQUIT || signal == SIGKILL)
+                       StartupStatus = STARTUP_SIGNALED;
+       }
        if (BgWriterPID != 0)
                signal_child(BgWriterPID, signal);
        if (CheckpointerPID != 0)