]> granicus.if.org Git - postgresql/commitdiff
Fix postmaster's handling of a startup-process crash.
authorTom Lane <tgl@sss.pgh.pa.us>
Thu, 9 Jul 2015 17:22:23 +0000 (13:22 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Thu, 9 Jul 2015 17:22:23 +0000 (13:22 -0400)
Ordinarily, a failure (unexpected exit status) of the startup subprocess
should be considered fatal, so the postmaster should just close up shop
and quit.  However, if we sent the startup process a SIGQUIT or SIGKILL
signal, the failure is hardly "unexpected", and we should attempt restart;
this is necessary for recovery from ordinary backend crashes in hot-standby
scenarios.  I attempted to implement the latter rule with a two-line patch
in commit 442231d7f71764b8c628044e7ce2225f9aa43b67, but it now emerges that
that patch was a few bricks shy of a load: it failed to distinguish the
case of a signaled startup process from the case where the new startup
process crashes before reaching database consistency.  That resulted in
infinitely respawning a new startup process only to have it crash again.

To handle this properly, we really must track whether we have sent the
*current* startup process a kill signal.  Rather than add yet another
ad-hoc boolean to the postmaster's state, I chose to unify this with the
existing RecoveryError flag into an enum tracking the startup process's
state.  That seems more consistent with the postmaster's general state
machine design.

Back-patch to 9.0, like the previous patch.

src/backend/postmaster/postmaster.c

index e4513d14edb52d8bbcb81da2b0f70ff4d66b207f..8c4a543bf4b7c652af3cbaf8e11b30e70f012f71 100644 (file)
@@ -218,6 +218,17 @@ static pid_t StartupPID = 0,
                        PgStatPID = 0,
                        SysLoggerPID = 0;
 
+/* Startup process's status */
+typedef enum
+{
+       STARTUP_NOT_RUNNING,
+       STARTUP_RUNNING,
+       STARTUP_SIGNALED,                       /* we sent it a SIGQUIT or SIGKILL */
+       STARTUP_CRASHED
+} StartupStatusEnum;
+
+static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;
+
 /* Startup/shutdown state */
 #define                        NoShutdown              0
 #define                        SmartShutdown   1
@@ -226,7 +237,6 @@ static pid_t StartupPID = 0,
 static int     Shutdown = NoShutdown;
 
 static bool FatalError = false; /* T if recovering from backend crash */
-static bool RecoveryError = false;             /* T if WAL recovery failed */
 
 /*
  * We use a simple state machine to control startup, shutdown, and
@@ -269,8 +279,6 @@ static bool RecoveryError = false;          /* T if WAL recovery failed */
  * states, nor in PM_SHUTDOWN states (because we don't enter those states
  * when trying to recover from a crash).  It can be true in PM_STARTUP state,
  * because we don't clear it until we've successfully started WAL redo.
- * Similarly, RecoveryError means that we have crashed during recovery, and
- * should not try to restart.
  */
 typedef enum
 {
@@ -1115,6 +1123,7 @@ PostmasterMain(int argc, char *argv[])
         */
        StartupPID = StartupDataBase();
        Assert(StartupPID != 0);
+       StartupStatus = STARTUP_RUNNING;
        pmState = PM_STARTUP;
 
        status = ServerLoop();
@@ -2381,6 +2390,7 @@ reaper(SIGNAL_ARGS)
                        if (Shutdown > NoShutdown &&
                                (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus)))
                        {
+                               StartupStatus = STARTUP_NOT_RUNNING;
                                pmState = PM_WAIT_BACKENDS;
                                /* PostmasterStateMachine logic does the rest */
                                continue;
@@ -2403,16 +2413,18 @@ reaper(SIGNAL_ARGS)
                        /*
                         * After PM_STARTUP, any unexpected exit (including FATAL exit) of
                         * the startup process is catastrophic, so kill other children,
-                        * and set RecoveryError so we don't try to reinitialize after
-                        * they're gone.  Exception: if FatalError is already set, that
-                        * implies we previously sent the startup process a SIGQUIT, so
+                        * and set StartupStatus so we don't try to reinitialize after
+                        * they're gone.  Exception: if StartupStatus is STARTUP_SIGNALED,
+                        * then we previously sent the startup process a SIGQUIT; so
                         * that's probably the reason it died, and we do want to try to
                         * restart in that case.
                         */
                        if (!EXIT_STATUS_0(exitstatus))
                        {
-                               if (!FatalError)
-                                       RecoveryError = true;
+                               if (StartupStatus == STARTUP_SIGNALED)
+                                       StartupStatus = STARTUP_NOT_RUNNING;
+                               else
+                                       StartupStatus = STARTUP_CRASHED;
                                HandleChildCrash(pid, exitstatus,
                                                                 _("startup process"));
                                continue;
@@ -2421,6 +2433,7 @@ reaper(SIGNAL_ARGS)
                        /*
                         * Startup succeeded, commence normal operations
                         */
+                       StartupStatus = STARTUP_NOT_RUNNING;
                        FatalError = false;
                        ReachedNormalRunning = true;
                        pmState = PM_RUN;
@@ -2756,7 +2769,10 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 
        /* Take care of the startup process too */
        if (pid == StartupPID)
+       {
                StartupPID = 0;
+               StartupStatus = STARTUP_CRASHED;
+       }
        else if (StartupPID != 0 && !FatalError)
        {
                ereport(DEBUG2,
@@ -2764,6 +2780,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
                                                                 (SendStop ? "SIGSTOP" : "SIGQUIT"),
                                                                 (int) StartupPID)));
                signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
+               StartupStatus = STARTUP_SIGNALED;
        }
 
        /* Take care of the bgwriter too */
@@ -3110,12 +3127,12 @@ PostmasterStateMachine(void)
        }
 
        /*
-        * If recovery failed, wait for all non-syslogger children to exit, and
-        * then exit postmaster. We don't try to reinitialize when recovery fails,
-        * because more than likely it will just fail again and we will keep
-        * trying forever.
+        * If the startup process failed, wait for all non-syslogger children to
+        * exit, and then exit postmaster.  We don't try to reinitialize when the
+        * startup process fails, because more than likely it will just fail again
+        * and we will keep trying forever.
         */
-       if (RecoveryError && pmState == PM_NO_CHILDREN)
+       if (pmState == PM_NO_CHILDREN && StartupStatus == STARTUP_CRASHED)
                ExitPostmaster(1);
 
        /*
@@ -3132,6 +3149,7 @@ PostmasterStateMachine(void)
 
                StartupPID = StartupDataBase();
                Assert(StartupPID != 0);
+               StartupStatus = STARTUP_RUNNING;
                pmState = PM_STARTUP;
        }
 }