]> granicus.if.org Git - postgresql/commitdiff
Fix postmaster's handling of a startup-process crash.
authorTom Lane <tgl@sss.pgh.pa.us>
Thu, 9 Jul 2015 17:22:23 +0000 (13:22 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Thu, 9 Jul 2015 17:22:23 +0000 (13:22 -0400)
Ordinarily, a failure (unexpected exit status) of the startup subprocess
should be considered fatal, so the postmaster should just close up shop
and quit.  However, if we sent the startup process a SIGQUIT or SIGKILL
signal, the failure is hardly "unexpected", and we should attempt restart;
this is necessary for recovery from ordinary backend crashes in hot-standby
scenarios.  I attempted to implement the latter rule with a two-line patch
in commit 442231d7f71764b8c628044e7ce2225f9aa43b67, but it now emerges that
that patch was a few bricks shy of a load: it failed to distinguish the
case of a signaled startup process from the case where the new startup
process crashes before reaching database consistency.  That resulted in
infinitely respawning a new startup process only to have it crash again.

To handle this properly, we really must track whether we have sent the
*current* startup process a kill signal.  Rather than add yet another
ad-hoc boolean to the postmaster's state, I chose to unify this with the
existing RecoveryError flag into an enum tracking the startup process's
state.  That seems more consistent with the postmaster's general state
machine design.

Back-patch to 9.0, like the previous patch.

src/backend/postmaster/postmaster.c

index df8037b498d489648c15c1ffde8b35fd33d7c460..1bb3138a03ab0514e265edb2d792397aa9ac38cc 100644 (file)
@@ -249,6 +249,17 @@ static pid_t StartupPID = 0,
                        PgStatPID = 0,
                        SysLoggerPID = 0;
 
+/* Startup process's status */
+typedef enum
+{
+       STARTUP_NOT_RUNNING,
+       STARTUP_RUNNING,
+       STARTUP_SIGNALED,                       /* we sent it a SIGQUIT or SIGKILL */
+       STARTUP_CRASHED
+} StartupStatusEnum;
+
+static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;
+
 /* Startup/shutdown state */
 #define                        NoShutdown              0
 #define                        SmartShutdown   1
@@ -258,7 +269,6 @@ static pid_t StartupPID = 0,
 static int     Shutdown = NoShutdown;
 
 static bool FatalError = false; /* T if recovering from backend crash */
-static bool RecoveryError = false;             /* T if WAL recovery failed */
 
 /*
  * We use a simple state machine to control startup, shutdown, and
@@ -301,8 +311,6 @@ static bool RecoveryError = false;          /* T if WAL recovery failed */
  * states, nor in PM_SHUTDOWN states (because we don't enter those states
  * when trying to recover from a crash).  It can be true in PM_STARTUP state,
  * because we don't clear it until we've successfully started WAL redo.
- * Similarly, RecoveryError means that we have crashed during recovery, and
- * should not try to restart.
  */
 typedef enum
 {
@@ -1246,6 +1254,7 @@ PostmasterMain(int argc, char *argv[])
         */
        StartupPID = StartupDataBase();
        Assert(StartupPID != 0);
+       StartupStatus = STARTUP_RUNNING;
        pmState = PM_STARTUP;
 
        /* Some workers may be scheduled to start now */
@@ -1666,7 +1675,7 @@ ServerLoop(void)
 
                /* If we have lost the archiver, try to start a new one. */
                if (PgArchPID == 0 && PgArchStartupAllowed())
-                               PgArchPID = pgarch_start();
+                       PgArchPID = pgarch_start();
 
                /* If we need to signal the autovacuum launcher, do so now */
                if (avlauncher_needs_signal)
@@ -2591,6 +2600,7 @@ reaper(SIGNAL_ARGS)
                        if (Shutdown > NoShutdown &&
                                (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus)))
                        {
+                               StartupStatus = STARTUP_NOT_RUNNING;
                                pmState = PM_WAIT_BACKENDS;
                                /* PostmasterStateMachine logic does the rest */
                                continue;
@@ -2600,6 +2610,7 @@ reaper(SIGNAL_ARGS)
                        {
                                ereport(LOG,
                                                (errmsg("shutdown at recovery target")));
+                               StartupStatus = STARTUP_NOT_RUNNING;
                                Shutdown = SmartShutdown;
                                TerminateChildren(SIGTERM);
                                pmState = PM_WAIT_BACKENDS;
@@ -2624,16 +2635,18 @@ reaper(SIGNAL_ARGS)
                        /*
                         * After PM_STARTUP, any unexpected exit (including FATAL exit) of
                         * the startup process is catastrophic, so kill other children,
-                        * and set RecoveryError so we don't try to reinitialize after
-                        * they're gone.  Exception: if FatalError is already set, that
-                        * implies we previously sent the startup process a SIGQUIT, so
+                        * and set StartupStatus so we don't try to reinitialize after
+                        * they're gone.  Exception: if StartupStatus is STARTUP_SIGNALED,
+                        * then we previously sent the startup process a SIGQUIT; so
                         * that's probably the reason it died, and we do want to try to
                         * restart in that case.
                         */
                        if (!EXIT_STATUS_0(exitstatus))
                        {
-                               if (!FatalError)
-                                       RecoveryError = true;
+                               if (StartupStatus == STARTUP_SIGNALED)
+                                       StartupStatus = STARTUP_NOT_RUNNING;
+                               else
+                                       StartupStatus = STARTUP_CRASHED;
                                HandleChildCrash(pid, exitstatus,
                                                                 _("startup process"));
                                continue;
@@ -2642,6 +2655,7 @@ reaper(SIGNAL_ARGS)
                        /*
                         * Startup succeeded, commence normal operations
                         */
+                       StartupStatus = STARTUP_NOT_RUNNING;
                        FatalError = false;
                        Assert(AbortStartTime == 0);
                        ReachedNormalRunning = true;
@@ -2962,7 +2976,7 @@ CleanupBackgroundWorker(int pid,
                ReportBackgroundWorkerPID(rw);  /* report child death */
 
                LogChildExit(EXIT_STATUS_0(exitstatus) ? DEBUG1 : LOG,
-                                                           namebuf, pid, exitstatus);
+                                        namebuf, pid, exitstatus);
 
                return true;
        }
@@ -3190,7 +3204,10 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 
        /* Take care of the startup process too */
        if (pid == StartupPID)
+       {
                StartupPID = 0;
+               StartupStatus = STARTUP_CRASHED;
+       }
        else if (StartupPID != 0 && take_action)
        {
                ereport(DEBUG2,
@@ -3198,6 +3215,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
                                                                 (SendStop ? "SIGSTOP" : "SIGQUIT"),
                                                                 (int) StartupPID)));
                signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
+               StartupStatus = STARTUP_SIGNALED;
        }
 
        /* Take care of the bgwriter too */
@@ -3589,13 +3607,14 @@ PostmasterStateMachine(void)
        }
 
        /*
-        * If recovery failed, or the user does not want an automatic restart
-        * after backend crashes, wait for all non-syslogger children to exit, and
-        * then exit postmaster. We don't try to reinitialize when recovery fails,
-        * because more than likely it will just fail again and we will keep
-        * trying forever.
+        * If the startup process failed, or the user does not want an automatic
+        * restart after backend crashes, wait for all non-syslogger children to
+        * exit, and then exit postmaster.  We don't try to reinitialize when the
+        * startup process fails, because more than likely it will just fail again
+        * and we will keep trying forever.
         */
-       if (pmState == PM_NO_CHILDREN && (RecoveryError || !restart_after_crash))
+       if (pmState == PM_NO_CHILDREN &&
+               (StartupStatus == STARTUP_CRASHED || !restart_after_crash))
                ExitPostmaster(1);
 
        /*
@@ -3615,6 +3634,7 @@ PostmasterStateMachine(void)
 
                StartupPID = StartupDataBase();
                Assert(StartupPID != 0);
+               StartupStatus = STARTUP_RUNNING;
                pmState = PM_STARTUP;
                /* crash recovery started, reset SIGKILL flag */
                AbortStartTime = 0;
@@ -3746,7 +3766,11 @@ TerminateChildren(int signal)
 {
        SignalChildren(signal);
        if (StartupPID != 0)
+       {
                signal_child(StartupPID, signal);
+               if (signal == SIGQUIT || signal == SIGKILL)
+                       StartupStatus = STARTUP_SIGNALED;
+       }
        if (BgWriterPID != 0)
                signal_child(BgWriterPID, signal);
        if (CheckpointerPID != 0)