From e6a442c71b30f62e7b5eee6058afc961b1c7f29b Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 11 Jan 2008 00:54:09 +0000 Subject: [PATCH] Restructure the shutdown procedure for the archiver process to allow it to finish archiving everything (when there's no error), and to eliminate various hazards as best we can. This fixes a previous 8.3 patch that caused the postmaster to kill and then restart the archiver during shutdown (!?). The new behavior is that the archiver is allowed to run unmolested until the bgwriter has exited; then it is sent SIGUSR2 to tell it to do a final archiving cycle and quit. We only SIGQUIT the archiver if we want a panic stop; this is important since SIGQUIT will also be sent to any active archive_command. The postmaster also now doesn't SIGQUIT the stats collector until the bgwriter is done, since the bgwriter can send stats messages in 8.3. The postmaster will not exit until both the archiver and stats collector are gone; this provides some defense (not too bulletproof) against conflicting archiver or stats collector processes being started by a new postmaster instance. We continue the prior practice that the archiver will check for postmaster death immediately before issuing any archive_command; that gives some additional protection against conflicting archivers. Also, modify the archiver process to notice SIGTERM and refuse to issue any more archive commands if it gets it. The postmaster doesn't ever send it SIGTERM; we assume that any such signal came from init and is a notice of impending whole-system shutdown. In this situation it seems imprudent to try to start new archive commands --- if they aren't extremely quick they're likely to get SIGKILL'd by init. All per discussion. --- src/backend/postmaster/pgarch.c | 91 +++++++++++++++++++++----- src/backend/postmaster/postmaster.c | 99 ++++++++++++++++++----------- 2 files changed, 138 insertions(+), 52 deletions(-) diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 6cb32fcb60..e181950c0f 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -19,7 +19,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/pgarch.c,v 1.37 2008/01/01 19:45:51 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/pgarch.c,v 1.38 2008/01/11 00:54:08 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -77,12 +77,15 @@ * ---------- */ static time_t last_pgarch_start_time; +static time_t last_sigterm_time = 0; /* * Flags set by interrupt handlers for later service in the main loop. */ static volatile sig_atomic_t got_SIGHUP = false; +static volatile sig_atomic_t got_SIGTERM = false; static volatile sig_atomic_t wakened = false; +static volatile sig_atomic_t ready_to_stop = false; /* ---------- * Local function forward declarations @@ -95,7 +98,9 @@ static pid_t pgarch_forkexec(void); NON_EXEC_STATIC void PgArchiverMain(int argc, char *argv[]); static void pgarch_exit(SIGNAL_ARGS); static void ArchSigHupHandler(SIGNAL_ARGS); +static void ArchSigTermHandler(SIGNAL_ARGS); static void pgarch_waken(SIGNAL_ARGS); +static void pgarch_waken_stop(SIGNAL_ARGS); static void pgarch_MainLoop(void); static void pgarch_ArchiverCopyLoop(void); static bool pgarch_archiveXlog(char *xlog); @@ -236,16 +241,16 @@ PgArchiverMain(int argc, char *argv[]) /* * Ignore all signals usually bound to some action in the postmaster, - * except for SIGHUP, SIGUSR1 and SIGQUIT. + * except for SIGHUP, SIGTERM, SIGUSR1, SIGUSR2, and SIGQUIT. */ pqsignal(SIGHUP, ArchSigHupHandler); pqsignal(SIGINT, SIG_IGN); - pqsignal(SIGTERM, SIG_IGN); + pqsignal(SIGTERM, ArchSigTermHandler); pqsignal(SIGQUIT, pgarch_exit); pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, pgarch_waken); - pqsignal(SIGUSR2, SIG_IGN); + pqsignal(SIGUSR2, pgarch_waken_stop); pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); @@ -267,28 +272,47 @@ PgArchiverMain(int argc, char *argv[]) static void pgarch_exit(SIGNAL_ARGS) { - /* - * For now, we just nail the doors shut and get out of town. It might - * seem cleaner to finish up any pending archive copies, but there's a - * nontrivial risk that init will kill us partway through. - */ - exit(0); + /* SIGQUIT means curl up and die ... */ + exit(1); } -/* SIGHUP: set flag to re-read config file at next convenient time */ +/* SIGHUP signal handler for archiver process */ static void ArchSigHupHandler(SIGNAL_ARGS) { + /* set flag to re-read config file at next convenient time */ got_SIGHUP = true; } +/* SIGTERM signal handler for archiver process */ +static void +ArchSigTermHandler(SIGNAL_ARGS) +{ + /* + * The postmaster never sends us SIGTERM, so we assume that this means + * that init is trying to shut down the whole system. If we hang around + * too long we'll get SIGKILL'd. Set flag to prevent starting any more + * archive commands. + */ + got_SIGTERM = true; +} + /* SIGUSR1 signal handler for archiver process */ static void pgarch_waken(SIGNAL_ARGS) { + /* set flag that there is work to be done */ wakened = true; } +/* SIGUSR2 signal handler for archiver process */ +static void +pgarch_waken_stop(SIGNAL_ARGS) +{ + /* set flag to do a final cycle and shut down afterwards */ + ready_to_stop = true; +} + /* * pgarch_MainLoop * @@ -298,6 +322,7 @@ static void pgarch_MainLoop(void) { time_t last_copy_time = 0; + bool time_to_stop; /* * We run the copy loop immediately upon entry, in case there are @@ -309,6 +334,9 @@ pgarch_MainLoop(void) do { + /* When we get SIGUSR2, we do one more archive cycle, then exit */ + time_to_stop = ready_to_stop; + /* Check for config update */ if (got_SIGHUP) { @@ -316,8 +344,26 @@ pgarch_MainLoop(void) ProcessConfigFile(PGC_SIGHUP); } + /* + * If we've gotten SIGTERM, we normally just sit and do nothing until + * SIGUSR2 arrives. However, that means a random SIGTERM would + * disable archiving indefinitely, which doesn't seem like a good + * idea. If more than 60 seconds pass since SIGTERM, exit anyway, + * so that the postmaster can start a new archiver if needed. + */ + if (got_SIGTERM) + { + time_t curtime = time(NULL); + + if (last_sigterm_time == 0) + last_sigterm_time = curtime; + else if ((unsigned int) (curtime - last_sigterm_time) >= + (unsigned int) 60) + break; + } + /* Do what we're here for */ - if (wakened) + if (wakened || time_to_stop) { wakened = false; pgarch_ArchiverCopyLoop(); @@ -334,7 +380,8 @@ pgarch_MainLoop(void) * sleep into 1-second increments, and check for interrupts after each * nap. */ - while (!(wakened || got_SIGHUP)) + while (!(wakened || ready_to_stop || got_SIGHUP || + !PostmasterIsAlive(true))) { time_t curtime; @@ -344,7 +391,13 @@ pgarch_MainLoop(void) (unsigned int) PGARCH_AUTOWAKE_INTERVAL) wakened = true; } - } while (PostmasterIsAlive(true)); + + /* + * The archiver quits either when the postmaster dies (not expected) + * or after completing one more archiving cycle after receiving + * SIGUSR2. + */ + } while (PostmasterIsAlive(true) && !time_to_stop); } /* @@ -377,8 +430,14 @@ pgarch_ArchiverCopyLoop(void) for (;;) { - /* Abandon processing if we notice our postmaster has died */ - if (!PostmasterIsAlive(true)) + /* + * Do not initiate any more archive commands after receiving + * SIGTERM, nor after the postmaster has died unexpectedly. + * The first condition is to try to keep from having init + * SIGKILL the command, and the second is to avoid conflicts + * with another archiver spawned by a newer postmaster. + */ + if (got_SIGTERM || !PostmasterIsAlive(true)) return; if (pgarch_archiveXlog(xlog)) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index ba6c9b9183..fe1ed795f9 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -37,7 +37,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.550 2008/01/01 19:45:51 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.551 2008/01/11 00:54:09 tgl Exp $ * * NOTES * @@ -244,7 +244,7 @@ static bool FatalError = false; /* T if recovering from backend crash */ * Notice that this state variable does not distinguish *why* we entered * PM_WAIT_BACKENDS or later states --- Shutdown and FatalError must be * consulted to find that out. FatalError is never true in PM_RUN state, nor - * in PM_SHUTDOWN state (because we don't enter that state when trying to + * in PM_SHUTDOWN states (because we don't enter those states when trying to * recover from a crash). It can be true in PM_STARTUP state, because we * don't clear it until we've successfully recovered. */ @@ -255,6 +255,7 @@ typedef enum PM_RUN, /* normal "database is alive" state */ PM_WAIT_BACKENDS, /* waiting for live backends to exit */ PM_SHUTDOWN, /* waiting for bgwriter to do shutdown ckpt */ + PM_SHUTDOWN_2, /* waiting for archiver to finish */ PM_WAIT_DEAD_END, /* waiting for dead_end children to exit */ PM_NO_CHILDREN /* all important children have exited */ } PMState; @@ -1312,12 +1313,8 @@ ServerLoop(void) start_autovac_launcher = false; /* signal processed */ } - /* - * If we have lost the archiver, try to start a new one. We do this - * even if we are shutting down, to allow archiver to take care of any - * remaining WAL files. - */ - if (XLogArchivingActive() && PgArchPID == 0 && pmState >= PM_RUN) + /* If we have lost the archiver, try to start a new one */ + if (XLogArchivingActive() && PgArchPID == 0 && pmState == PM_RUN) PgArchPID = pgarch_start(); /* If we have lost the stats collector, try to start a new one */ @@ -2175,12 +2172,31 @@ reaper(SIGNAL_ARGS) * checkpoint. (If for some reason it didn't, recovery will * occur on next postmaster start.) * - * At this point we should have no normal children left (else - * we'd not be in PM_SHUTDOWN state) but we might have - * dead_end children. + * At this point we should have no normal backend children + * left (else we'd not be in PM_SHUTDOWN state) but we might + * have dead_end children to wait for. + * + * If we have an archiver subprocess, tell it to do a last + * archive cycle and quit; otherwise we can go directly to + * PM_WAIT_DEAD_END state. */ Assert(Shutdown > NoShutdown); - pmState = PM_WAIT_DEAD_END; + + if (PgArchPID != 0) + { + /* Waken archiver for the last time */ + signal_child(PgArchPID, SIGUSR2); + pmState = PM_SHUTDOWN_2; + } + else + pmState = PM_WAIT_DEAD_END; + + /* + * We can also shut down the stats collector now; there's + * nothing left for it to do. + */ + if (PgStatPID != 0) + signal_child(PgStatPID, SIGQUIT); } else { @@ -2227,7 +2243,8 @@ reaper(SIGNAL_ARGS) /* * Was it the archiver? If so, just try to start a new one; no need * to force reset of the rest of the system. (If fail, we'll try - * again in future cycles of the main loop.) + * again in future cycles of the main loop.) But if we were waiting + * for it to shut down, advance to the next shutdown step. */ if (pid == PgArchPID) { @@ -2235,8 +2252,10 @@ reaper(SIGNAL_ARGS) if (!EXIT_STATUS_0(exitstatus)) LogChildExit(LOG, _("archiver process"), pid, exitstatus); - if (XLogArchivingActive() && pmState >= PM_RUN) + if (XLogArchivingActive() && pmState == PM_RUN) PgArchPID = pgarch_start(); + else if (pmState == PM_SHUTDOWN_2) + pmState = PM_WAIT_DEAD_END; continue; } @@ -2563,6 +2582,11 @@ PostmasterStateMachine(void) * change causes ServerLoop to stop creating new ones. */ pmState = PM_WAIT_DEAD_END; + + /* + * We already SIGQUIT'd the archiver and stats processes, + * if any, when we entered FatalError state. + */ } else { @@ -2591,13 +2615,13 @@ PostmasterStateMachine(void) */ FatalError = true; pmState = PM_WAIT_DEAD_END; + + /* Kill the archiver and stats collector too */ + if (PgArchPID != 0) + signal_child(PgArchPID, SIGQUIT); + if (PgStatPID != 0) + signal_child(PgStatPID, SIGQUIT); } - /* Tell pgarch to shut down too; nothing left for it to do */ - if (PgArchPID != 0) - signal_child(PgArchPID, SIGQUIT); - /* Tell pgstat to shut down too; nothing left for it to do */ - if (PgStatPID != 0) - signal_child(PgStatPID, SIGQUIT); } } } @@ -2606,16 +2630,26 @@ PostmasterStateMachine(void) { /* * PM_WAIT_DEAD_END state ends when the BackendList is entirely empty - * (ie, no dead_end children remain). + * (ie, no dead_end children remain), and the archiver and stats + * collector are gone too. + * + * The reason we wait for those two is to protect them against a new + * postmaster starting conflicting subprocesses; this isn't an + * ironclad protection, but it at least helps in the + * shutdown-and-immediately-restart scenario. Note that they have + * already been sent appropriate shutdown signals, either during a + * normal state transition leading up to PM_WAIT_DEAD_END, or during + * FatalError processing. */ - if (!DLGetHead(BackendList)) + if (DLGetHead(BackendList) == NULL && + PgArchPID == 0 && PgStatPID == 0) { /* These other guys should be dead already */ Assert(StartupPID == 0); Assert(BgWriterPID == 0); Assert(WalWriterPID == 0); Assert(AutoVacPID == 0); - /* archiver, stats, and syslogger are not considered here */ + /* syslogger is not considered here */ pmState = PM_NO_CHILDREN; } } @@ -2628,14 +2662,9 @@ PostmasterStateMachine(void) * we got SIGTERM from init --- there may well not be time for recovery * before init decides to SIGKILL us.) * - * Note: we do not wait around for exit of the archiver or stats - * processes. They've been sent SIGQUIT by this point (either when we - * entered PM_SHUTDOWN state, or when we set FatalError, and at least one - * of those must have happened by now). In any case they contain logic to - * commit hara-kiri if they notice the postmaster is gone. Since they - * aren't connected to shared memory, they pose no problem for shutdown. - * The syslogger is not considered either, since it's intended to survive - * till the postmaster exits. + * Note that the syslogger continues to run. It will exit when it sees + * EOF on its input pipe, which happens when there are no more upstream + * processes. */ if (Shutdown > NoShutdown && pmState == PM_NO_CHILDREN) { @@ -2652,10 +2681,8 @@ PostmasterStateMachine(void) } /* - * If we need to recover from a crash, wait for all shmem-connected - * children to exit, then reset shmem and StartupDataBase. (We can ignore - * the archiver and stats processes here since they are not connected to - * shmem.) + * If we need to recover from a crash, wait for all non-syslogger + * children to exit, then reset shmem and StartupDataBase. */ if (FatalError && pmState == PM_NO_CHILDREN) { @@ -3782,7 +3809,7 @@ sigusr1_handler(SIGNAL_ARGS) } if (CheckPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER) && - PgArchPID != 0 && Shutdown <= SmartShutdown) + PgArchPID != 0) { /* * Send SIGUSR1 to archiver process, to wake it up and begin archiving -- 2.40.0