]> granicus.if.org Git - postgresql/commitdiff
Prevent freshly-started backend from ignoring SIGUSR1, per race condition
authorTom Lane <tgl@sss.pgh.pa.us>
Wed, 20 Dec 2000 21:51:52 +0000 (21:51 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Wed, 20 Dec 2000 21:51:52 +0000 (21:51 +0000)
observed by Inoue.  Also, don't call ProcRemove() from postmaster if we
have detected a backend crash --- too risky if shared memory is corrupted.
It's not needed anyway, considering we are going to reinitialize shared
memory and semaphores as soon as the last child is dead.

src/backend/postmaster/postmaster.c
src/backend/tcop/postgres.c

index e1c8021ff0c61b1fba92273b72fe391b881a9eb4..ef2896b3c2c506ebf8d82f9649c3e30c3e579e30 100644 (file)
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/postmaster/postmaster.c,v 1.200 2000/12/18 18:45:04 momjian Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/postmaster/postmaster.c,v 1.201 2000/12/20 21:51:52 tgl Exp $
  *
  * NOTES
  *
@@ -180,7 +180,7 @@ static time_t       checkpointed = 0;
 
 static int     Shutdown = NoShutdown;
 
-static bool FatalError = false;
+static bool FatalError = false;        /* T if recovering from backend crash */
 
 /*
  * State for assigning random salts and cancel keys.
@@ -649,7 +649,7 @@ PostmasterMain(int argc, char *argv[])
        pqsignal(SIGTERM, pmdie);       /* wait for children and ShutdownDataBase */
        pqsignal(SIGALRM, SIG_IGN); /* ignored */
        pqsignal(SIGPIPE, SIG_IGN); /* ignored */
-       pqsignal(SIGUSR1, SIG_IGN); /* ignored */
+       pqsignal(SIGUSR1, pmdie);       /* currently ignored, but see note in pmdie */
        pqsignal(SIGUSR2, pmdie);       /* send SIGUSR2, don't die */
        pqsignal(SIGCHLD, reaper);      /* handle child termination */
        pqsignal(SIGTTIN, SIG_IGN); /* ignored */
@@ -1329,6 +1329,18 @@ pmdie(SIGNAL_ARGS)
 
        switch (postgres_signal_arg)
        {
+               case SIGUSR1:
+                       /*
+                        * Currently the postmaster ignores SIGUSR1 (maybe it should
+                        * do something useful instead?)  But we must have some handler
+                        * installed for SIGUSR1, not just set it to SIG_IGN.  Else, a
+                        * freshly spawned backend would likewise have it set to SIG_IGN,
+                        * which would mean the backend would ignore any attempt to kill
+                        * it before it had gotten as far as setting up its own handler.
+                        */
+                       errno = save_errno;
+                       return;
+
                case SIGUSR2:
 
                        /*
@@ -1511,7 +1523,7 @@ reaper(SIGNAL_ARGS)
                                ExitPostmaster(1);
                        }
                        StartupPID = 0;
-                       FatalError = false;
+                       FatalError = false;     /* done with recovery */
                        if (Shutdown > NoShutdown)
                        {
                                if (ShutdownPID > 0)
@@ -1539,12 +1551,7 @@ reaper(SIGNAL_ARGS)
                /*
                 * Wait for all children exit, then reset shmem and StartupDataBase.
                 */
-               if (DLGetHead(BackendList))
-               {
-                       errno = save_errno;
-                       return;
-               }
-               if (StartupPID > 0 || ShutdownPID > 0)
+               if (DLGetHead(BackendList) || StartupPID > 0 || ShutdownPID > 0)
                {
                        errno = save_errno;
                        return;
@@ -1595,13 +1602,10 @@ CleanupProc(int pid,
        Dlelem     *curr,
                           *next;
        Backend    *bp;
-       int                     sig;
 
        if (DebugLvl)
-       {
                fprintf(stderr, "%s: CleanupProc: pid %d exited with status %d\n",
                                progname, pid, exitstatus);
-       }
 
        /*
         * If a backend dies in an ugly way (i.e. exit status not 0) then we
@@ -1609,7 +1613,7 @@ CleanupProc(int pid,
         * we assume everything is hunky dory and simply remove the backend
         * from the active backend list.
         */
-       if (!exitstatus)
+       if (exitstatus == 0)
        {
                curr = DLGetHead(BackendList);
                while (curr)
@@ -1628,66 +1632,64 @@ CleanupProc(int pid,
                if (pid == CheckPointPID)
                {
                        CheckPointPID = 0;
-                       checkpointed = time(NULL);
+                       if (!FatalError)
+                               checkpointed = time(NULL);
                }
                else
-                       ProcRemove(pid);
+               {
+                       /* Why is this done here, and not by the backend itself? */
+                       if (!FatalError)
+                               ProcRemove(pid);
+               }
 
                return;
        }
 
        if (!FatalError)
        {
+               /* Make log entry unless we did so already */
                tnow = time(NULL);
                fprintf(stderr, "Server process (pid %d) exited with status %d at %s"
                                "Terminating any active server processes...\n",
                                pid, exitstatus, ctime(&tnow));
                fflush(stderr);
        }
-       FatalError = true;
+
        curr = DLGetHead(BackendList);
        while (curr)
        {
                next = DLGetSucc(curr);
                bp = (Backend *) DLE_VAL(curr);
-
-               /*
-                * SIGUSR1 is the special signal that says exit without proc_exit
-                * and let the user know what's going on. ProcSemaphoreKill()
-                * cleans up the backends semaphore.  If SendStop is set (-s on
-                * command line), then we send a SIGSTOP so that we can core dumps
-                * from all backends by hand.
-                */
-               sig = (SendStop) ? SIGSTOP : SIGUSR1;
                if (bp->pid != pid)
                {
-                       if (DebugLvl)
-                               fprintf(stderr, "%s: CleanupProc: sending %s to process %d\n",
-                                               progname,
-                                               (sig == SIGUSR1)
-                                               ? "SIGUSR1" : "SIGSTOP",
-                                               bp->pid);
-                       kill(bp->pid, sig);
+                       /*
+                        * This backend is still alive.  Unless we did so already,
+                        * tell it to commit hara-kiri.
+                        *
+                        * SIGUSR1 is the special signal that says exit without proc_exit
+                        * and let the user know what's going on. But if SendStop is set
+                        * (-s on command line), then we send SIGSTOP instead, so that we
+                        * can get core dumps from all backends by hand.
+                        */
+                       if (!FatalError)
+                       {
+                               if (DebugLvl)
+                                       fprintf(stderr, "%s: CleanupProc: sending %s to process %d\n",
+                                                       progname,
+                                                       (SendStop ? "SIGSTOP" : "SIGUSR1"),
+                                                       bp->pid);
+                               kill(bp->pid, (SendStop ? SIGSTOP : SIGUSR1));
+                       }
                }
                else
                {
-
                        /*
-                        * I don't like that we call ProcRemove() here, assuming that
-                        * shmem may be corrupted! But is there another way to free
-                        * backend semaphores? Actually, I believe that we need not in
-                        * per backend semaphore at all (we use them to wait on lock
-                        * only, couldn't we just sigpause?), so probably we'll remove
-                        * this call from here someday.  -- vadim 04-10-1999
+                        * Found entry for freshly-dead backend, so remove it.
+                        *
+                        * Don't call ProcRemove() here, since shmem may be corrupted!
+                        * We are going to reinitialize shmem and semaphores anyway
+                        * once all the children are dead, so no need for it.
                         */
-                       if (pid == CheckPointPID)
-                       {
-                               CheckPointPID = 0;
-                               checkpointed = 0;
-                       }
-                       else
-                               ProcRemove(pid);
-
                        DLRemove(curr);
                        free(bp);
                        DLFreeElem(curr);
@@ -1695,6 +1697,13 @@ CleanupProc(int pid,
                curr = next;
        }
 
+       if (pid == CheckPointPID)
+       {
+               CheckPointPID = 0;
+               checkpointed = 0;
+       }
+
+       FatalError = true;
 }
 
 /*
index 2946c9a5ef4150e67523c431bdb4ef7c92c7741a..a8b7b4ec1bc2cfe87c9d50c7bdcea4f747877fd4 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/tcop/postgres.c,v 1.197 2000/12/18 18:45:05 momjian Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/tcop/postgres.c,v 1.198 2000/12/20 21:51:52 tgl Exp $
  *
  * NOTES
  *       this is the "main" module of the postgres backend and
@@ -1462,21 +1462,12 @@ PostgresMain(int argc, char *argv[], int real_argc, char *real_argv[], const cha
        Assert(DataDir);
 
        /*
-        * 1. Set BlockSig and UnBlockSig masks. 2. Set up signal handlers. 3.
-        * Allow only SIGUSR1 signal (we never block it) during
-        * initialization.
+        * Set up signal handlers and masks.
         *
-        * Note that postmaster already blocked ALL signals to make us happy.
+        * Note that postmaster blocked all signals before forking child process,
+        * so there is no race condition whereby we might receive a signal before
+        * we have set up the handler.
         */
-       pqinitmask();
-
-#ifdef HAVE_SIGPROCMASK
-       sigdelset(&BlockSig, SIGUSR1);
-#else
-       BlockSig &= ~(sigmask(SIGUSR1));
-#endif
-
-       PG_SETMASK(&BlockSig);          /* block everything except SIGUSR1 */
 
        pqsignal(SIGHUP, SigHupHandler);        /* set flag to read config file */
        pqsignal(SIGINT, QueryCancelHandler);           /* cancel current query */
@@ -1499,6 +1490,17 @@ PostgresMain(int argc, char *argv[], int real_argc, char *real_argv[], const cha
        pqsignal(SIGTTOU, SIG_DFL);
        pqsignal(SIGCONT, SIG_DFL);
 
+       pqinitmask();
+
+       /* We allow SIGUSR1 (quickdie) at all times */
+#ifdef HAVE_SIGPROCMASK
+       sigdelset(&BlockSig, SIGUSR1);
+#else
+       BlockSig &= ~(sigmask(SIGUSR1));
+#endif
+
+       PG_SETMASK(&BlockSig);          /* block everything except SIGUSR1 */
+
 
        if (IsUnderPostmaster)
        {
@@ -1649,7 +1651,7 @@ PostgresMain(int argc, char *argv[], int real_argc, char *real_argv[], const cha
        if (!IsUnderPostmaster)
        {
                puts("\nPOSTGRES backend interactive interface ");
-               puts("$Revision: 1.197 $ $Date: 2000/12/18 18:45:05 $\n");
+               puts("$Revision: 1.198 $ $Date: 2000/12/20 21:51:52 $\n");
        }
 
        /*