Make checkpoint requests more robust.

author Tom Lane <tgl@sss.pgh.pa.us>

Tue, 19 Mar 2019 16:49:27 +0000 (12:49 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Tue, 19 Mar 2019 16:49:27 +0000 (12:49 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Tue, 19 Mar 2019 16:49:27 +0000 (12:49 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Tue, 19 Mar 2019 16:49:27 +0000 (12:49 -0400)
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c

index 6ad2427973bd1b686aac40b0883619fe68e1b6df..2d0a704c9f16c8428984ffb51adf13f90eae2443 100644 (file)
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -150,7 +150,6 @@ double              CheckPointCompletionTarget = 0.5;
   * Flags set by interrupt handlers for later service in the main loop.
   */
  static volatile sig_atomic_t got_SIGHUP = false;
-static volatile sig_atomic_t checkpoint_requested = false;
  static volatile sig_atomic_t shutdown_requested = false;
  
  /*
@@ -382,12 +381,6 @@ CheckpointerMain(void)
                          */
                         UpdateSharedMemoryConfig();
                 }
-               if (checkpoint_requested)
-               {
-                       checkpoint_requested = false;
-                       do_checkpoint = true;
-                       BgWriterStats.m_requested_checkpoints++;
-               }
                 if (shutdown_requested)
                 {
                         /*
@@ -401,6 +394,17 @@ CheckpointerMain(void)
                         proc_exit(0);           /* done */
                 }
  
+               /*
+                * Detect a pending checkpoint request by checking whether the flags
+                * word in shared memory is nonzero.  We shouldn't need to acquire the
+                * ckpt_lck for this.
+                */
+               if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags)
+               {
+                       do_checkpoint = true;
+                       BgWriterStats.m_requested_checkpoints++;
+               }
+
                 /*
                  * Force a checkpoint if too much time has elapsed since the last one.
                  * Note that we count a timed checkpoint in stats only when this
@@ -645,17 +649,14 @@ CheckArchiveTimeout(void)
  static bool
  ImmediateCheckpointRequested(void)
  {
-       if (checkpoint_requested)
-       {
-               volatile CheckpointerShmemStruct *cps = CheckpointerShmem;
+       volatile CheckpointerShmemStruct *cps = CheckpointerShmem;
  
-               /*
-                * We don't need to acquire the ckpt_lck in this case because we're
-                * only looking at a single flag bit.
-                */
-               if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE)
-                       return true;
-       }
+       /*
+        * We don't need to acquire the ckpt_lck in this case because we're only
+        * looking at a single flag bit.
+        */
+       if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE)
+               return true;
         return false;
  }
  
@@ -857,7 +858,10 @@ ReqCheckpointHandler(SIGNAL_ARGS)
  {
         int                     save_errno = errno;
  
-       checkpoint_requested = true;
+       /*
+        * The signalling process should have set ckpt_flags nonzero, so all we
+        * need do is ensure that our main loop gets kicked out of any wait.
+        */
         SetLatch(MyLatch);
  
         errno = save_errno;
@@ -996,31 +1000,35 @@ RequestCheckpoint(int flags)
  
         old_failed = CheckpointerShmem->ckpt_failed;
         old_started = CheckpointerShmem->ckpt_started;
-       CheckpointerShmem->ckpt_flags |= flags;
+       CheckpointerShmem->ckpt_flags |= (flags | CHECKPOINT_REQUESTED);
  
         SpinLockRelease(&CheckpointerShmem->ckpt_lck);
  
         /*
          * Send signal to request checkpoint.  It's possible that the checkpointer
          * hasn't started yet, or is in process of restarting, so we will retry a
-        * few times if needed.  Also, if not told to wait for the checkpoint to
-        * occur, we consider failure to send the signal to be nonfatal and merely
-        * LOG it.
+        * few times if needed.  (Actually, more than a few times, since on slow
+        * or overloaded buildfarm machines, it's been observed that the
+        * checkpointer can take several seconds to start.)  However, if not told
+        * to wait for the checkpoint to occur, we consider failure to send the
+        * signal to be nonfatal and merely LOG it.  The checkpointer should see
+        * the request when it does start, with or without getting a signal.
          */
+#define MAX_SIGNAL_TRIES 600   /* max wait 60.0 sec */
         for (ntries = 0;; ntries++)
         {
                 if (CheckpointerShmem->checkpointer_pid == 0)
                 {
-                       if (ntries >= 20)       /* max wait 2.0 sec */
+                       if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT))
                         {
                                 elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
-                                        "could not request checkpoint because checkpointer not running");
+                                        "could not signal for checkpoint: checkpointer is not running");
                                 break;
                         }
                 }
                 else if (kill(CheckpointerShmem->checkpointer_pid, SIGINT) != 0)
                 {
-                       if (ntries >= 20)       /* max wait 2.0 sec */
+                       if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT))
                         {
                                 elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
                                          "could not signal for checkpoint: %m");
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h

index 66bfb77295b17ade31a4b655a61a5caf4c23e283..8751d86cbd8eef61f42342476d316fc7e72bd3e1 100644 (file)
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -184,6 +184,8 @@ extern bool XLOG_DEBUG;
  /* These indicate the cause of a checkpoint request */
  #define CHECKPOINT_CAUSE_XLOG  0x0040  /* XLOG consumption */
  #define CHECKPOINT_CAUSE_TIME  0x0080  /* Elapsed time */
+/* We set this to ensure that ckpt_flags is not 0 if a request has been made */
+#define CHECKPOINT_REQUESTED   0x0100  /* Checkpoint request has been made */
  
  /*
   * Flag bits for the record being inserted, set using XLogSetRecordFlags().
author	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 19 Mar 2019 16:49:27 +0000 (12:49 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 19 Mar 2019 16:49:27 +0000 (12:49 -0400)
src/backend/postmaster/checkpointer.c		patch | blob | history
src/include/access/xlog.h		patch | blob | history