Replace checkpoint_segments with min_wal_size and max_wal_size.

author Heikki Linnakangas <heikki.linnakangas@iki.fi>

Mon, 23 Feb 2015 16:53:02 +0000 (18:53 +0200)

committer Heikki Linnakangas <heikki.linnakangas@iki.fi>

Mon, 23 Feb 2015 16:53:02 +0000 (18:53 +0200)
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Mon, 23 Feb 2015 16:53:02 +0000 (18:53 +0200)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Mon, 23 Feb 2015 16:53:02 +0000 (18:53 +0200)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml

index a3917aac7855bc3c71f7e6bb6b3c78078d981314..5ada5c8a1c2d092b9881cfee9925f4c1750f3a39 100644 (file)
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1325,7 +1325,7 @@ include_dir 'conf.d'
          40% of RAM to <varname>shared_buffers</varname> will work better than a
          smaller amount.  Larger settings for <varname>shared_buffers</varname>
          usually require a corresponding increase in
-        <varname>checkpoint_segments</varname>, in order to spread out the
+        <varname>max_wal_size</varname>, in order to spread out the
          process of writing large quantities of new or changed data over a
          longer period of time.
         </para>
@@ -2394,18 +2394,20 @@ include_dir 'conf.d'
       <title>Checkpoints</title>
  
      <variablelist>
-     <varlistentry id="guc-checkpoint-segments" xreflabel="checkpoint_segments">
-      <term><varname>checkpoint_segments</varname> (<type>integer</type>)
+     <varlistentry id="guc-max-wal-size" xreflabel="max_wal_size">
+      <term><varname>max_wal_size</varname> (<type>integer</type>)</term>
        <indexterm>
-       <primary><varname>checkpoint_segments</> configuration parameter</primary>
+       <primary><varname>max_wal_size</> configuration parameter</primary>
        </indexterm>
-      </term>
        <listitem>
         <para>
-        Maximum number of log file segments between automatic WAL
-        checkpoints (each segment is normally 16 megabytes). The default
-        is three segments.  Increasing this parameter can increase the
-        amount of time needed for crash recovery.
+        Maximum size to let the WAL grow to between automatic WAL
+        checkpoints. This is a soft limit; WAL size can exceed
+        <varname>max_wal_size</> under special circumstances, like
+        under heavy load, a failing <varname>archive_command</>, or a high
+        <varname>wal_keep_segments</> setting. The default is 128 MB.
+        Increasing this parameter can increase the amount of time needed for
+        crash recovery.
          This parameter can only be set in the <filename>postgresql.conf</>
          file or on the server command line.
         </para>
@@ -2458,7 +2460,7 @@ include_dir 'conf.d'
          Write a message to the server log if checkpoints caused by
          the filling of checkpoint segment files happen closer together
          than this many seconds (which suggests that
-        <varname>checkpoint_segments</> ought to be raised).  The default is
+        <varname>max_wal_size</> ought to be raised).  The default is
          30 seconds (<literal>30s</>).  Zero disables the warning.
          No warnings will be generated if <varname>checkpoint_timeout</varname>
          is less than <varname>checkpoint_warning</varname>.
@@ -2468,6 +2470,24 @@ include_dir 'conf.d'
        </listitem>
       </varlistentry>
  
+     <varlistentry id="guc-min-wal-size" xreflabel="min_wal_size">
+      <term><varname>min_wal_size</varname> (<type>integer</type>)</term>
+      <indexterm>
+       <primary><varname>min_wal_size</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        As long as WAL disk usage stays below this setting, old WAL files are
+        always recycled for future use at a checkpoint, rather than removed.
+        This can be used to ensure that enough WAL space is reserved to
+        handle spikes in WAL usage, for example when running large batch
+        jobs. The default is 80 MB.
+        This parameter can only be set in the <filename>postgresql.conf</>
+        file or on the server command line.
+       </para>
+      </listitem>
+     </varlistentry>
+
       </variablelist>
       </sect2>
       <sect2 id="runtime-config-wal-archiving">
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml

index 5a087fbe6a098f8bfb2ac55c99cf1e767a486409..c73580ed460c77806203dffd3ad7fb22b66597f7 100644 (file)
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -1328,19 +1328,19 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
     </para>
    </sect2>
  
-  <sect2 id="populate-checkpoint-segments">
-   <title>Increase <varname>checkpoint_segments</varname></title>
+  <sect2 id="populate-max-wal-size">
+   <title>Increase <varname>max_wal_size</varname></title>
  
     <para>
-    Temporarily increasing the <xref
-    linkend="guc-checkpoint-segments"> configuration variable can also
+    Temporarily increasing the <xref linkend="guc-max-wal-size">
+    configuration variable can also
      make large data loads faster.  This is because loading a large
      amount of data into <productname>PostgreSQL</productname> will
      cause checkpoints to occur more often than the normal checkpoint
      frequency (specified by the <varname>checkpoint_timeout</varname>
      configuration variable). Whenever a checkpoint occurs, all dirty
      pages must be flushed to disk. By increasing
-    <varname>checkpoint_segments</varname> temporarily during bulk
+    <varname>max_wal_size</varname> temporarily during bulk
      data loads, the number of checkpoints that are required can be
      reduced.
     </para>
@@ -1445,7 +1445,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
        <para>
         Set appropriate (i.e., larger than normal) values for
         <varname>maintenance_work_mem</varname> and
-       <varname>checkpoint_segments</varname>.
+       <varname>max_wal_size</varname>.
        </para>
       </listitem>
       <listitem>
@@ -1512,7 +1512,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
  
      So when loading a data-only dump, it is up to you to drop and recreate
      indexes and foreign keys if you wish to use those techniques.
-    It's still useful to increase <varname>checkpoint_segments</varname>
+    It's still useful to increase <varname>max_wal_size</varname>
      while loading the data, but don't bother increasing
      <varname>maintenance_work_mem</varname>; rather, you'd do that while
      manually recreating indexes and foreign keys afterwards.
@@ -1577,7 +1577,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
  
       <listitem>
        <para>
-       Increase <xref linkend="guc-checkpoint-segments"> and <xref
+       Increase <xref linkend="guc-max-wal-size"> and <xref
         linkend="guc-checkpoint-timeout"> ; this reduces the frequency
         of checkpoints, but increases the storage requirements of
         <filename>/pg_xlog</>.
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml

index 1254c03f80e3315198803575ea768f6eb413d8e0..b57749fdbc3f79947cf50b0834c42efdc7c41e91 100644 (file)
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -472,9 +472,10 @@
    <para>
     The server's checkpointer process automatically performs
     a checkpoint every so often.  A checkpoint is begun every <xref
-   linkend="guc-checkpoint-segments"> log segments, or every <xref
-   linkend="guc-checkpoint-timeout"> seconds, whichever comes first.
-   The default settings are 3 segments and 300 seconds (5 minutes), respectively.
+   linkend="guc-checkpoint-timeout"> seconds, or if
+   <xref linkend="guc-max-wal-size"> is about to be exceeded,
+   whichever comes first.
+   The default settings are 5 minutes and 128 MB, respectively.
     If no WAL has been written since the previous checkpoint, new checkpoints
     will be skipped even if <varname>checkpoint_timeout</> has passed.
     (If WAL archiving is being used and you want to put a lower limit on how
@@ -486,8 +487,8 @@
    </para>
  
    <para>
-   Reducing <varname>checkpoint_segments</varname> and/or
-   <varname>checkpoint_timeout</varname> causes checkpoints to occur
+   Reducing <varname>checkpoint_timeout</varname> and/or
+   <varname>max_wal_size</varname> causes checkpoints to occur
     more often. This allows faster after-crash recovery, since less work
     will need to be redone. However, one must balance this against the
     increased cost of flushing dirty data pages more often. If
@@ -510,11 +511,11 @@
     parameter.  If checkpoints happen closer together than
     <varname>checkpoint_warning</> seconds,
     a message will be output to the server log recommending increasing
-   <varname>checkpoint_segments</varname>.  Occasional appearance of such
+   <varname>max_wal_size</varname>.  Occasional appearance of such
     a message is not cause for alarm, but if it appears often then the
     checkpoint control parameters should be increased. Bulk operations such
     as large <command>COPY</> transfers might cause a number of such warnings
-   to appear if you have not set <varname>checkpoint_segments</> high
+   to appear if you have not set <varname>max_wal_size</> high
     enough.
    </para>
  
@@ -525,10 +526,10 @@
     <xref linkend="guc-checkpoint-completion-target">, which is
     given as a fraction of the checkpoint interval.
     The I/O rate is adjusted so that the checkpoint finishes when the
-   given fraction of <varname>checkpoint_segments</varname> WAL segments
-   have been consumed since checkpoint start, or the given fraction of
-   <varname>checkpoint_timeout</varname> seconds have elapsed,
-   whichever is sooner.  With the default value of 0.5,
+   given fraction of
+   <varname>checkpoint_timeout</varname> seconds have elapsed, or before
+   <varname>max_wal_size</varname> is exceeded, whichever is sooner.
+   With the default value of 0.5,
     <productname>PostgreSQL</> can be expected to complete each checkpoint
     in about half the time before the next checkpoint starts.  On a system
     that's very close to maximum I/O throughput during normal operation,
@@ -545,18 +546,35 @@
    </para>
  
    <para>
-   There will always be at least one WAL segment file, and will normally
-   not be more than (2 + <varname>checkpoint_completion_target</varname>) * <varname>checkpoint_segments</varname> + 1
-   or <varname>checkpoint_segments</> + <xref linkend="guc-wal-keep-segments"> + 1
-   files.  Each segment file is normally 16 MB (though this size can be
-   altered when building the server).  You can use this to estimate space
-   requirements for <acronym>WAL</acronym>.
-   Ordinarily, when old log segment files are no longer needed, they
-   are recycled (that is, renamed to become future segments in the numbered
-   sequence). If, due to a short-term peak of log output rate, there
-   are more than 3 * <varname>checkpoint_segments</varname> + 1
-   segment files, the unneeded segment files will be deleted instead
-   of recycled until the system gets back under this limit.
+   The number of WAL segment files in the <filename>pg_xlog</> directory depends on
+   <varname>min_wal_size</>, <varname>max_wal_size</> and
+   the amount of WAL generated in previous checkpoint cycles. When old log
+   segment files are no longer needed, they are removed or recycled (that is,
+   renamed to become future segments in the numbered sequence). If, due to a
+   short-term peak of log output rate, <varname>max_wal_size</> is
+   exceeded, the unneeded segment files will be removed until the system
+   gets back under this limit. Below that limit, the system recycles enough
+   WAL files to cover the estimated need until the next checkpoint, and
+   removes the rest. The estimate is based on a moving average of the number
+   of WAL files used in previous checkpoint cycles. The moving average
+   is increased immediately if the actual usage exceeds the estimate, so it
+   accommodates peak usage rather than average usage to some extent.
+   <varname>min_wal_size</> puts a minimum on the amount of WAL files
+   recycled for future usage; that much WAL is always recycled for future use,
+   even if the system is idle and the WAL usage estimate suggests that little
+   WAL is needed.
+  </para>
+
+  <para>
+   Independently of <varname>max_wal_size</varname>,
+   <xref linkend="guc-wal-keep-segments"> + 1 most recent WAL files are
+   kept at all times. Also, if WAL archiving is used, old segments cannot be
+   removed or recycled until they are archived. If WAL archiving cannot keep up
+   with the pace that WAL is generated, or if <varname>archive_command</varname>
+   fails repeatedly, old WAL files will accumulate in <filename>pg_xlog</>
+   until the situation is resolved. A slow or failed standby server that
+   uses a replication slot will have the same effect (see
+   <xref linkend="streaming-replication-slots">).
    </para>
  
    <para>
@@ -571,9 +589,8 @@
     master because restartpoints can only be performed at checkpoint records.
     A restartpoint is triggered when a checkpoint record is reached if at
     least <varname>checkpoint_timeout</> seconds have passed since the last
-   restartpoint. In standby mode, a restartpoint is also triggered if at
-   least <varname>checkpoint_segments</> log segments have been replayed
-   since the last restartpoint.
+   restartpoint, or if WAL size is about to exceed
+   <varname>max_wal_size</>.
    </para>
  
    <para>
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c

index f68f82b255c0608d7a7f958865a2abe3d179911c..a28155f977d67bac2fe8fe05461c730ea93cbca1 100644 (file)
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -79,7 +79,8 @@ extern uint32 bootstrap_data_checksum_version;
  
  
  /* User-settable parameters */
-int                    CheckPointSegments = 3;
+int                    max_wal_size = 8;               /* 128 MB */
+int                    min_wal_size = 5;               /* 80 MB */
  int                    wal_keep_segments = 0;
  int                    XLOGbuffers = -1;
  int                    XLogArchiveTimeout = 0;
@@ -107,18 +108,14 @@ bool              XLOG_DEBUG = false;
  #define NUM_XLOGINSERT_LOCKS  8
  
  /*
- * XLOGfileslop is the maximum number of preallocated future XLOG segments.
- * When we are done with an old XLOG segment file, we will recycle it as a
- * future XLOG segment as long as there aren't already XLOGfileslop future
- * segments; else we'll delete it.  This could be made a separate GUC
- * variable, but at present I think it's sufficient to hardwire it as
- * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
- * no more than 2*CheckPointSegments log segments, and we want to recycle all
- * of them; the +1 allows boundary cases to happen without wasting a
- * delete/create-segment cycle.
+ * Max distance from last checkpoint, before triggering a new xlog-based
+ * checkpoint.
   */
-#define XLOGfileslop   (2*CheckPointSegments + 1)
+int                    CheckPointSegments;
  
+/* Estimated distance between checkpoints, in bytes */
+static double CheckPointDistanceEstimate = 0;
+static double PrevCheckPointDistance = 0;
  
  /*
   * GUC support
@@ -779,7 +776,7 @@ static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
  static bool XLogCheckpointNeeded(XLogSegNo new_segno);
  static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
  static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
-                                          bool find_free, int *max_advance,
+                                          bool find_free, XLogSegNo max_segno,
                                            bool use_lock);
  static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
                          int source, bool notexistOk);
@@ -792,7 +789,7 @@ static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
  static int     emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
  static void XLogFileClose(void);
  static void PreallocXlogFiles(XLogRecPtr endptr);
-static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
+static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
  static void UpdateLastRemovedPtr(char *filename);
  static void ValidateXLOGDirectoryStructure(void);
  static void CleanupBackupHistory(void);
@@ -1958,6 +1955,104 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
  #endif
  }
  
+/*
+ * Calculate CheckPointSegments based on max_wal_size and
+ * checkpoint_completion_target.
+ */
+static void
+CalculateCheckpointSegments(void)
+{
+       double          target;
+
+       /*-------
+        * Calculate the distance at which to trigger a checkpoint, to avoid
+        * exceeding max_wal_size. This is based on two assumptions:
+        *
+        * a) we keep WAL for two checkpoint cycles, back to the "prev" checkpoint.
+        * b) during checkpoint, we consume checkpoint_completion_target *
+        *    number of segments consumed between checkpoints.
+        *-------
+        */
+       target = (double ) max_wal_size / (2.0 + CheckPointCompletionTarget);
+
+       /* round down */
+       CheckPointSegments = (int) target;
+
+       if (CheckPointSegments < 1)
+               CheckPointSegments = 1;
+}
+
+void
+assign_max_wal_size(int newval, void *extra)
+{
+       max_wal_size = newval;
+       CalculateCheckpointSegments();
+}
+
+void
+assign_checkpoint_completion_target(double newval, void *extra)
+{
+       CheckPointCompletionTarget = newval;
+       CalculateCheckpointSegments();
+}
+
+/*
+ * At a checkpoint, how many WAL segments to recycle as preallocated future
+ * XLOG segments? Returns the highest segment that should be preallocated.
+ */
+static XLogSegNo
+XLOGfileslop(XLogRecPtr PriorRedoPtr)
+{
+       XLogSegNo       minSegNo;
+       XLogSegNo       maxSegNo;
+       double          distance;
+       XLogSegNo       recycleSegNo;
+
+       /*
+        * Calculate the segment numbers that min_wal_size and max_wal_size
+        * correspond to. Always recycle enough segments to meet the minimum, and
+        * remove enough segments to stay below the maximum.
+        */
+       minSegNo = PriorRedoPtr / XLOG_SEG_SIZE + min_wal_size - 1;
+       maxSegNo =  PriorRedoPtr / XLOG_SEG_SIZE + max_wal_size - 1;
+
+       /*
+        * Between those limits, recycle enough segments to get us through to the
+        * estimated end of next checkpoint.
+        *
+        * To estimate where the next checkpoint will finish, assume that the
+        * system runs steadily consuming CheckPointDistanceEstimate
+        * bytes between every checkpoint.
+        *
+        * The reason this calculation is done from the prior checkpoint, not the
+        * one that just finished, is that this behaves better if some checkpoint
+        * cycles are abnormally short, like if you perform a manual checkpoint
+        * right after a timed one. The manual checkpoint will make almost a full
+        * cycle's worth of WAL segments available for recycling, because the
+        * segments from the prior's prior, fully-sized checkpoint cycle are no
+        * longer needed. However, the next checkpoint will make only few segments
+        * available for recycling, the ones generated between the timed
+        * checkpoint and the manual one right after that. If at the manual
+        * checkpoint we only retained enough segments to get us to the next timed
+        * one, and removed the rest, then at the next checkpoint we would not
+        * have enough segments around for recycling, to get us to the checkpoint
+        * after that. Basing the calculations on the distance from the prior redo
+        * pointer largely fixes that problem.
+        */
+       distance = (2.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
+       /* add 10% for good measure. */
+       distance *= 1.10;
+
+       recycleSegNo = (XLogSegNo) ceil(((double) PriorRedoPtr + distance) / XLOG_SEG_SIZE);
+
+       if (recycleSegNo < minSegNo)
+               recycleSegNo = minSegNo;
+       if (recycleSegNo > maxSegNo)
+               recycleSegNo = maxSegNo;
+
+       return recycleSegNo;
+}
+
  /*
   * Check whether we've consumed enough xlog space that a checkpoint is needed.
   *
@@ -2765,7 +2860,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
         char            zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF];
         char       *zbuffer;
         XLogSegNo       installed_segno;
-       int                     max_advance;
+       XLogSegNo       max_segno;
         int                     fd;
         int                     nbytes;
  
@@ -2868,9 +2963,19 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
          * pre-create a future log segment.
          */
         installed_segno = logsegno;
-       max_advance = XLOGfileslop;
+
+       /*
+        * XXX: What should we use as max_segno? We used to use XLOGfileslop when
+        * that was a constant, but that was always a bit dubious: normally, at a
+        * checkpoint, XLOGfileslop was the offset from the checkpoint record,
+        * but here, it was the offset from the insert location. We can't do the
+        * normal XLOGfileslop calculation here because we don't have access to
+        * the prior checkpoint's redo location. So somewhat arbitrarily, just
+        * use CheckPointSegments.
+        */
+       max_segno = logsegno + CheckPointSegments;
         if (!InstallXLogFileSegment(&installed_segno, tmppath,
-                                                               *use_existent, &max_advance,
+                                                               *use_existent, max_segno,
                                                                 use_lock))
         {
                 /*
@@ -3011,7 +3116,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
         /*
          * Now move the segment into place with its final name.
          */
-       if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
+       if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
                 elog(ERROR, "InstallXLogFileSegment should not have failed");
  }
  
@@ -3031,22 +3136,21 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
   * number at or after the passed numbers.  If FALSE, install the new segment
   * exactly where specified, deleting any existing segment file there.
   *
- * *max_advance: maximum number of segno slots to advance past the starting
- * point.  Fail if no free slot is found in this range.  On return, reduced
- * by the number of slots skipped over.  (Irrelevant, and may be NULL,
- * when find_free is FALSE.)
+ * max_segno: maximum segment number to install the new file as.  Fail if no
+ * free slot is found between *segno and max_segno. (Ignored when find_free
+ * is FALSE.)
   *
   * use_lock: if TRUE, acquire ControlFileLock while moving file into
   * place.  This should be TRUE except during bootstrap log creation.  The
   * caller must *not* hold the lock at call.
   *
   * Returns TRUE if the file was installed successfully.  FALSE indicates that
- * max_advance limit was exceeded, or an error occurred while renaming the
+ * max_segno limit was exceeded, or an error occurred while renaming the
   * file into place.
   */
  static bool
  InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
-                                          bool find_free, int *max_advance,
+                                          bool find_free, XLogSegNo max_segno,
                                            bool use_lock)
  {
         char            path[MAXPGPATH];
@@ -3070,7 +3174,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
                 /* Find a free slot to put it in */
                 while (stat(path, &stat_buf) == 0)
                 {
-                       if (*max_advance <= 0)
+                       if ((*segno) >= max_segno)
                         {
                                 /* Failed to find a free slot within specified range */
                                 if (use_lock)
@@ -3078,7 +3182,6 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
                                 return false;
                         }
                         (*segno)++;
-                       (*max_advance)--;
                         XLogFilePath(path, ThisTimeLineID, *segno);
                 }
         }
@@ -3426,14 +3529,15 @@ UpdateLastRemovedPtr(char *filename)
  /*
   * Recycle or remove all log files older or equal to passed segno
   *
- * endptr is current (or recent) end of xlog; this is used to determine
+ * endptr is current (or recent) end of xlog, and PriorRedoPtr is the
+ * redo pointer of the previous checkpoint. These are used to determine
   * whether we want to recycle rather than delete no-longer-wanted log files.
   */
  static void
-RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
+RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
  {
         XLogSegNo       endlogSegNo;
-       int                     max_advance;
+       XLogSegNo       recycleSegNo;
         DIR                *xldir;
         struct dirent *xlde;
         char            lastoff[MAXFNAMELEN];
@@ -3445,11 +3549,10 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
         struct stat statbuf;
  
         /*
-        * Initialize info about where to try to recycle to.  We allow recycling
-        * segments up to XLOGfileslop segments beyond the current XLOG location.
+        * Initialize info about where to try to recycle to.
          */
         XLByteToPrevSeg(endptr, endlogSegNo);
-       max_advance = XLOGfileslop;
+       recycleSegNo = XLOGfileslop(PriorRedoPtr);
  
         xldir = AllocateDir(XLOGDIR);
         if (xldir == NULL)
@@ -3498,20 +3601,17 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
                                  * for example can create symbolic links pointing to a
                                  * separate archive directory.
                                  */
-                               if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
+                               if (endlogSegNo <= recycleSegNo &&
+                                       lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
                                         InstallXLogFileSegment(&endlogSegNo, path,
-                                                                                  true, &max_advance, true))
+                                                                                  true, recycleSegNo, true))
                                 {
                                         ereport(DEBUG2,
                                                         (errmsg("recycled transaction log file \"%s\"",
                                                                         xlde->d_name)));
                                         CheckpointStats.ckpt_segs_recycled++;
                                         /* Needn't recheck that slot on future iterations */
-                                       if (max_advance > 0)
-                                       {
-                                               endlogSegNo++;
-                                               max_advance--;
-                                       }
+                                       endlogSegNo++;
                                 }
                                 else
                                 {
@@ -7594,7 +7694,8 @@ LogCheckpointEnd(bool restartpoint)
         elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
                  "%d transaction log file(s) added, %d removed, %d recycled; "
                  "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
-                "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
+                "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
+                "distance=%d kB, estimate=%d kB",
                  restartpoint ? "restartpoint" : "checkpoint",
                  CheckpointStats.ckpt_bufs_written,
                  (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
@@ -7606,7 +7707,48 @@ LogCheckpointEnd(bool restartpoint)
                  total_secs, total_usecs / 1000,
                  CheckpointStats.ckpt_sync_rels,
                  longest_secs, longest_usecs / 1000,
-                average_secs, average_usecs / 1000);
+                average_secs, average_usecs / 1000,
+                (int) (PrevCheckPointDistance / 1024.0),
+                (int) (CheckPointDistanceEstimate / 1024.0));
+}
+
+/*
+ * Update the estimate of distance between checkpoints.
+ *
+ * The estimate is used to calculate the number of WAL segments to keep
+ * preallocated, see XLOGfileslop().
+ */
+static void
+UpdateCheckPointDistanceEstimate(uint64 nbytes)
+{
+       /*
+        * To estimate the number of segments consumed between checkpoints, keep
+        * a moving average of the amount of WAL generated in previous checkpoint
+        * cycles. However, if the load is bursty, with quiet periods and busy
+        * periods, we want to cater for the peak load. So instead of a plain
+        * moving average, let the average decline slowly if the previous cycle
+        * used less WAL than estimated, but bump it up immediately if it used
+        * more.
+        *
+        * When checkpoints are triggered by max_wal_size, this should converge to
+        * CheckPointSegments * XLOG_SEG_SIZE.
+        *
+        * Note: This doesn't pay any attention to what caused the checkpoint.
+        * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
+        * starting a base backup, are counted the same as those created
+        * automatically. The slow-decline will largely mask them out, if they are
+        * not frequent. If they are frequent, it seems reasonable to count them
+        * in as any others; if you issue a manual checkpoint every 5 minutes and
+        * never let a timed checkpoint happen, it makes sense to base the
+        * preallocation on that 5 minute interval rather than whatever
+        * checkpoint_timeout is set to.
+        */
+       PrevCheckPointDistance = nbytes;
+       if (CheckPointDistanceEstimate < nbytes)
+               CheckPointDistanceEstimate = nbytes;
+       else
+               CheckPointDistanceEstimate =
+                       (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
  }
  
  /*
@@ -7646,7 +7788,7 @@ CreateCheckPoint(int flags)
         XLogRecPtr      recptr;
         XLogCtlInsert *Insert = &XLogCtl->Insert;
         uint32          freespace;
-       XLogSegNo       _logSegNo;
+       XLogRecPtr      PriorRedoPtr;
         XLogRecPtr      curInsert;
         VirtualTransactionId *vxids;
         int                     nvxids;
@@ -7961,10 +8103,10 @@ CreateCheckPoint(int flags)
                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
  
         /*
-        * Select point at which we can truncate the log, which we base on the
-        * prior checkpoint's earliest info.
+        * Remember the prior checkpoint's redo pointer, used later to determine
+        * the point where the log can be truncated.
          */
-       XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
+       PriorRedoPtr = ControlFile->checkPointCopy.redo;
  
         /*
          * Update the control file.
@@ -8019,11 +8161,17 @@ CreateCheckPoint(int flags)
          * Delete old log files (those no longer needed even for previous
          * checkpoint or the standbys in XLOG streaming).
          */
-       if (_logSegNo)
+       if (PriorRedoPtr != InvalidXLogRecPtr)
         {
+               XLogSegNo       _logSegNo;
+
+               /* Update the average distance between checkpoints. */
+               UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
+
+               XLByteToSeg(PriorRedoPtr, _logSegNo);
                 KeepLogSeg(recptr, &_logSegNo);
                 _logSegNo--;
-               RemoveOldXlogFiles(_logSegNo, recptr);
+               RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, recptr);
         }
  
         /*
@@ -8191,7 +8339,7 @@ CreateRestartPoint(int flags)
  {
         XLogRecPtr      lastCheckPointRecPtr;
         CheckPoint      lastCheckPoint;
-       XLogSegNo       _logSegNo;
+       XLogRecPtr      PriorRedoPtr;
         TimestampTz xtime;
  
         /*
@@ -8256,14 +8404,14 @@ CreateRestartPoint(int flags)
         /*
          * Update the shared RedoRecPtr so that the startup process can calculate
          * the number of segments replayed since last restartpoint, and request a
-        * restartpoint if it exceeds checkpoint_segments.
+        * restartpoint if it exceeds CheckPointSegments.
          *
          * Like in CreateCheckPoint(), hold off insertions to update it, although
          * during recovery this is just pro forma, because no WAL insertions are
          * happening.
          */
         WALInsertLockAcquireExclusive();
-       XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
+       RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
         WALInsertLockRelease();
  
         /* Also update the info_lck-protected copy */
@@ -8287,10 +8435,10 @@ CreateRestartPoint(int flags)
         CheckPointGuts(lastCheckPoint.redo, flags);
  
         /*
-        * Select point at which we can truncate the xlog, which we base on the
-        * prior checkpoint's earliest info.
+        * Remember the prior checkpoint's redo pointer, used later to determine
+        * the point at which we can truncate the log.
          */
-       XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
+       PriorRedoPtr = ControlFile->checkPointCopy.redo;
  
         /*
          * Update pg_control, using current time.  Check that it still shows
@@ -8317,12 +8465,18 @@ CreateRestartPoint(int flags)
          * checkpoint/restartpoint) to prevent the disk holding the xlog from
          * growing full.
          */
-       if (_logSegNo)
+       if (PriorRedoPtr != InvalidXLogRecPtr)
         {
                 XLogRecPtr      receivePtr;
                 XLogRecPtr      replayPtr;
                 TimeLineID      replayTLI;
                 XLogRecPtr      endptr;
+               XLogSegNo       _logSegNo;
+
+               /* Update the average distance between checkpoints/restartpoints. */
+               UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
+
+               XLByteToSeg(PriorRedoPtr, _logSegNo);
  
                 /*
                  * Get the current end of xlog replayed or received, whichever is
@@ -8351,7 +8505,7 @@ CreateRestartPoint(int flags)
                 if (RecoveryInProgress())
                         ThisTimeLineID = replayTLI;
  
-               RemoveOldXlogFiles(_logSegNo, endptr);
+               RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, endptr);
  
                 /*
                  * Make more log segments if needed.  (Do this after recycling old log
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c

index cfad08d5528e992191c89139278ed4c159dad3ef..0dce6a8ffaa3f23a88bc55b22c0cbc98fff13268 100644 (file)
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -471,7 +471,7 @@ CheckpointerMain(void)
                                 "checkpoints are occurring too frequently (%d seconds apart)",
                                                                            elapsed_secs,
                                                                            elapsed_secs),
-                                                errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
+                                                errhint("Consider increasing the configuration parameter \"max_wal_size\".")));
  
                         /*
                          * Initialize checkpointer-private variables used during
@@ -749,11 +749,11 @@ IsCheckpointOnSchedule(double progress)
                 return false;
  
         /*
-        * Check progress against WAL segments written and checkpoint_segments.
+        * Check progress against WAL segments written and CheckPointSegments.
          *
          * We compare the current WAL insert location against the location
          * computed before calling CreateCheckPoint. The code in XLogInsert that
-        * actually triggers a checkpoint when checkpoint_segments is exceeded
+        * actually triggers a checkpoint when CheckPointSegments is exceeded
          * compares against RedoRecptr, so this is not completely accurate.
          * However, it's good enough for our purposes, we're only calculating an
          * estimate anyway.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c

index 2499bee7399c50dc7c5d129bd10258e84b7a97b7..d84dba7732ee78fdbe4ac96af0cef07ca2efcc4b 100644 (file)
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -685,6 +685,9 @@ typedef struct
  #if XLOG_BLCKSZ < 1024 || XLOG_BLCKSZ > (1024*1024)
  #error XLOG_BLCKSZ must be between 1KB and 1MB
  #endif
+#if XLOG_SEG_SIZE < (1024*1024) || XLOG_SEG_SIZE > (1024*1024*1024)
+#error XLOG_SEG_SIZE must be between 1MB and 1GB
+#endif
  
  static const char *memory_units_hint =
         gettext_noop("Valid units for this parameter are \"kB\", \"MB\", \"GB\", and \"TB\".");
@@ -706,6 +709,11 @@ static const unit_conversion memory_unit_conversion_table[] =
         { "MB",         GUC_UNIT_XBLOCKS,       1024 / (XLOG_BLCKSZ / 1024) },
         { "kB",         GUC_UNIT_XBLOCKS,       -(XLOG_BLCKSZ / 1024) },
  
+       { "TB",         GUC_UNIT_XSEGS,         (1024*1024*1024) / (XLOG_SEG_SIZE / 1024) },
+       { "GB",         GUC_UNIT_XSEGS,         (1024*1024) / (XLOG_SEG_SIZE / 1024) },
+       { "MB",         GUC_UNIT_XSEGS,         -(XLOG_SEG_SIZE / (1024 * 1024)) },
+       { "kB",         GUC_UNIT_XSEGS,         -(XLOG_SEG_SIZE / 1024) },
+
         { "" }          /* end of table marker */
  };
  
@@ -2146,15 +2154,27 @@ static struct config_int ConfigureNamesInt[] =
         },
  
         {
-               {"checkpoint_segments", PGC_SIGHUP, WAL_CHECKPOINTS,
-                       gettext_noop("Sets the maximum distance in log segments between automatic WAL checkpoints."),
-                       NULL
+               {"min_wal_size", PGC_SIGHUP, WAL_CHECKPOINTS,
+                       gettext_noop("Sets the minimum size to shrink the WAL to."),
+                       NULL,
+                       GUC_UNIT_XSEGS
                 },
-               &CheckPointSegments,
-               3, 1, INT_MAX,
+               &min_wal_size,
+               5, 2, INT_MAX,
                 NULL, NULL, NULL
         },
  
+       {
+               {"max_wal_size", PGC_SIGHUP, WAL_CHECKPOINTS,
+                       gettext_noop("Sets the WAL size that triggers a checkpoint."),
+                       NULL,
+                       GUC_UNIT_XSEGS
+               },
+               &max_wal_size,
+               8, 2, INT_MAX,
+               NULL, assign_max_wal_size, NULL
+       },
+
         {
                 {"checkpoint_timeout", PGC_SIGHUP, WAL_CHECKPOINTS,
                         gettext_noop("Sets the maximum time between automatic WAL checkpoints."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample

index 29d8485964d696cccc0d45e63c1644485973b1f1..f8f9ce18eca30803f16e5b75ac915436e0a91a16 100644 (file)
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -197,8 +197,9 @@
  
  # - Checkpoints -
  
-#checkpoint_segments = 3               # in logfile segments, min 1, 16MB each
  #checkpoint_timeout = 5min             # range 30s-1h
+#max_wal_size = 128MB                  # a bare number is in logfile segments
+#min_wal_size = 80MB
  #checkpoint_completion_target = 0.5    # checkpoint target duration, 0.0 - 1.0
  #checkpoint_warning = 30s              # 0 disables
  
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h

index be27a85648665ba7aaa9f4d61c516feaf7dcb675..0e8e5873cc20c57323e866e6b2e57ab41703fa65 100644 (file)
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -89,7 +89,8 @@ extern XLogRecPtr XactLastRecEnd;
  extern bool reachedConsistency;
  
  /* these variables are GUC parameters related to XLOG */
-extern int     CheckPointSegments;
+extern int     min_wal_size;
+extern int     max_wal_size;
  extern int     wal_keep_segments;
  extern int     XLOGbuffers;
  extern int     XLogArchiveTimeout;
@@ -101,6 +102,8 @@ extern bool fullPageWrites;
  extern bool wal_log_hints;
  extern bool log_checkpoints;
  
+extern int     CheckPointSegments;
+
  /* WAL levels */
  typedef enum WalLevel
  {
@@ -246,6 +249,9 @@ extern bool CheckPromoteSignal(void);
  extern void WakeupRecovery(void);
  extern void SetWalWriterSleeping(bool sleeping);
  
+extern void assign_max_wal_size(int newval, void *extra);
+extern void assign_checkpoint_completion_target(double newval, void *extra);
+
  /*
   * Starting/stopping a base backup
   */
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h

index 22d3a6faea40b18993560f7572b2f6e0827650c9..d3100d1781ff0f49ca7da081062927be730c59e9 100644 (file)
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -207,6 +207,7 @@ typedef enum
  #define GUC_UNIT_KB                            0x1000  /* value is in kilobytes */
  #define GUC_UNIT_BLOCKS                        0x2000  /* value is in blocks */
  #define GUC_UNIT_XBLOCKS               0x3000  /* value is in xlog blocks */
+#define GUC_UNIT_XSEGS                 0x4000  /* value is in xlog segments */
  #define GUC_UNIT_MEMORY                        0xF000  /* mask for KB, BLOCKS, XBLOCKS */
  
  #define GUC_UNIT_MS                       0x10000      /* value is in milliseconds */
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Mon, 23 Feb 2015 16:53:02 +0000 (18:53 +0200)
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Mon, 23 Feb 2015 16:53:02 +0000 (18:53 +0200)
doc/src/sgml/config.sgml		patch \| blob \| history
doc/src/sgml/perform.sgml		patch \| blob \| history
doc/src/sgml/wal.sgml		patch \| blob \| history
src/backend/access/transam/xlog.c		patch \| blob \| history
src/backend/postmaster/checkpointer.c		patch \| blob \| history
src/backend/utils/misc/guc.c		patch \| blob \| history
src/backend/utils/misc/postgresql.conf.sample		patch \| blob \| history
src/include/access/xlog.h		patch \| blob \| history
src/include/utils/guc.h		patch \| blob \| history