]> granicus.if.org Git - postgresql/commitdiff
Add GUC to control the time to wait before retrieving WAL after failed attempt.
author Fujii Masao <fujii@postgresql.org>
Mon, 23 Feb 2015 11:55:17 +0000 (20:55 +0900)
committer Fujii Masao <fujii@postgresql.org>
Mon, 23 Feb 2015 11:55:17 +0000 (20:55 +0900)
Previously, when the standby server failed to retrieve WAL files from any source
(i.e., streaming replication, the local pg_xlog directory or the WAL archive), it
always waited for five seconds (hard-coded) before the next attempt. This is
problematic, for example, in a warm standby: restore_command can fail every
five seconds and flood the log files with its error messages, even while a new
WAL file is expected to be unavailable for a long time.

This commit adds a new parameter, wal_retrieve_retry_interval, to control that
wait time.

Alexey Vasiliev and Michael Paquier, reviewed by Andres Freund and me.

doc/src/sgml/config.sgml
src/backend/access/transam/xlog.c
src/backend/utils/misc/guc.c
src/backend/utils/misc/postgresql.conf.sample
src/include/access/xlog.h

index 6bcb106518e4eefe33c9b69129d737f93a1646be..a3917aac7855bc3c71f7e6bb6b3c78078d981314 100644 (file)
@@ -2985,6 +2985,24 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-wal-retrieve-retry-interval" xreflabel="wal_retrieve_retry_interval">
+      <term><varname>wal_retrieve_retry_interval</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>wal_retrieve_retry_interval</> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies how long the standby server should wait when WAL data is not
+        available from any source (streaming replication,
+        local <filename>pg_xlog</> or WAL archive) before retrying to
+        retrieve WAL data.  This parameter can only be set in the
+        <filename>postgresql.conf</> file or on the server command line.
+        The default value is 5 seconds. Units are milliseconds if not specified.
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
     </sect2>
    </sect1>
index 629a457965ff2e6987bd6ccd5ee0cabb26fe212c..f68f82b255c0608d7a7f958865a2abe3d179911c 100644 (file)
@@ -93,6 +93,7 @@ int                   sync_method = DEFAULT_SYNC_METHOD;
 int                    wal_level = WAL_LEVEL_MINIMAL;
 int                    CommitDelay = 0;        /* precommit delay in microseconds */
 int                    CommitSiblings = 5; /* # concurrent xacts needed to sleep */
+int                    wal_retrieve_retry_interval = 5000;
 
 #ifdef WAL_DEBUG
 bool           XLOG_DEBUG = false;
@@ -10340,8 +10341,8 @@ static bool
 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
                                                        bool fetching_ckpt, XLogRecPtr tliRecPtr)
 {
-       static pg_time_t last_fail_time = 0;
-       pg_time_t       now;
+       static TimestampTz      last_fail_time = 0;
+       TimestampTz     now;
 
        /*-------
         * Standby mode is implemented by a state machine:
@@ -10351,7 +10352,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
         * 2. Check trigger file
         * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
         * 4. Rescan timelines
-        * 5. Sleep 5 seconds, and loop back to 1.
+        * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
         *
         * Failure to read from the current source advances the state machine to
         * the next state.
@@ -10490,14 +10491,25 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
                                         * machine, so we've exhausted all the options for
                                         * obtaining the requested WAL. We're going to loop back
                                         * and retry from the archive, but if it hasn't been long
-                                        * since last attempt, sleep 5 seconds to avoid
-                                        * busy-waiting.
+                                        * since last attempt, sleep wal_retrieve_retry_interval
+                                        * milliseconds to avoid busy-waiting.
                                         */
-                                       now = (pg_time_t) time(NULL);
-                                       if ((now - last_fail_time) < 5)
+                                       now = GetCurrentTimestamp();
+                                       if (!TimestampDifferenceExceeds(last_fail_time, now,
+                                                                                                       wal_retrieve_retry_interval))
                                        {
-                                               pg_usleep(1000000L * (5 - (now - last_fail_time)));
-                                               now = (pg_time_t) time(NULL);
+                                               long            secs, wait_time;
+                                               int                     usecs;
+
+                                               TimestampDifference(last_fail_time, now, &secs, &usecs);
+                                               wait_time = wal_retrieve_retry_interval -
+                                                       (secs * 1000 + usecs / 1000);
+
+                                               WaitLatch(&XLogCtl->recoveryWakeupLatch,
+                                                                 WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                                                                 wait_time);
+                                               ResetLatch(&XLogCtl->recoveryWakeupLatch);
+                                               now = GetCurrentTimestamp();
                                        }
                                        last_fail_time = now;
                                        currentSource = XLOG_FROM_ARCHIVE;
@@ -10653,12 +10665,11 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
                                        }
 
                                        /*
-                                        * Wait for more WAL to arrive. Time out after 5 seconds,
-                                        * like when polling the archive, to react to a trigger
-                                        * file promptly.
+                                        * Wait for more WAL to arrive. Time out after 5 seconds
+                                        * to react to a trigger file promptly.
                                         */
                                        WaitLatch(&XLogCtl->recoveryWakeupLatch,
-                                                         WL_LATCH_SET | WL_TIMEOUT,
+                                                         WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
                                                          5000L);
                                        ResetLatch(&XLogCtl->recoveryWakeupLatch);
                                        break;
index 95727776d3851a2d1a55ca3d3fa824a7cf492bbc..cf401d3cf03ecbb30987151ae0e8784e14033a12 100644 (file)
@@ -2363,6 +2363,18 @@ static struct config_int ConfigureNamesInt[] =
                NULL, NULL, NULL
        },
 
+       {
+               {"wal_retrieve_retry_interval", PGC_SIGHUP, REPLICATION_STANDBY,
+                       gettext_noop("Sets the time to wait before retrying to retrieve WAL "
+                                                "after a failed attempt."),
+                       NULL,
+                       GUC_UNIT_MS
+               },
+               &wal_retrieve_retry_interval,
+               5000, 1, INT_MAX,
+               NULL, NULL, NULL
+       },
+
        {
                {"wal_segment_size", PGC_INTERNAL, PRESET_OPTIONS,
                        gettext_noop("Shows the number of pages per write ahead log segment."),
index b053659f88e85a2aff7b2ddcbb12b0ca4b2897d0..29d8485964d696cccc0d45e63c1644485973b1f1 100644 (file)
 #wal_receiver_timeout = 60s            # time that receiver waits for
                                        # communication from master
                                        # in milliseconds; 0 disables
+#wal_retrieve_retry_interval = 5s      # time to wait before retrying to
+                                       # retrieve WAL after a failed attempt
 
 
 #------------------------------------------------------------------------------
index 138deaf7c8fa393507c59eec49e3b2cbee67d5a3..be27a85648665ba7aaa9f4d61c516feaf7dcb675 100644 (file)
@@ -93,6 +93,7 @@ extern int    CheckPointSegments;
 extern int     wal_keep_segments;
 extern int     XLOGbuffers;
 extern int     XLogArchiveTimeout;
+extern int     wal_retrieve_retry_interval;
 extern bool XLogArchiveMode;
 extern char *XLogArchiveCommand;
 extern bool EnableHotStandby;