From 5d2b45e3f78a85639f30431181c06d4c3221c5a1 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Mon, 23 Feb 2015 20:55:17 +0900 Subject: [PATCH] Add GUC to control the time to wait before retrieving WAL after failed attempt. Previously, when the standby server failed to retrieve WAL files from any sources (i.e., streaming replication, local pg_xlog directory or WAL archive), it always waited for five seconds (hard-coded) before the next attempt. For example, this is problematic in warm-standby because restore_command can fail every five seconds even while a new WAL file is expected to be unavailable for a long time and flood the log files with its error messages. This commit adds a new parameter, wal_retrieve_retry_interval, to control that wait time. Alexey Vasiliev and Michael Paquier, reviewed by Andres Freund and me. --- doc/src/sgml/config.sgml | 18 +++++++++ src/backend/access/transam/xlog.c | 37 ++++++++++++------- src/backend/utils/misc/guc.c | 12 ++++++ src/backend/utils/misc/postgresql.conf.sample | 2 + src/include/access/xlog.h | 1 + 5 files changed, 57 insertions(+), 13 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 6bcb106518..a3917aac78 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2985,6 +2985,24 @@ include_dir 'conf.d' + + wal_retrieve_retry_interval (integer) + + wal_retrieve_retry_interval configuration parameter + + + + + Specifies how long the standby server should wait when WAL data is not + available from any sources (streaming replication, + local pg_xlog or WAL archive) before retrying to + retrieve WAL data. This parameter can only be set in the + postgresql.conf file or on the server command line. + The default value is 5 seconds. Units are milliseconds if not specified. 
+ + + + diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 629a457965..f68f82b255 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -93,6 +93,7 @@ int sync_method = DEFAULT_SYNC_METHOD; int wal_level = WAL_LEVEL_MINIMAL; int CommitDelay = 0; /* precommit delay in microseconds */ int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ +int wal_retrieve_retry_interval = 5000; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -10340,8 +10341,8 @@ static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr) { - static pg_time_t last_fail_time = 0; - pg_time_t now; + static TimestampTz last_fail_time = 0; + TimestampTz now; /*------- * Standby mode is implemented by a state machine: @@ -10351,7 +10352,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, * 2. Check trigger file * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM) * 4. Rescan timelines - * 5. Sleep 5 seconds, and loop back to 1. + * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1. * * Failure to read from the current source advances the state machine to * the next state. @@ -10490,14 +10491,25 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, * machine, so we've exhausted all the options for * obtaining the requested WAL. We're going to loop back * and retry from the archive, but if it hasn't been long - * since last attempt, sleep 5 seconds to avoid - * busy-waiting. + * since last attempt, sleep wal_retrieve_retry_interval + * milliseconds to avoid busy-waiting. 
*/ - now = (pg_time_t) time(NULL); - if ((now - last_fail_time) < 5) + now = GetCurrentTimestamp(); + if (!TimestampDifferenceExceeds(last_fail_time, now, + wal_retrieve_retry_interval)) { - pg_usleep(1000000L * (5 - (now - last_fail_time))); - now = (pg_time_t) time(NULL); + long secs, wait_time; + int usecs; + + TimestampDifference(last_fail_time, now, &secs, &usecs); + wait_time = wal_retrieve_retry_interval - + (secs * 1000 + usecs / 1000); + + WaitLatch(&XLogCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + wait_time); + ResetLatch(&XLogCtl->recoveryWakeupLatch); + now = GetCurrentTimestamp(); } last_fail_time = now; currentSource = XLOG_FROM_ARCHIVE; @@ -10653,12 +10665,11 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, } /* - * Wait for more WAL to arrive. Time out after 5 seconds, - * like when polling the archive, to react to a trigger - * file promptly. + * Wait for more WAL to arrive. Time out after 5 seconds + * to react to a trigger file promptly. 
*/ WaitLatch(&XLogCtl->recoveryWakeupLatch, - WL_LATCH_SET | WL_TIMEOUT, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, 5000L); ResetLatch(&XLogCtl->recoveryWakeupLatch); break; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 95727776d3..cf401d3cf0 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2363,6 +2363,18 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"wal_retrieve_retry_interval", PGC_SIGHUP, REPLICATION_STANDBY, + gettext_noop("Sets the time to wait before retrying to retrieve WAL " + "after a failed attempt."), + NULL, + GUC_UNIT_MS + }, + &wal_retrieve_retry_interval, + 5000, 1, INT_MAX, + NULL, NULL, NULL + }, + { {"wal_segment_size", PGC_INTERNAL, PRESET_OPTIONS, gettext_noop("Shows the number of pages per write ahead log segment."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index b053659f88..29d8485964 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -260,6 +260,8 @@ #wal_receiver_timeout = 60s # time that receiver waits for # communication from master # in milliseconds; 0 disables +#wal_retrieve_retry_interval = 5s # time to wait before retrying to + # retrieve WAL after a failed attempt #------------------------------------------------------------------------------ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 138deaf7c8..be27a85648 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -93,6 +93,7 @@ extern int CheckPointSegments; extern int wal_keep_segments; extern int XLOGbuffers; extern int XLogArchiveTimeout; +extern int wal_retrieve_retry_interval; extern bool XLogArchiveMode; extern char *XLogArchiveCommand; extern bool EnableHotStandby; -- 2.40.0