Allow time delayed standbys and recovery

author Simon Riggs <simon@2ndQuadrant.com>

Thu, 12 Dec 2013 10:53:20 +0000 (10:53 +0000)

committer Simon Riggs <simon@2ndQuadrant.com>

Thu, 12 Dec 2013 10:53:20 +0000 (10:53 +0000)
author Simon Riggs <simon@2ndQuadrant.com>
Thu, 12 Dec 2013 10:53:20 +0000 (10:53 +0000)
committer Simon Riggs <simon@2ndQuadrant.com>
Thu, 12 Dec 2013 10:53:20 +0000 (10:53 +0000)
diff --git a/doc/src/sgml/recovery-config.sgml b/doc/src/sgml/recovery-config.sgml

index 9d80256a5568665925b194e4ee737dd4c1ffda8d..ee5dc8687e2ee06b9aa40030a5aac13c2d5cff44 100644 (file)
--- a/doc/src/sgml/recovery-config.sgml
+++ b/doc/src/sgml/recovery-config.sgml
@@ -142,6 +142,56 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"'  # Windows
        </listitem>
       </varlistentry>
  
+     <varlistentry id="min-recovery-apply-delay" xreflabel="min_recovery_apply_delay">
+      <term><varname>min_recovery_apply_delay</varname> (<type>integer</type>)</term>
+      <indexterm>
+        <primary><varname>min_recovery_apply_delay</> recovery parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        By default, a standby server keeps restoring WAL records from the
+        primary as soon as possible. It may be useful to have a time-delayed
+        copy of the data, offering various options to correct data loss errors.
+        This parameter allows you to delay recovery by a fixed period of time,
+        specified in milliseconds if no unit is specified.  For example, if
+        you set this parameter to <literal>5min</literal>, the standby will
+        replay each transaction commit only when the system time on the standby
+        is at least five minutes past the commit time reported by the master.
+       </para>
+       <para>
+        It is possible that the replication delay between servers exceeds the
+        value of this parameter, in which case no delay is added.
+        Note that the delay is calculated between the WAL timestamp as written
+        on master and the time on the current standby. Delays
+        in transfer because of networks or cascading replication configurations
+        may reduce the actual wait time significantly. If the system
+        clocks on master and standby are not synchronised, this may lead to
+        recovery applying records earlier than expected but is not a major issue
+        because the useful settings of the parameter are much larger than
+        typical time deviation between the servers. Be careful to allow for
+        different timezone settings on master and standby.
+       </para>
+       <para>
+        The delay occurs only on WAL records for COMMIT and Restore Points.
+        Other records may be replayed earlier than the specified delay, which
+        is not an issue for MVCC, though it may potentially increase the number
+        of recovery conflicts generated.
+       </para>
+       <para>
+        The delay occurs until the standby is promoted or triggered. After that
+        the standby will end recovery without further waiting.
+       </para>
+       <para>
+        This parameter is intended for use with streaming replication deployments,
+        however, if the parameter is specified it will be honoured in all cases.
+        Synchronous replication is not affected by this setting because there is
+        not yet any setting to request synchronous apply of transaction commits.
+        <varname>hot_standby_feedback</> will be delayed by use of this feature
+        which could lead to bloat on the master; use both together with care.
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
  
    </sect1>
diff --git a/src/backend/access/transam/recovery.conf.sample b/src/backend/access/transam/recovery.conf.sample

index 5acfa572f0eec8d25c85d9627f69306bb938cb7a..673605cfc661d1a1bc69e78f7a79f4c0fbb7ac9e 100644 (file)
--- a/src/backend/access/transam/recovery.conf.sample
+++ b/src/backend/access/transam/recovery.conf.sample
@@ -123,6 +123,15 @@
  #
  #trigger_file = ''
  #
+# By default, a standby server keeps restoring XLOG records from the
+# primary as soon as possible. If you want to explicitly delay the replay of
+# committed transactions from the master, specify a recovery apply delay.
+# For example, if you set this parameter to 5min, the standby will replay
+# each transaction commit only when the system time on the standby is at least
+# five minutes past the commit time reported by the master.
+#
+#min_recovery_apply_delay = 0
+#
  #---------------------------------------------------------------------------
  # HOT STANDBY PARAMETERS
  #---------------------------------------------------------------------------
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c

index 6fa5479c92b84b75819fc702f6a4d35be3ad638a..a76aef37f3d2f6066153fa6080d0183efe224390 100644 (file)
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -218,6 +218,8 @@ static bool recoveryPauseAtTarget = true;
  static TransactionId recoveryTargetXid;
  static TimestampTz recoveryTargetTime;
  static char *recoveryTargetName;
+static int min_recovery_apply_delay = 0;
+static TimestampTz recoveryDelayUntilTime;
  
  /* options taken from recovery.conf for XLOG streaming */
  static bool StandbyModeRequested = false;
@@ -728,8 +730,10 @@ static bool holdingAllSlots = false;
  
  static void readRecoveryCommandFile(void);
  static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
-static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
+static bool recoveryStopsHere(XLogRecord *record, bool *includeThis, bool *delayThis);
  static void recoveryPausesHere(void);
+static void recoveryApplyDelay(void);
+static bool SetRecoveryDelayUntilTime(TimestampTz xtime);
  static void SetLatestXTime(TimestampTz xtime);
  static void SetCurrentChunkStartTime(TimestampTz xtime);
  static void CheckRequiredParameterValues(void);
@@ -5476,6 +5480,19 @@ readRecoveryCommandFile(void)
                                         (errmsg_internal("trigger_file = '%s'",
                                                                          TriggerFile)));
                 }
+               else if (strcmp(item->name, "min_recovery_apply_delay") == 0)
+               {
+                       const char *hintmsg;
+
+                       if (!parse_int(item->value, &min_recovery_apply_delay, GUC_UNIT_MS,
+                                       &hintmsg))
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                errmsg("parameter \"%s\" requires a temporal value", "min_recovery_apply_delay"),
+                                                hintmsg ? errhint("%s", _(hintmsg)) : 0));
+                       ereport(DEBUG2,
+                                       (errmsg("min_recovery_apply_delay = '%s'", item->value)));
+               }
                 else
                         ereport(FATAL,
                                         (errmsg("unrecognized recovery parameter \"%s\"",
@@ -5625,10 +5642,11 @@ exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
   * We also track the timestamp of the latest applied COMMIT/ABORT
   * record in XLogCtl->recoveryLastXTime, for logging purposes.
   * Also, some information is saved in recoveryStopXid et al for use in
- * annotating the new timeline's history file.
+ * annotating the new timeline's history file; and recoveryDelayUntilTime
+ * is updated, for time-delayed standbys.
   */
  static bool
-recoveryStopsHere(XLogRecord *record, bool *includeThis)
+recoveryStopsHere(XLogRecord *record, bool *includeThis, bool *delayThis)
  {
         bool            stopsHere;
         uint8           record_info;
@@ -5645,6 +5663,8 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
  
                 recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
                 recordXtime = recordXactCommitData->xact_time;
+
+               *delayThis = SetRecoveryDelayUntilTime(recordXactCommitData->xact_time);
         }
         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
         {
@@ -5652,6 +5672,8 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
  
                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
                 recordXtime = recordXactCommitData->xact_time;
+
+               *delayThis = SetRecoveryDelayUntilTime(recordXactCommitData->xact_time);
         }
         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
         {
@@ -5659,6 +5681,13 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
  
                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
                 recordXtime = recordXactAbortData->xact_time;
+
+               /*
+                * We deliberately choose not to delay aborts since they have no
+                * effect on MVCC. We already allow replay of records that don't
+                * have a timestamp, so there is already opportunity for issues
+                * caused by early conflicts on standbys.
+                */
         }
         else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
         {
@@ -5667,6 +5696,8 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
                 recordXtime = recordRestorePointData->rp_time;
                 strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
+
+               *delayThis = SetRecoveryDelayUntilTime(recordRestorePointData->rp_time);
         }
         else
                 return false;
@@ -5833,6 +5864,66 @@ SetRecoveryPause(bool recoveryPause)
         SpinLockRelease(&xlogctl->info_lck);
  }
  
+static bool
+SetRecoveryDelayUntilTime(TimestampTz xtime)
+{
+       if (min_recovery_apply_delay != 0)
+       {
+               recoveryDelayUntilTime =
+                       TimestampTzPlusMilliseconds(xtime, min_recovery_apply_delay);
+
+               return true;
+       }
+
+       return false;
+}
+/*
+ * When min_recovery_apply_delay is set, we wait long enough to make sure
+ * certain record types are applied at least that interval behind the master.
+ * See recoveryStopsHere().
+ *
+ * Note that the delay is calculated between the WAL record log time and
+ * the current time on standby. We would prefer to keep track of when this
+ * standby received each WAL record, which would allow a more consistent
+ * approach and one not affected by time synchronisation issues, but that
+ * is significantly more effort and complexity for little actual gain in
+ * usability.
+ */
+static void
+recoveryApplyDelay(void)
+{
+       while (true)
+       {
+               long    secs;
+               int             microsecs;
+
+               ResetLatch(&XLogCtl->recoveryWakeupLatch);
+
+               /* might change the trigger file's location */
+               HandleStartupProcInterrupts();
+
+               if (CheckForStandbyTrigger())
+                       break;
+
+               /*
+                * Wait for difference between GetCurrentTimestamp() and
+                * recoveryDelayUntilTime
+                */
+               TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
+                                                       &secs, &microsecs);
+
+               if (secs <= 0 && microsecs <=0)
+                       break;
+
+               elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
+                       secs, microsecs / 1000);
+
+               WaitLatch(&XLogCtl->recoveryWakeupLatch,
+                                       WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                                       secs * 1000L + microsecs / 1000);
+       }
+}
+
  /*
   * Save timestamp of latest processed commit/abort record.
   *
@@ -6660,6 +6751,7 @@ StartupXLOG(void)
                 {
                         bool            recoveryContinue = true;
                         bool            recoveryApply = true;
+                       bool            recoveryDelay = false;
                         ErrorContextCallback errcallback;
                         TimestampTz xtime;
  
@@ -6719,7 +6811,7 @@ StartupXLOG(void)
                                 /*
                                  * Have we reached our recovery target?
                                  */
-                               if (recoveryStopsHere(record, &recoveryApply))
+                               if (recoveryStopsHere(record, &recoveryApply, &recoveryDelay))
                                 {
                                         if (recoveryPauseAtTarget)
                                         {
@@ -6734,6 +6826,25 @@ StartupXLOG(void)
                                                 break;
                                 }
  
+                               /*
+                                * If we've been asked to lag the master, wait on
+                                * latch until enough time has passed.
+                                */
+                               if (recoveryDelay)
+                               {
+                                       recoveryApplyDelay();
+
+                                       /*
+                                        * We test for paused recovery again here. If
+                                        * user sets delayed apply, it may be because
+                                        * they expect to pause recovery in case of
+                                        * problems, so we must test again here otherwise
+                                        * pausing during the delay-wait wouldn't work.
+                                        */
+                                       if (xlogctl->recoveryPause)
+                                               recoveryPausesHere();
+                               }
+
                                 /* Setup error traceback support for ereport() */
                                 errcallback.callback = rm_redo_error_callback;
                                 errcallback.arg = (void *) record;
author	Simon Riggs <simon@2ndQuadrant.com>
	Thu, 12 Dec 2013 10:53:20 +0000 (10:53 +0000)
committer	Simon Riggs <simon@2ndQuadrant.com>
	Thu, 12 Dec 2013 10:53:20 +0000 (10:53 +0000)
doc/src/sgml/recovery-config.sgml		patch \| blob \| history
src/backend/access/transam/recovery.conf.sample		patch \| blob \| history
src/backend/access/transam/xlog.c		patch \| blob \| history