granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlog.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <time.h>
  19 #include <fcntl.h>
  20 #include <sys/stat.h>
  21 #include <sys/time.h>
  22 #include <unistd.h>
  23
  24 #include "access/clog.h"
  25 #include "access/multixact.h"
  26 #include "access/subtrans.h"
  27 #include "access/timeline.h"
  28 #include "access/transam.h"
  29 #include "access/tuptoaster.h"
  30 #include "access/twophase.h"
  31 #include "access/xact.h"
  32 #include "access/xlog_internal.h"
  33 #include "access/xlogreader.h"
  34 #include "access/xlogutils.h"
  35 #include "catalog/catversion.h"
  36 #include "catalog/pg_control.h"
  37 #include "catalog/pg_database.h"
  38 #include "miscadmin.h"
  39 #include "pgstat.h"
  40 #include "postmaster/bgwriter.h"
  41 #include "postmaster/startup.h"
  42 #include "replication/walreceiver.h"
  43 #include "replication/walsender.h"
  44 #include "storage/bufmgr.h"
  45 #include "storage/fd.h"
  46 #include "storage/ipc.h"
  47 #include "storage/latch.h"
  48 #include "storage/pmsignal.h"
  49 #include "storage/predicate.h"
  50 #include "storage/proc.h"
  51 #include "storage/procarray.h"
  52 #include "storage/reinit.h"
  53 #include "storage/smgr.h"
  54 #include "storage/spin.h"
  55 #include "utils/builtins.h"
  56 #include "utils/guc.h"
  57 #include "utils/ps_status.h"
  58 #include "utils/relmapper.h"
  59 #include "utils/snapmgr.h"
  60 #include "utils/timestamp.h"
  61 #include "pg_trace.h"
  62
  63 extern uint32 bootstrap_data_checksum_version;
  64
  65 /* File path names (all relative to $PGDATA) */
  66 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  67 #define RECOVERY_COMMAND_DONE   "recovery.done"
  68 #define PROMOTE_SIGNAL_FILE "promote"
  69 #define FAST_PROMOTE_SIGNAL_FILE "fast_promote"
  70
  71
  72 /* User-settable parameters */
  73 int                     CheckPointSegments = 3;
  74 int                     wal_keep_segments = 0;
  75 int                     XLOGbuffers = -1;
  76 int                     XLogArchiveTimeout = 0;
  77 bool            XLogArchiveMode = false;
  78 char       *XLogArchiveCommand = NULL;
  79 bool            EnableHotStandby = false;
  80 bool            fullPageWrites = true;
  81 bool            log_checkpoints = false;
  82 int                     sync_method = DEFAULT_SYNC_METHOD;
  83 int                     wal_level = WAL_LEVEL_MINIMAL;
  84 int                     CommitDelay = 0;        /* precommit delay in microseconds */
  85 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
  86
  87 #ifdef WAL_DEBUG
  88 bool            XLOG_DEBUG = false;
  89 #endif
  90
  91 /*
  92  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
  93  * When we are done with an old XLOG segment file, we will recycle it as a
  94  * future XLOG segment as long as there aren't already XLOGfileslop future
  95  * segments; else we'll delete it.  This could be made a separate GUC
  96  * variable, but at present I think it's sufficient to hardwire it as
  97  * 2*CheckPointSegments+1.      Under normal conditions, a checkpoint will free
  98  * no more than 2*CheckPointSegments log segments, and we want to recycle all
  99  * of them; the +1 allows boundary cases to happen without wasting a
 100  * delete/create-segment cycle.
 101  */
 102 #define XLOGfileslop    (2*CheckPointSegments + 1)
 103
 104
 105 /*
 106  * GUC support
 107  */
  108 const struct config_enum_entry sync_method_options[] = {
              /*
               * One entry per selectable sync method: the user-visible name followed
               * by the matching SYNC_METHOD_* code.  "fsync" is always present; each
               * of the other entries is compiled in only when the macro named in its
               * #ifdef is defined for the build.
               */
  109         {"fsync", SYNC_METHOD_FSYNC, false},
  110 #ifdef HAVE_FSYNC_WRITETHROUGH
  111         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
  112 #endif
  113 #ifdef HAVE_FDATASYNC
  114         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
  115 #endif
  116 #ifdef OPEN_SYNC_FLAG
  117         {"open_sync", SYNC_METHOD_OPEN, false},
  118 #endif
  119 #ifdef OPEN_DATASYNC_FLAG
  120         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
  121 #endif
              /* presumably the end-of-list marker for the GUC machinery -- confirm */
  122         {NULL, 0, false}
  123 };
 124
 125 /*
  126  * Statistics for current checkpoint are collected in this global struct.
  127  * Because only the checkpointer or a stand-alone backend can perform
  128  * checkpoints, this will be unused in normal backends.
 129  */
 130 CheckpointStatsData CheckpointStats;
 131
 132 /*
 133  * ThisTimeLineID will be same in all backends --- it identifies current
 134  * WAL timeline for the database system.
 135  */
 136 TimeLineID      ThisTimeLineID = 0;
 137
 138 /*
 139  * Are we doing recovery from XLOG?
 140  *
 141  * This is only ever true in the startup process; it should be read as meaning
 142  * "this process is replaying WAL records", rather than "the system is in
 143  * recovery mode".  It should be examined primarily by functions that need
 144  * to act differently when called from a WAL redo function (e.g., to skip WAL
 145  * logging).  To check whether the system is in recovery regardless of which
 146  * process you're running in, use RecoveryInProgress() but only after shared
 147  * memory startup and lock initialization.
 148  */
 149 bool            InRecovery = false;
 150
 151 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
 152 HotStandbyState standbyState = STANDBY_DISABLED;
 153
 154 static XLogRecPtr LastRec;
 155
 156 /* Local copy of WalRcv->receivedUpto */
 157 static XLogRecPtr receivedUpto = 0;
 158 static TimeLineID receiveTLI = 0;
 159
 160 /*
 161  * During recovery, lastFullPageWrites keeps track of full_page_writes that
 162  * the replayed WAL records indicate. It's initialized with full_page_writes
 163  * that the recovery starting checkpoint record indicates, and then updated
 164  * each time XLOG_FPW_CHANGE record is replayed.
 165  */
 166 static bool lastFullPageWrites;
 167
 168 /*
 169  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 170  * known, need to check the shared state".
 171  */
 172 static bool LocalRecoveryInProgress = true;
 173
 174 /*
 175  * Local copy of SharedHotStandbyActive variable. False actually means "not
 176  * known, need to check the shared state".
 177  */
 178 static bool LocalHotStandbyActive = false;
 179
 180 /*
 181  * Local state for XLogInsertAllowed():
 182  *              1: unconditionally allowed to insert XLOG
 183  *              0: unconditionally not allowed to insert XLOG
 184  *              -1: must check RecoveryInProgress(); disallow until it is false
 185  * Most processes start with -1 and transition to 1 after seeing that recovery
 186  * is not in progress.  But we can also force the value for special cases.
 187  * The coding in XLogInsertAllowed() depends on the first two of these states
 188  * being numerically the same as bool true and false.
 189  */
 190 static int      LocalXLogInsertAllowed = -1;
 191
 192 /*
 193  * When ArchiveRecoveryRequested is set, archive recovery was requested,
 194  * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
 195  * currently recovering using offline XLOG archives. These variables are only
 196  * valid in the startup process.
 197  *
 198  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 199  * currently performing crash recovery using only XLOG files in pg_xlog, but
 200  * will switch to using offline XLOG archives as soon as we reach the end of
 201  * WAL in pg_xlog.
 202 */
 203 bool            ArchiveRecoveryRequested = false;
 204 bool            InArchiveRecovery = false;
 205
 206 /* Was the last xlog file restored from archive, or local? */
 207 static bool restoredFromArchive = false;
 208
 209 /* options taken from recovery.conf for archive recovery */
 210 char       *recoveryRestoreCommand = NULL;
 211 static char *recoveryEndCommand = NULL;
 212 static char *archiveCleanupCommand = NULL;
 213 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
 214 static bool recoveryTargetInclusive = true;
 215 static bool recoveryPauseAtTarget = true;
 216 static TransactionId recoveryTargetXid;
 217 static TimestampTz recoveryTargetTime;
 218 static char *recoveryTargetName;
 219
 220 /* options taken from recovery.conf for XLOG streaming */
 221 static bool StandbyModeRequested = false;
 222 static char *PrimaryConnInfo = NULL;
 223 static char *TriggerFile = NULL;
 224
 225 /* are we currently in standby mode? */
 226 bool            StandbyMode = false;
 227
 228 /* whether request for fast promotion has been made yet */
 229 static bool fast_promote = false;
 230
 231 /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
 232 static TransactionId recoveryStopXid;
 233 static TimestampTz recoveryStopTime;
 234 static char recoveryStopName[MAXFNAMELEN];
 235 static bool recoveryStopAfter;
 236
 237 /*
 238  * During normal operation, the only timeline we care about is ThisTimeLineID.
 239  * During recovery, however, things are more complicated.  To simplify life
 240  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 241  * scan through the WAL history (that is, it is the line that was active when
 242  * the currently-scanned WAL record was generated).  We also need these
 243  * timeline values:
 244  *
 245  * recoveryTargetTLI: the desired timeline that we want to end in.
 246  *
 247  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 248  *
 249  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
 250  * its known parents, newest first (so recoveryTargetTLI is always the
 251  * first list member).  Only these TLIs are expected to be seen in the WAL
 252  * segments we read, and indeed only these TLIs will be considered as
 253  * candidate WAL files to open at all.
 254  *
 255  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 256  * (This is not necessarily the same as ThisTimeLineID, because we could
 257  * be scanning data that was copied from an ancestor timeline when the current
 258  * file was created.)  During a sequential scan we do not allow this value
 259  * to decrease.
 260  */
 261 static TimeLineID recoveryTargetTLI;
 262 static bool recoveryTargetIsLatest = false;
 263 static List *expectedTLEs;
 264 static TimeLineID curFileTLI;
 265
 266 /*
 267  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 268  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 269  * end+1 of the last record, and is reset when we end a top-level transaction,
 270  * or start a new one; so it can be used to tell if the current transaction has
 271  * created any XLOG records.
 272  */
 273 static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
 274
 275 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
 276
 277 /*
 278  * RedoRecPtr is this backend's local copy of the REDO record pointer
 279  * (which is almost but not quite the same as a pointer to the most recent
 280  * CHECKPOINT record).  We update this from the shared-memory copy,
 281  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 282  * hold the Insert lock).  See XLogInsert for details.  We are also allowed
 283  * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
 284  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 285  * InitXLOGAccess.
 286  */
 287 static XLogRecPtr RedoRecPtr;
 288
 289 /*
 290  * RedoStartLSN points to the checkpoint's REDO location which is specified
 291  * in a backup label file, backup history file or control file. In standby
 292  * mode, XLOG streaming usually starts from the position where an invalid
 293  * record was found. But if we fail to read even the initial checkpoint
 294  * record, we use the REDO location instead of the checkpoint location as
 295  * the start position of XLOG streaming. Otherwise we would have to jump
 296  * backwards to the REDO location after reading the checkpoint record,
 297  * because the REDO record can precede the checkpoint record.
 298  */
 299 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
 300
 301 /*----------
 302  * Shared-memory data structures for XLOG control
 303  *
 304  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 305  * the log up to (all records before that point must be written or fsynced).
 306  * LogwrtResult indicates the byte positions we have already written/fsynced.
 307  * These structs are identical but are declared separately to indicate their
 308  * slightly different functions.
 309  *
 310  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 311  * WALWriteLock.  To update it, you need to hold both locks.  The point of
 312  * this arrangement is that the value can be examined by code that already
 313  * holds WALWriteLock without needing to grab info_lck as well.  In addition
 314  * to the shared variable, each backend has a private copy of LogwrtResult,
 315  * which is updated when convenient.
 316  *
 317  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 318  * (protected by info_lck), but we don't need to cache any copies of it.
 319  *
 320  * info_lck is only held long enough to read/update the protected variables,
 321  * so it's a plain spinlock.  The other locks are held longer (potentially
 322  * over I/O operations), so we use LWLocks for them.  These locks are:
 323  *
 324  * WALInsertLock: must be held to insert a record into the WAL buffers.
 325  *
 326  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 327  * XLogFlush).
 328  *
 329  * ControlFileLock: must be held to read/update control file or create
 330  * new log file.
 331  *
 332  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 333  * only one checkpointer at a time; currently, with all checkpoints done by
 334  * the checkpointer, this is just pro forma).
 335  *
 336  *----------
 337  */
 338
  339 typedef struct XLogwrtRqst
  340 {
              /*
               * Request side of the write/flush bookkeeping described above: byte
               * positions the log needs to be written and/or fsynced up to.
               */
  341         XLogRecPtr      Write;                  /* last byte + 1 to write out */
  342         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
  343 } XLogwrtRqst;
 344
  345 typedef struct XLogwrtResult
  346 {
              /*
               * Result side of the write/flush bookkeeping described above: byte
               * positions that have already been written/fsynced.  Same layout as
               * XLogwrtRqst; kept as a separate type only to mark the different role.
               */
  347         XLogRecPtr      Write;                  /* last byte + 1 written out */
  348         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
  349 } XLogwrtResult;
 350
 351 /*
 352  * Shared state data for XLogInsert.
 353  */
  354 typedef struct XLogCtlInsert
  355 {
              /*
               * This struct is embedded in XLogCtlData as "Insert", where it is marked
               * as protected by WALInsertLock.
               *
               * currpage and currpos describe the same page buffer: the
               * INSERT_FREESPACE() macro computes the bytes left on the page as
               * XLOG_BLCKSZ - (currpos - (char *) currpage).
               */
  356         XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
  357         int                     curridx;                /* current block index in cache */
  358         XLogPageHeader currpage;        /* points to header of block in cache */
  359         char       *currpos;            /* current insertion point in cache */
  360         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
  361         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
  362
  363         /*
  364          * fullPageWrites is the master copy used by all backends to determine
  365          * whether to write full-page to WAL, instead of using process-local one.
  366          * This is required because, when full_page_writes is changed by SIGHUP,
  367          * we must WAL-log it before it actually affects WAL-logging by backends.
  368          * Checkpointer sets at startup or after SIGHUP.
  369          */
  370         bool            fullPageWrites;
  371
  372         /*
  373          * exclusiveBackup is true if a backup started with pg_start_backup() is
  374          * in progress, and nonExclusiveBackups is a counter indicating the number
  375          * of streaming base backups currently in progress. forcePageWrites is set
  376          * to true when either of these is non-zero. lastBackupStart is the latest
  377          * checkpoint redo location used as a starting point for an online backup.
  378          */
  379         bool            exclusiveBackup;
  380         int                     nonExclusiveBackups;
  381         XLogRecPtr      lastBackupStart;
  382 } XLogCtlInsert;
 383
 384 /*
 385  * Shared state data for XLogWrite/XLogFlush.
 386  */
  387 typedef struct XLogCtlWrite
  388 {
              /*
               * This struct is embedded in XLogCtlData as "Write", where it is marked
               * as protected by WALWriteLock.
               */
  389         int                     curridx;                /* cache index of next block to write */
  390         pg_time_t       lastSegSwitchTime;              /* time of last xlog segment switch */
  391 } XLogCtlWrite;
 392
 393 /*
 394  * Total shared-memory state for XLOG.
 395  */
  396 typedef struct XLogCtlData
  397 {
  398         /* Protected by WALInsertLock: */
  399         XLogCtlInsert Insert;
  400
  401         /* Protected by info_lck: */
  402         XLogwrtRqst LogwrtRqst;
  403         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
  404         TransactionId ckptXid;
  405         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
  406         XLogSegNo       lastRemovedSegNo;               /* latest removed/recycled XLOG
  407                                                                                  * segment */
  408
  409         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck */
  410         XLogRecPtr      unloggedLSN;
  411         slock_t         ulsn_lck;
  412
  413         /* Protected by WALWriteLock: */
  414         XLogCtlWrite Write;
  415
  416         /*
  417          * Protected by info_lck and WALWriteLock (you must hold either lock to
  418          * read it, but both to update)
  419          */
  420         XLogwrtResult LogwrtResult;
  421
  422         /*
  423          * These values do not change after startup, although the pointed-to pages
  424          * and xlblocks values certainly do.  Permission to read/write the pages
  425          * and xlblocks values depends on WALInsertLock and WALWriteLock.
               *
               * xlblocks[i] holds the position just past the end of the page in buffer
               * i: INSERT_RECPTR() derives the current insert position by subtracting
               * the page's remaining free space from it.
  426          */
  427         char       *pages;                      /* buffers for unwritten XLOG pages */
  428         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
  429         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
  430
  431         /*
  432          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
  433          * If we created a new timeline when the system was started up,
  434          * PrevTimeLineID is the old timeline's ID that we forked off from.
  435          * Otherwise it's equal to ThisTimeLineID.
  436          */
  437         TimeLineID      ThisTimeLineID;
  438         TimeLineID      PrevTimeLineID;
  439
  440         /*
  441          * archiveCleanupCommand is read from recovery.conf but needs to be in
  442          * shared memory so that the checkpointer process can access it.
  443          */
  444         char            archiveCleanupCommand[MAXPGPATH];
  445
  446         /*
  447          * SharedRecoveryInProgress indicates if we're still in crash or archive
  448          * recovery.  Protected by info_lck.
  449          */
  450         bool            SharedRecoveryInProgress;
  451
  452         /*
  453          * SharedHotStandbyActive indicates whether hot standby is currently
  454          * active.  Protected by info_lck.
               *
               * NOTE(review): this comment used to repeat the SharedRecoveryInProgress
               * text ("still in crash or archive recovery"); confirm the exact meaning
               * against the code that sets the flag.
  455          */
  456         bool            SharedHotStandbyActive;
  457
  458         /*
  459          * WalWriterSleeping indicates whether the WAL writer is currently in
  460          * low-power mode (and hence should be nudged if an async commit occurs).
  461          * Protected by info_lck.
  462          */
  463         bool            WalWriterSleeping;
  464
  465         /*
  466          * recoveryWakeupLatch is used to wake up the startup process to continue
  467          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
  468          * to appear.
  469          */
  470         Latch           recoveryWakeupLatch;
  471
  472         /*
  473          * During recovery, we keep a copy of the latest checkpoint record here.
  474          * Used when a restartpoint is to be created.
  475          *
  476          * Protected by info_lck.
               *
               * NOTE(review): this comment used to name the background writer as the
               * consumer, but the CheckpointLock notes earlier in this file say all
               * checkpoints are done by the checkpointer -- confirm which process
               * reads these fields.
  477          */
  478         XLogRecPtr      lastCheckPointRecPtr;
  479         CheckPoint      lastCheckPoint;
  480
  481         /*
  482          * lastReplayedEndRecPtr points to end+1 of the last record successfully
  483          * replayed. When we're currently replaying a record, ie. in a redo
  484          * function, replayEndRecPtr points to the end+1 of the record being
  485          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
  486          */
  487         XLogRecPtr      lastReplayedEndRecPtr;
  488         TimeLineID      lastReplayedTLI;
  489         XLogRecPtr      replayEndRecPtr;
  490         TimeLineID      replayEndTLI;
  491         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
  492         TimestampTz recoveryLastXTime;
  493         /* current effective recovery target timeline */
  494         TimeLineID      RecoveryTargetTLI;
  495
  496         /*
  497          * timestamp of when we started replaying the current chunk of WAL data,
  498          * only relevant for replication or archive recovery
  499          */
  500         TimestampTz currentChunkStartTime;
  501         /* Are we requested to pause recovery? */
  502         bool            recoveryPause;
  503
  504         /*
  505          * lastFpwDisableRecPtr points to the start of the last replayed
  506          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
  507          */
  508         XLogRecPtr      lastFpwDisableRecPtr;
  509
  510         slock_t         info_lck;               /* locks shared variables shown above */
  511 } XLogCtlData;
 512
 513 static XLogCtlData *XLogCtl = NULL;
 514
 515 /*
 516  * We maintain an image of pg_control in shared memory.
 517  */
 518 static ControlFileData *ControlFile = NULL;
 519
 520 /*
 521  * Macros for managing XLogInsert state.  In most cases, the calling routine
 522  * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx,
 523  * so these are passed as parameters instead of being fetched via XLogCtl.
 524  */
 525
 526 /* Free space remaining in the current xlog page buffer */
 527 #define INSERT_FREESPACE(Insert)  \
 528         (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
 529
 530 /* Construct XLogRecPtr value for current insertion point */
 531 #define INSERT_RECPTR(recptr,Insert,curridx)  \
 532                 (recptr) = XLogCtl->xlblocks[curridx] - INSERT_FREESPACE(Insert)
 533
 534 #define PrevBufIdx(idx)         \
 535                 (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))
 536
 537 #define NextBufIdx(idx)         \
 538                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 539
 540 /*
 541  * Private, possibly out-of-date copy of shared LogwrtResult.
 542  * See discussion above.
 543  */
 544 static XLogwrtResult LogwrtResult = {0, 0};
 545
 546 /*
 547  * Codes indicating where we got a WAL file from during recovery, or where
 548  * to attempt to get one.
 549  */
 550 typedef enum
 551 {
 552         XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
 553         XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
 554         XLOG_FROM_PG_XLOG,                      /* existing file in pg_xlog */
 555         XLOG_FROM_STREAM,                       /* streamed from master */
 556 } XLogSource;
 557
 558 /* human-readable names for XLogSources, for debugging output */
 559 static const char *xlogSourceNames[] = {"any", "archive", "pg_xlog", "stream"};
 560
 561 /*
 562  * openLogFile is -1 or a kernel FD for an open log file segment.
 563  * When it's open, openLogOff is the current seek offset in the file.
 564  * openLogSegNo identifies the segment.  These variables are only
 565  * used to write the XLOG, and so will normally refer to the active segment.
 566  */
 567 static int      openLogFile = -1;
 568 static XLogSegNo openLogSegNo = 0;
 569 static uint32 openLogOff = 0;
 570
 571 /*
 572  * These variables are used similarly to the ones above, but for reading
 573  * the XLOG.  Note, however, that readOff generally represents the offset
 574  * of the page just read, not the seek position of the FD itself, which
 575  * will be just past that page. readLen indicates how much of the current
 576  * page has been read into readBuf, and readSource indicates where we got
 577  * the currently open file from.
 578  */
 579 static int      readFile = -1;
 580 static XLogSegNo readSegNo = 0;
 581 static uint32 readOff = 0;
 582 static uint32 readLen = 0;
 583 static XLogSource readSource = 0;               /* XLOG_FROM_* code */
 584
 585 /*
 586  * Keeps track of which source we're currently reading from. This is
 587  * different from readSource in that this is always set, even when we don't
 588  * currently have a WAL file open. If lastSourceFailed is set, our last
 589  * attempt to read from currentSource failed, and we should try another source
 590  * next.
 591  */
 592 static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
 593 static bool lastSourceFailed = false;
 594
  595 typedef struct XLogPageReadPrivate
  596 {
              /*
               * NOTE(review): presumably the caller-private state made available to
               * XLogPageRead() through the XLogReaderState -- confirm at the point
               * where the reader is set up.
               */
  597         int                     emode;          /* presumably the elog level for read problems -- confirm */
  598         bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
  599         bool            randAccess;     /* cf. the randAccess argument of
                                               * WaitForWALToBecomeAvailable() */
  600 } XLogPageReadPrivate;
 601
 602 /*
 603  * These variables track when we last obtained some WAL data to process,
 604  * and where we got it from.  (XLogReceiptSource is initially the same as
 605  * readSource, but readSource gets reset to zero when we don't have data
 606  * to process right now.  It is also different from currentSource, which
 607  * also changes when we try to read from a source and fail, while
 608  * XLogReceiptSource tracks where we last successfully read some WAL.)
 609  */
 610 static TimestampTz XLogReceiptTime = 0;
 611 static XLogSource XLogReceiptSource = 0;                /* XLOG_FROM_* code */
 612
 613 /* State information for XLOG reading */
 614 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 615 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 616
 617 static XLogRecPtr minRecoveryPoint;             /* local copy of
 618                                                                                  * ControlFile->minRecoveryPoint */
 619 static TimeLineID minRecoveryPointTLI;
 620 static bool updateMinRecoveryPoint = true;
 621
 622 /*
 623  * Have we reached a consistent database state? In crash recovery, we have
 624  * to replay all the WAL, so reachedConsistency is never set. During archive
 625  * recovery, the database is consistent once minRecoveryPoint is reached.
 626  */
 627 bool            reachedConsistency = false;
 628
 629 static bool InRedo = false;
 630
 631 /* Have we launched bgwriter during recovery? */
 632 static bool bgwriterLaunched = false;
 633
 634
 635 static void readRecoveryCommandFile(void);
 636 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
 637 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 638 static void recoveryPausesHere(void);
 639 static void SetLatestXTime(TimestampTz xtime);
 640 static void SetCurrentChunkStartTime(TimestampTz xtime);
 641 static void CheckRequiredParameterValues(void);
 642 static void XLogReportParameters(void);
 643 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
 644                                         TimeLineID prevTLI);
 645 static void LocalSetXLogInsertAllowed(void);
 646 static void CreateEndOfRecoveryRecord(void);
 647 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 648 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 649
 650 static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
 651                                 XLogRecPtr *lsn, BkpBlock *bkpb);
 652 static Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb,
 653                                                  char *blk, bool get_cleanup_lock, bool keep_buffer);
 654 static bool AdvanceXLInsertBuffer(bool new_segment);
 655 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
 656 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
 657 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 658                                            bool find_free, int *max_advance,
 659                                            bool use_lock);
 660 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 661                          int source, bool notexistOk);
 662 static int      XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
 663 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 664                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
 665                          TimeLineID *readTLI);
 666 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 667                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr);
 668 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 669 static void XLogFileClose(void);
 670 static void PreallocXlogFiles(XLogRecPtr endptr);
 671 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
 672 static void UpdateLastRemovedPtr(char *filename);
 673 static void ValidateXLOGDirectoryStructure(void);
 674 static void CleanupBackupHistory(void);
 675 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 676 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 677                    int emode, bool fetching_ckpt);
 678 static void CheckRecoveryConsistency(void);
 679 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
 680                                          XLogRecPtr RecPtr, int whichChkpti, bool report);
 681 static bool rescanLatestTimeLine(void);
 682 static void WriteControlFile(void);
 683 static void ReadControlFile(void);
 684 static char *str_time(pg_time_t tnow);
 685 static bool CheckForStandbyTrigger(void);
 686
 687 #ifdef WAL_DEBUG
 688 static void xlog_outrec(StringInfo buf, XLogRecord *record);
 689 #endif
 690 static void pg_start_backup_callback(int code, Datum arg);
 691 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 692                                   bool *backupEndRequired, bool *backupFromStandby);
 693 static void rm_redo_error_callback(void *arg);
 694 static int      get_sync_bit(int method);
 695
 696
 697 /*
 698  * Insert an XLOG record having the specified RMID and info bytes,
 699  * with the body of the record being the data chunk(s) described by
 700  * the rdata chain (see xlog.h for notes about rdata).
 701  *
 702  * Returns XLOG pointer to end of record (beginning of next record).
 703  * This can be used as LSN for data pages affected by the logged action.
 704  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 705  * before the data page can be written out.  This implements the basic
 706  * WAL rule "write the log before the data".)
 707  *
      * Special cases handled by the code below:
      *	- Any XLR_INFO_MASK bit set in the caller's info is a PANIC; this
      *	  routine ORs in XLR_BKP_BLOCK(N) bits itself for backed-up buffers.
      *	- If XLogInsertAllowed() returns false, an ERROR is raised.
      *	- In bootstrap mode, records whose rmid is not RM_XLOG_ID are not
      *	  logged at all; SizeOfXLogLongPHD is returned as a phony pointer.
      *	- An XLOG_SWITCH record requested exactly at the start of a segment is
      *	  not inserted; WAL is flushed through the end of the prior segment and
      *	  that end address is returned.
      *	- A total data length of zero is a PANIC, except for XLOG_SWITCH.
      *
      * On a normal insert, ProcLastRecPtr and Insert->PrevRecord are set to the
      * start of the new record and XactLastRecEnd to the returned pointer.
      *
 708  * NB: this routine feels free to scribble on the XLogRecData structs,
 709  * though not on the data they reference.  This is OK since the XLogRecData
 710  * structs are always just temporaries in the calling code.
 711  */
 712 XLogRecPtr
 713 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 714 {
 715         XLogCtlInsert *Insert = &XLogCtl->Insert;
 716         XLogRecPtr      RecPtr;
 717         XLogRecPtr      WriteRqst;
 718         uint32          freespace;
 719         int                     curridx;
 720         XLogRecData *rdt;
 721         XLogRecData *rdt_lastnormal;
 722         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
 723         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 724         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 725         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
 726         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
 727         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
 728         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
 729         XLogRecData hdr_rdt;
 730         pg_crc32        rdata_crc;
 731         uint32          len,
 732                                 write_len;
 733         unsigned        i;
 734         bool            updrqst;
 735         bool            doPageWrites;
 736         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 737         uint8           info_orig = info;	/* restored if we loop back to begin */
 738         static XLogRecord *rechdr;
 739
             /*
              * The record header scratch area is obtained with malloc() on the
              * first call, zeroed, and kept in a static pointer so that later calls
              * reuse it.
              */
 740         if (rechdr == NULL)
 741         {
 742                 rechdr = malloc(SizeOfXLogRecord);
 743                 if (rechdr == NULL)
 744                         elog(ERROR, "out of memory");
 745                 MemSet(rechdr, 0, SizeOfXLogRecord);
 746         }
 747
 748         /* cross-check on whether we should be here or not */
 749         if (!XLogInsertAllowed())
 750                 elog(ERROR, "cannot make new WAL entries during recovery");
 751
 752         /* info's high bits are reserved for use by me */
 753         if (info & XLR_INFO_MASK)
 754                 elog(PANIC, "invalid xlog info mask %02X", info);
 755
 756         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
 757
 758         /*
 759          * In bootstrap mode, we don't actually log anything but XLOG resources;
 760          * return a phony record pointer.
 761          */
 762         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 763         {
 764                 RecPtr = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 765                 return RecPtr;
 766         }
 767
 768         /*
 769          * Here we scan the rdata chain, to determine which buffers must be backed
 770          * up.
 771          *
 772          * We may have to loop back to here if a race condition is detected below.
 773          * We could prevent the race by doing all this work while holding the
 774          * insert lock, but it seems better to avoid doing CRC calculations while
 775          * holding the lock.
 776          *
 777          * We add entries for backup blocks to the chain, so that they don't need
 778          * any special treatment in the critical section where the chunks are
 779          * copied into the WAL buffers. Those entries have to be unlinked from the
 780          * chain if we have to loop back here.
 781          */
 782 begin:;
 783         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 784         {
 785                 dtbuf[i] = InvalidBuffer;
 786                 dtbuf_bkp[i] = false;
 787         }
 788
 789         /*
 790          * Decide if we need to do full-page writes in this XLOG record: true if
 791          * full_page_writes is on or we have a PITR request for it.  Since we
 792          * don't yet have the insert lock, fullPageWrites and forcePageWrites
 793          * could change under us, but we'll recheck them once we have the lock.
 794          */
 795         doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
 796
             /*
              * Walk the chain, summing the length of the data that will be logged
              * as regular record data.  Each distinct buffer gets a dtbuf[] slot in
              * order of first appearance; data attached to a buffer that will be
              * backed up is dropped from the chain entry (data = NULL, len = 0).
              * The loop leaves rdt pointing at the last chain entry.
              */
 797         len = 0;
 798         for (rdt = rdata;;)
 799         {
 800                 if (rdt->buffer == InvalidBuffer)
 801                 {
 802                         /* Simple data, just include it */
 803                         len += rdt->len;
 804                 }
 805                 else
 806                 {
 807                         /* Find info for buffer */
 808                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 809                         {
 810                                 if (rdt->buffer == dtbuf[i])
 811                                 {
 812                                         /* Buffer already referenced by earlier chain item */
 813                                         if (dtbuf_bkp[i])
 814                                         {
 815                                                 rdt->data = NULL;
 816                                                 rdt->len = 0;
 817                                         }
 818                                         else if (rdt->data)
 819                                                 len += rdt->len;
 820                                         break;
 821                                 }
 822                                 if (dtbuf[i] == InvalidBuffer)
 823                                 {
 824                                         /* OK, put it in this slot */
 825                                         dtbuf[i] = rdt->buffer;
 826                                         if (doPageWrites && XLogCheckBuffer(rdt, true,
 827                                                                                    &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 828                                         {
 829                                                 dtbuf_bkp[i] = true;
 830                                                 rdt->data = NULL;
 831                                                 rdt->len = 0;
 832                                         }
 833                                         else if (rdt->data)
 834                                                 len += rdt->len;
 835                                         break;
 836                                 }
 837                         }
 838                         if (i >= XLR_MAX_BKP_BLOCKS)
 839                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 840                                          XLR_MAX_BKP_BLOCKS);
 841                 }
 842                 /* Break out of loop when rdt points to last chain item */
 843                 if (rdt->next == NULL)
 844                         break;
 845                 rdt = rdt->next;
 846         }
 847
 848         /*
 849          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 850          * error checking in ReadRecord.  This means that all callers of
 851          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 852          * make an exception for XLOG SWITCH records because we don't want them to
 853          * ever cross a segment boundary.
 854          */
 855         if (len == 0 && !isLogSwitch)
 856                 elog(PANIC, "invalid xlog record length %u", len);
 857
 858         /*
 859          * Make additional rdata chain entries for the backup blocks, so that we
 860          * don't need to special-case them in the write loop.  This modifies the
 861          * original rdata chain, but we keep a pointer to the last regular entry,
 862          * rdt_lastnormal, so that we can undo this if we have to loop back to the
 863          * beginning.
 864          *
 865          * At the exit of this loop, write_len includes the backup block data.
 866          *
 867          * Also set the appropriate info bits to show which buffers were backed
 868          * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer
 869          * value (ignoring InvalidBuffer) appearing in the rdata chain.
              *
              * Each backed-up buffer contributes a BkpBlock header entry followed by
              * either one entry covering the whole page (no hole) or two entries
              * covering the page contents before and after the hole.
 870          */
 871         rdt_lastnormal = rdt;
 872         write_len = len;
 873         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 874         {
 875                 BkpBlock   *bkpb;
 876                 char       *page;
 877
 878                 if (!dtbuf_bkp[i])
 879                         continue;
 880
 881                 info |= XLR_BKP_BLOCK(i);
 882
 883                 bkpb = &(dtbuf_xlg[i]);
 884                 page = (char *) BufferGetBlock(dtbuf[i]);
 885
 886                 rdt->next = &(dtbuf_rdt1[i]);
 887                 rdt = rdt->next;
 888
 889                 rdt->data = (char *) bkpb;
 890                 rdt->len = sizeof(BkpBlock);
 891                 write_len += sizeof(BkpBlock);
 892
 893                 rdt->next = &(dtbuf_rdt2[i]);
 894                 rdt = rdt->next;
 895
 896                 if (bkpb->hole_length == 0)
 897                 {
 898                         rdt->data = page;
 899                         rdt->len = BLCKSZ;
 900                         write_len += BLCKSZ;
 901                         rdt->next = NULL;
 902                 }
 903                 else
 904                 {
 905                         /* must skip the hole */
 906                         rdt->data = page;
 907                         rdt->len = bkpb->hole_offset;
 908                         write_len += bkpb->hole_offset;
 909
 910                         rdt->next = &(dtbuf_rdt3[i]);
 911                         rdt = rdt->next;
 912
 913                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
 914                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
 915                         write_len += rdt->len;
 916                         rdt->next = NULL;
 917                 }
 918         }
 919
 920         /*
 921          * Calculate CRC of the data, including all the backup blocks
 922          *
 923          * Note that the record header isn't added into the CRC initially since we
 924          * don't know the prev-link yet.  Thus, the CRC will represent the CRC of
 925          * the whole record in the order: rdata, then backup blocks, then record
 926          * header.
 927          */
 928         INIT_CRC32(rdata_crc);
 929         for (rdt = rdata; rdt != NULL; rdt = rdt->next)
 930                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 931
 932         /*
 933          * Construct record header (prev-link and CRC are filled in later), and
 934          * make that the first chunk in the chain.
 935          */
 936         rechdr->xl_xid = GetCurrentTransactionIdIfAny();
 937         rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
 938         rechdr->xl_len = len;           /* doesn't include backup blocks */
 939         rechdr->xl_info = info;
 940         rechdr->xl_rmid = rmid;
 941
 942         hdr_rdt.next = rdata;
 943         hdr_rdt.data = (char *) rechdr;
 944         hdr_rdt.len = SizeOfXLogRecord;
 945
 946         write_len += SizeOfXLogRecord;	/* now header + rdata + backup blocks */
 947
 948         START_CRIT_SECTION();
 949
 950         /* Now wait to get insert lock */
 951         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 952
 953         /*
 954          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
 955          * back and recompute everything.  This can only happen just after a
 956          * checkpoint, so it's better to be slow in this case and fast otherwise.
 957          *
 958          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
 959          * affect the contents of the XLOG record, so we'll update our local copy
 960          * but not force a recomputation.
 961          */
 962         if (RedoRecPtr != Insert->RedoRecPtr)
 963         {
 964                 Assert(RedoRecPtr < Insert->RedoRecPtr);
 965                 RedoRecPtr = Insert->RedoRecPtr;
 966
 967                 if (doPageWrites)
 968                 {
 969                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 970                         {
 971                                 if (dtbuf[i] == InvalidBuffer)
 972                                         continue;
 973                                 if (dtbuf_bkp[i] == false &&
 974                                         dtbuf_lsn[i] <= RedoRecPtr)
 975                                 {
 976                                         /*
 977                                          * Oops, this buffer now needs to be backed up, but we
 978                                          * didn't think so above.  Start over.
                                              *
                                              * Before retrying, drop the lock and critical
                                              * section, cut the backup-block entries off the
                                              * end of the chain, and restore the caller's
                                              * info bits.
 979                                          */
 980                                         LWLockRelease(WALInsertLock);
 981                                         END_CRIT_SECTION();
 982                                         rdt_lastnormal->next = NULL;
 983                                         info = info_orig;
 984                                         goto begin;
 985                                 }
 986                         }
 987                 }
 988         }
 989
 990         /*
 991          * Also check to see if fullPageWrites or forcePageWrites was just turned
 992          * on; if we weren't already doing full-page writes then go back and
 993          * recompute. (If it was just turned off, we could recompute the record
 994          * without full pages, but we choose not to bother.)
 995          */
 996         if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
 997         {
 998                 /* Oops, must redo it with full-page data. */
 999                 LWLockRelease(WALInsertLock);
1000                 END_CRIT_SECTION();
1001                 rdt_lastnormal->next = NULL;
1002                 info = info_orig;
1003                 goto begin;
1004         }
1005
1006         /*
1007          * If the current page is completely full, the record goes to the next
1008          * page, right after the page header.
1009          */
1010         updrqst = false;
1011         freespace = INSERT_FREESPACE(Insert);
1012         if (freespace == 0)
1013         {
1014                 updrqst = AdvanceXLInsertBuffer(false);
1015                 freespace = INSERT_FREESPACE(Insert);
1016         }
1017
1018         /* Compute record's XLOG location */
1019         curridx = Insert->curridx;
1020         INSERT_RECPTR(RecPtr, Insert, curridx);
1021
1022         /*
1023          * If the record is an XLOG_SWITCH, and we are exactly at the start of a
1024          * segment, we need not insert it (and don't want to because we'd like
1025          * consecutive switch requests to be no-ops).  Instead, make sure
1026          * everything is written and flushed through the end of the prior segment,
1027          * and return the prior segment's end address.
1028          */
1029         if (isLogSwitch && (RecPtr % XLogSegSize) == SizeOfXLogLongPHD)
1030         {
1031                 /* We can release insert lock immediately */
1032                 LWLockRelease(WALInsertLock);
1033
                     /* Back up over the long page header to the segment boundary */
1034                 RecPtr -= SizeOfXLogLongPHD;
1035
1036                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1037                 LogwrtResult = XLogCtl->LogwrtResult;
1038                 if (LogwrtResult.Flush < RecPtr)
1039                 {
1040                         XLogwrtRqst FlushRqst;
1041
1042                         FlushRqst.Write = RecPtr;
1043                         FlushRqst.Flush = RecPtr;
1044                         XLogWrite(FlushRqst, false, false);
1045                 }
1046                 LWLockRelease(WALWriteLock);
1047
1048                 END_CRIT_SECTION();
1049
1050                 /* wake up walsenders now that we've released heavily contended locks */
1051                 WalSndWakeupProcessRequests();
1052                 return RecPtr;
1053         }
1054
1055         /* Finish the record header */
1056         rechdr->xl_prev = Insert->PrevRecord;
1057
1058         /* Now we can finish computing the record's CRC */
1059         COMP_CRC32(rdata_crc, (char *) rechdr, offsetof(XLogRecord, xl_crc));
1060         FIN_CRC32(rdata_crc);
1061         rechdr->xl_crc = rdata_crc;
1062
1063 #ifdef WAL_DEBUG
1064         if (XLOG_DEBUG)
1065         {
1066                 StringInfoData buf;
1067
1068                 initStringInfo(&buf);
1069                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1070                                                  (uint32) (RecPtr >> 32), (uint32) RecPtr);
1071                 xlog_outrec(&buf, rechdr);
                     /* only the first chain entry's data is handed to rm_desc */
1072                 if (rdata->data != NULL)
1073                 {
1074                         appendStringInfo(&buf, " - ");
1075                         RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
1076                 }
1077                 elog(LOG, "%s", buf.data);
1078                 pfree(buf.data);
1079         }
1080 #endif
1081
1082         /* Record begin of record in appropriate places */
1083         ProcLastRecPtr = RecPtr;
1084         Insert->PrevRecord = RecPtr;
1085
1086         /*
1087          * Append the data, including backup blocks if any
              *
              * The chain is walked starting from the header entry; write_len counts
              * the bytes still to be copied.  Whenever the current page runs out of
              * room we move to the next buffer page and mark it as continuing this
              * record.
1088          */
1089         rdata = &hdr_rdt;
1090         while (write_len)
1091         {
                     /* skip entries with no data (e.g. replaced by a backup block) */
1092                 while (rdata->data == NULL)
1093                         rdata = rdata->next;
1094
1095                 if (freespace > 0)
1096                 {
1097                         if (rdata->len > freespace)
1098                         {
                                     /*
                                      * Chunk doesn't fit in the rest of this page: copy
                                      * what fits, then fall through to switch pages.
                                      */
1099                                 memcpy(Insert->currpos, rdata->data, freespace);
1100                                 rdata->data += freespace;
1101                                 rdata->len -= freespace;
1102                                 write_len -= freespace;
1103                         }
1104                         else
1105                         {
                                     /* Chunk fits: copy all of it and go on to the next one */
1106                                 memcpy(Insert->currpos, rdata->data, rdata->len);
1107                                 freespace -= rdata->len;
1108                                 write_len -= rdata->len;
1109                                 Insert->currpos += rdata->len;
1110                                 rdata = rdata->next;
1111                                 continue;
1112                         }
1113                 }
1114
1115                 /* Use next buffer */
1116                 updrqst = AdvanceXLInsertBuffer(false);
1117                 curridx = Insert->curridx;
1118                 /* Mark page header to indicate this record continues on the page */
1119                 Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1120                 Insert->currpage->xlp_rem_len = write_len;
1121                 freespace = INSERT_FREESPACE(Insert);
1122         }
1123
1124         /* Ensure next record will be properly aligned */
1125         Insert->currpos = (char *) Insert->currpage +
1126                 MAXALIGN(Insert->currpos - (char *) Insert->currpage);
1127         freespace = INSERT_FREESPACE(Insert);
1128
1129         /*
1130          * The recptr I return is the beginning of the *next* record. This will be
1131          * stored as LSN for changed data pages...
1132          */
1133         INSERT_RECPTR(RecPtr, Insert, curridx);
1134
1135         /*
1136          * If the record is an XLOG_SWITCH, we must now write and flush all the
1137          * existing data, and then forcibly advance to the start of the next
1138          * segment.  It's not good to do this I/O while holding the insert lock,
1139          * but there seems too much risk of confusion if we try to release the
1140          * lock sooner.  Fortunately xlog switch needn't be a high-performance
1141          * operation anyway...
1142          */
1143         if (isLogSwitch)
1144         {
1145                 XLogwrtRqst FlushRqst;
1146                 XLogRecPtr      OldSegEnd;
1147
1148                 TRACE_POSTGRESQL_XLOG_SWITCH();
1149
1150                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1151
1152                 /*
1153                  * Flush through the end of the page containing XLOG_SWITCH, and
1154                  * perform end-of-segment actions (eg, notifying archiver).
1155                  */
1156                 WriteRqst = XLogCtl->xlblocks[curridx];
1157                 FlushRqst.Write = WriteRqst;
1158                 FlushRqst.Flush = WriteRqst;
1159                 XLogWrite(FlushRqst, false, true);
1160
1161                 /* Set up the next buffer as first page of next segment */
1162                 /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
1163                 (void) AdvanceXLInsertBuffer(true);
1164
1165                 /* There should be no unwritten data */
1166                 curridx = Insert->curridx;
1167                 Assert(curridx == XLogCtl->Write.curridx);
1168
1169                 /* Compute end address of old segment */
                     /* (xlblocks[] holds page end addresses, so step back one page) */
1170                 OldSegEnd = XLogCtl->xlblocks[curridx];
1171                 OldSegEnd -= XLOG_BLCKSZ;
1172
1173                 /* Make it look like we've written and synced all of old segment */
1174                 LogwrtResult.Write = OldSegEnd;
1175                 LogwrtResult.Flush = OldSegEnd;
1176
1177                 /*
1178                  * Update shared-memory status --- this code should match XLogWrite
1179                  */
1180                 {
1181                         /* use volatile pointer to prevent code rearrangement */
1182                         volatile XLogCtlData *xlogctl = XLogCtl;
1183
1184                         SpinLockAcquire(&xlogctl->info_lck);
1185                         xlogctl->LogwrtResult = LogwrtResult;
1186                         if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
1187                                 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1188                         if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
1189                                 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1190                         SpinLockRelease(&xlogctl->info_lck);
1191                 }
1192
1193                 LWLockRelease(WALWriteLock);
1194
1195                 updrqst = false;                /* done already */
1196         }
1197         else
1198         {
1199                 /* normal case, ie not xlog switch */
1200
1201                 /* Need to update shared LogwrtRqst if some block was filled up */
1202                 if (freespace == 0)
1203                 {
1204                         /* curridx is filled and available for writing out */
1205                         updrqst = true;
1206                 }
1207                 else
1208                 {
1209                         /* if updrqst already set, write through end of previous buf */
1210                         curridx = PrevBufIdx(curridx);
1211                 }
1212                 WriteRqst = XLogCtl->xlblocks[curridx];
1213         }
1214
1215         LWLockRelease(WALInsertLock);
1216
             /*
              * If a page was filled and nobody has advanced the shared write request
              * yet, do it now under info_lck (the insert lock is already released).
              */
1217         if (updrqst)
1218         {
1219                 /* use volatile pointer to prevent code rearrangement */
1220                 volatile XLogCtlData *xlogctl = XLogCtl;
1221
1222                 SpinLockAcquire(&xlogctl->info_lck);
1223                 /* advance global request to include new block(s) */
1224                 if (xlogctl->LogwrtRqst.Write < WriteRqst)
1225                         xlogctl->LogwrtRqst.Write = WriteRqst;
1226                 /* update local result copy while I have the chance */
1227                 LogwrtResult = xlogctl->LogwrtResult;
1228                 SpinLockRelease(&xlogctl->info_lck);
1229         }
1230
             /* RecPtr was recomputed above to point just past the new record */
1231         XactLastRecEnd = RecPtr;
1232
1233         END_CRIT_SECTION();
1234
1235         /* wake up walsenders now that we've released heavily contended locks */
1236         WalSndWakeupProcessRequests();
1237
1238         return RecPtr;
1239 }
1240
1241 /*
1242  * Determine whether the buffer referenced by an XLogRecData item has to
1243  * be backed up, and if so fill a BkpBlock struct for it.  In any case
1244  * save the buffer's LSN at *lsn.
1245  */
1246 static bool
1247 XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
1248                                 XLogRecPtr *lsn, BkpBlock *bkpb)
1249 {
1250         Page            page;
1251
1252         page = BufferGetPage(rdata->buffer);
1253
1254         /*
1255          * We assume page LSN is first data on *every* page that can be passed to
1256          * XLogInsert, whether it has the standard page layout or not. We don't
1257          * need to take the buffer header lock for PageGetLSN if we hold an
1258          * exclusive lock on the page and/or the relation.
1259          */
1260         if (holdsExclusiveLock)
1261                 *lsn = PageGetLSN(page);
1262         else
1263                 *lsn = BufferGetLSNAtomic(rdata->buffer);
1264
1265         if (*lsn <= RedoRecPtr)
1266         {
1267                 /*
1268                  * The page needs to be backed up, so set up *bkpb
1269                  */
1270                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1271
1272                 if (rdata->buffer_std)
1273                 {
1274                         /* Assume we can omit data between pd_lower and pd_upper */
1275                         uint16          lower = ((PageHeader) page)->pd_lower;
1276                         uint16          upper = ((PageHeader) page)->pd_upper;
1277
1278                         if (lower >= SizeOfPageHeaderData &&
1279                                 upper > lower &&
1280                                 upper <= BLCKSZ)
1281                         {
1282                                 bkpb->hole_offset = lower;
1283                                 bkpb->hole_length = upper - lower;
1284                         }
1285                         else
1286                         {
1287                                 /* No "hole" to compress out */
1288                                 bkpb->hole_offset = 0;
1289                                 bkpb->hole_length = 0;
1290                         }
1291                 }
1292                 else
1293                 {
1294                         /* Not a standard page header, don't try to eliminate "hole" */
1295                         bkpb->hole_offset = 0;
1296                         bkpb->hole_length = 0;
1297                 }
1298
1299                 return true;                    /* buffer requires backup */
1300         }
1301
1302         return false;                           /* buffer does not need to be backed up */
1303 }
1304
1305 /*
1306  * Advance the Insert state to the next buffer page, writing out the next
1307  * buffer if it still contains unwritten data.
1308  *
1309  * If new_segment is TRUE then we set up the next buffer page as the first
1310  * page of the next xlog segment file, possibly but not usually the next
1311  * consecutive file page.
1312  *
1313  * The global LogwrtRqst.Write pointer needs to be advanced to include the
1314  * just-filled page.  If we can do this for free (without an extra lock),
1315  * we do so here.  Otherwise the caller must do it.  We return TRUE if the
1316  * request update still needs to be done, FALSE if we did it internally.
1317  *
1318  * Must be called with WALInsertLock held.
1319  */
1320 static bool
1321 AdvanceXLInsertBuffer(bool new_segment)
1322 {
1323         XLogCtlInsert *Insert = &XLogCtl->Insert;
1324         int                     nextidx = NextBufIdx(Insert->curridx);
1325         bool            update_needed = true;
1326         XLogRecPtr      OldPageRqstPtr;
1327         XLogwrtRqst WriteRqst;
1328         XLogRecPtr      NewPageEndPtr;
1329         XLogRecPtr      NewPageBeginPtr;
1330         XLogPageHeader NewPage;
1331
1332         /*
1333          * Get ending-offset of the buffer page we need to replace (this may be
1334          * zero if the buffer hasn't been used yet).  Fall through if it's already
1335          * written out.
1336          */
1337         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
1338         if (LogwrtResult.Write < OldPageRqstPtr)
1339         {
1340                 /* nope, got work to do... */
1341                 XLogRecPtr      FinishedPageRqstPtr;
1342
1343                 FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1344
1345                 /* Before waiting, get info_lck and update LogwrtResult */
1346                 {
1347                         /* use volatile pointer to prevent code rearrangement */
1348                         volatile XLogCtlData *xlogctl = XLogCtl;
1349
1350                         SpinLockAcquire(&xlogctl->info_lck);
1351                         if (xlogctl->LogwrtRqst.Write < FinishedPageRqstPtr)
1352                                 xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
1353                         LogwrtResult = xlogctl->LogwrtResult;
1354                         SpinLockRelease(&xlogctl->info_lck);
1355                 }
1356
1357                 update_needed = false;  /* Did the shared-request update */
1358
1359                 /*
1360                  * Now that we have an up-to-date LogwrtResult value, see if we still
1361                  * need to write it or if someone else already did.
1362                  */
1363                 if (LogwrtResult.Write < OldPageRqstPtr)
1364                 {
1365                         /* Must acquire write lock */
1366                         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1367                         LogwrtResult = XLogCtl->LogwrtResult;
1368                         if (LogwrtResult.Write >= OldPageRqstPtr)
1369                         {
1370                                 /* OK, someone wrote it already */
1371                                 LWLockRelease(WALWriteLock);
1372                         }
1373                         else
1374                         {
1375                                 /*
1376                                  * Have to write buffers while holding insert lock. This is
1377                                  * not good, so only write as much as we absolutely must.
1378                                  */
1379                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
1380                                 WriteRqst.Write = OldPageRqstPtr;
1381                                 WriteRqst.Flush = 0;
1382                                 XLogWrite(WriteRqst, false, false);
1383                                 LWLockRelease(WALWriteLock);
1384                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1385                         }
1386                 }
1387         }
1388
1389         /*
1390          * Now the next buffer slot is free and we can set it up to be the next
1391          * output page.
1392          */
1393         NewPageBeginPtr = XLogCtl->xlblocks[Insert->curridx];
1394
1395         if (new_segment)
1396         {
1397                 /* force it to a segment start point */
1398                 if (NewPageBeginPtr % XLogSegSize != 0)
1399                         NewPageBeginPtr += XLogSegSize - NewPageBeginPtr % XLogSegSize;
1400         }
1401
1402         NewPageEndPtr = NewPageBeginPtr;
1403         NewPageEndPtr += XLOG_BLCKSZ;
1404         XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1405         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
1406
1407         Insert->curridx = nextidx;
1408         Insert->currpage = NewPage;
1409
1410         Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
1411
1412         /*
1413          * Be sure to re-zero the buffer so that bytes beyond what we've written
1414          * will look like zeroes and not valid XLOG records...
1415          */
1416         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1417
1418         /*
1419          * Fill the new page's header
1420          */
1421         NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
1422
1423         /* NewPage->xlp_info = 0; */    /* done by memset */
1424         NewPage   ->xlp_tli = ThisTimeLineID;
1425         NewPage   ->xlp_pageaddr = NewPageBeginPtr;
1426
1427         /*
1428          * If online backup is not in progress, mark the header to indicate that
1429          * WAL records beginning in this page have removable backup blocks.  This
1430          * allows the WAL archiver to know whether it is safe to compress archived
1431          * WAL data by transforming full-block records into the non-full-block
1432          * format.      It is sufficient to record this at the page level because we
1433          * force a page switch (in fact a segment switch) when starting a backup,
1434          * so the flag will be off before any records can be written during the
1435          * backup.      At the end of a backup, the last page will be marked as all
1436          * unsafe when perhaps only part is unsafe, but at worst the archiver
1437          * would miss the opportunity to compress a few records.
1438          */
1439         if (!Insert->forcePageWrites)
1440                 NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;
1441
1442         /*
1443          * If first page of an XLOG segment file, make it a long header.
1444          */
1445         if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
1446         {
1447                 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1448
1449                 NewLongPage->xlp_sysid = ControlFile->system_identifier;
1450                 NewLongPage->xlp_seg_size = XLogSegSize;
1451                 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
1452                 NewPage   ->xlp_info |= XLP_LONG_HEADER;
1453
1454                 Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1455         }
1456
1457         return update_needed;
1458 }
1459
1460 /*
1461  * Check whether we've consumed enough xlog space that a checkpoint is needed.
1462  *
1463  * new_segno indicates a log file that has just been filled up (or read
1464  * during recovery). We measure the distance from RedoRecPtr to new_segno
1465  * and see if that exceeds CheckPointSegments.
1466  *
1467  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
1468  */
1469 static bool
1470 XLogCheckpointNeeded(XLogSegNo new_segno)
1471 {
1472         XLogSegNo       old_segno;
1473
1474         XLByteToSeg(RedoRecPtr, old_segno);
1475
1476         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
1477                 return true;
1478         return false;
1479 }
1480
1481 /*
1482  * Write and/or fsync the log at least as far as WriteRqst indicates.
1483  *
1484  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
1485  * may stop at any convenient boundary (such as a cache or logfile boundary).
1486  * This option allows us to avoid uselessly issuing multiple writes when a
1487  * single one would do.
1488  *
1489  * If xlog_switch == TRUE, we are intending an xlog segment switch, so
1490  * perform end-of-segment actions after writing the last page, even if
1491  * it's not physically the end of its segment.  (NB: this will work properly
1492  * only if caller specifies WriteRqst == page-end and flexible == false,
1493  * and there is some data to write.)
1494  *
1495  * Must be called with WALWriteLock held.
1496  */
1497 static void
1498 XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1499 {
1500         XLogCtlWrite *Write = &XLogCtl->Write;
1501         bool            ispartialpage;
1502         bool            last_iteration;
1503         bool            finishing_seg;
1504         bool            use_existent;
1505         int                     curridx;
1506         int                     npages;
1507         int                     startidx;
1508         uint32          startoffset;
1509
1510         /* We should always be inside a critical section here */
1511         Assert(CritSectionCount > 0);
1512
1513         /*
1514          * Update local LogwrtResult (caller probably did this already, but...)
1515          */
1516         LogwrtResult = XLogCtl->LogwrtResult;
1517
1518         /*
1519          * Since successive pages in the xlog cache are consecutively allocated,
1520          * we can usually gather multiple pages together and issue just one
1521          * write() call.  npages is the number of pages we have determined can be
1522          * written together; startidx is the cache block index of the first one,
1523          * and startoffset is the file offset at which it should go. The latter
1524          * two variables are only valid when npages > 0, but we must initialize
1525          * all of them to keep the compiler quiet.
1526          */
1527         npages = 0;
1528         startidx = 0;
1529         startoffset = 0;
1530
1531         /*
1532          * Within the loop, curridx is the cache block index of the page to
1533          * consider writing.  We advance Write->curridx only after successfully
1534          * writing pages.  (Right now, this refinement is useless since we are
1535          * going to PANIC if any error occurs anyway; but someday it may come in
1536          * useful.)
1537          */
1538         curridx = Write->curridx;
1539
1540         while (LogwrtResult.Write < WriteRqst.Write)
1541         {
1542                 /*
1543                  * Make sure we're not ahead of the insert process.  This could happen
1544                  * if we're passed a bogus WriteRqst.Write that is past the end of the
1545                  * last page that's been initialized by AdvanceXLInsertBuffer.
1546                  */
1547                 if (LogwrtResult.Write >= XLogCtl->xlblocks[curridx])
1548                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1549                                  (uint32) (LogwrtResult.Write >> 32),
1550                                  (uint32) LogwrtResult.Write,
1551                                  (uint32) (XLogCtl->xlblocks[curridx] >> 32),
1552                                  (uint32) XLogCtl->xlblocks[curridx]);
1553
1554                 /* Advance LogwrtResult.Write to end of current buffer page */
1555                 LogwrtResult.Write = XLogCtl->xlblocks[curridx];
1556                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
1557
1558                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
1559                 {
1560                         /*
1561                          * Switch to new logfile segment.  We cannot have any pending
1562                          * pages here (since we dump what we have at segment end).
1563                          */
1564                         Assert(npages == 0);
1565                         if (openLogFile >= 0)
1566                                 XLogFileClose();
1567                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
1568
1569                         /* create/use new log file */
1570                         use_existent = true;
1571                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
1572                         openLogOff = 0;
1573                 }
1574
1575                 /* Make sure we have the current logfile open */
1576                 if (openLogFile < 0)
1577                 {
1578                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
1579                         openLogFile = XLogFileOpen(openLogSegNo);
1580                         openLogOff = 0;
1581                 }
1582
1583                 /* Add current page to the set of pending pages-to-dump */
1584                 if (npages == 0)
1585                 {
1586                         /* first of group */
1587                         startidx = curridx;
1588                         startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
1589                 }
1590                 npages++;
1591
1592                 /*
1593                  * Dump the set if this will be the last loop iteration, or if we are
1594                  * at the last page of the cache area (since the next page won't be
1595                  * contiguous in memory), or if we are at the end of the logfile
1596                  * segment.
1597                  */
1598                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
1599
1600                 finishing_seg = !ispartialpage &&
1601                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1602
1603                 if (last_iteration ||
1604                         curridx == XLogCtl->XLogCacheBlck ||
1605                         finishing_seg)
1606                 {
1607                         char       *from;
1608                         Size            nbytes;
1609
1610                         /* Need to seek in the file? */
1611                         if (openLogOff != startoffset)
1612                         {
1613                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
1614                                         ereport(PANIC,
1615                                                         (errcode_for_file_access(),
1616                                          errmsg("could not seek in log file %s to offset %u: %m",
1617                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
1618                                                         startoffset)));
1619                                 openLogOff = startoffset;
1620                         }
1621
1622                         /* OK to write the page(s) */
1623                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
1624                         nbytes = npages * (Size) XLOG_BLCKSZ;
1625                         errno = 0;
1626                         if (write(openLogFile, from, nbytes) != nbytes)
1627                         {
1628                                 /* if write didn't set errno, assume no disk space */
1629                                 if (errno == 0)
1630                                         errno = ENOSPC;
1631                                 ereport(PANIC,
1632                                                 (errcode_for_file_access(),
1633                                                  errmsg("could not write to log file %s "
1634                                                                 "at offset %u, length %lu: %m",
1635                                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo),
1636                                                                 openLogOff, (unsigned long) nbytes)));
1637                         }
1638
1639                         /* Update state for write */
1640                         openLogOff += nbytes;
1641                         Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
1642                         npages = 0;
1643
1644                         /*
1645                          * If we just wrote the whole last page of a logfile segment,
1646                          * fsync the segment immediately.  This avoids having to go back
1647                          * and re-open prior segments when an fsync request comes along
1648                          * later. Doing it here ensures that one and only one backend will
1649                          * perform this fsync.
1650                          *
1651                          * We also do this if this is the last page written for an xlog
1652                          * switch.
1653                          *
1654                          * This is also the right place to notify the Archiver that the
1655                          * segment is ready to copy to archival storage, and to update the
1656                          * timer for archive_timeout, and to signal for a checkpoint if
1657                          * too many logfile segments have been used since the last
1658                          * checkpoint.
1659                          */
1660                         if (finishing_seg || (xlog_switch && last_iteration))
1661                         {
1662                                 issue_xlog_fsync(openLogFile, openLogSegNo);
1663
1664                                 /* signal that we need to wakeup walsenders later */
1665                                 WalSndWakeupRequest();
1666
1667                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
1668
1669                                 if (XLogArchivingActive())
1670                                         XLogArchiveNotifySeg(openLogSegNo);
1671
1672                                 Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1673
1674                                 /*
1675                                  * Request a checkpoint if we've consumed too much xlog since
1676                                  * the last one.  For speed, we first check using the local
1677                                  * copy of RedoRecPtr, which might be out of date; if it looks
1678                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
1679                                  * recheck.
1680                                  */
1681                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
1682                                 {
1683                                         (void) GetRedoRecPtr();
1684                                         if (XLogCheckpointNeeded(openLogSegNo))
1685                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1686                                 }
1687                         }
1688                 }
1689
1690                 if (ispartialpage)
1691                 {
1692                         /* Only asked to write a partial page */
1693                         LogwrtResult.Write = WriteRqst.Write;
1694                         break;
1695                 }
1696                 curridx = NextBufIdx(curridx);
1697
1698                 /* If flexible, break out of loop as soon as we wrote something */
1699                 if (flexible && npages == 0)
1700                         break;
1701         }
1702
1703         Assert(npages == 0);
1704         Assert(curridx == Write->curridx);
1705
1706         /*
1707          * If asked to flush, do so
1708          */
1709         if (LogwrtResult.Flush < WriteRqst.Flush &&
1710                 LogwrtResult.Flush < LogwrtResult.Write)
1711
1712         {
1713                 /*
1714                  * Could get here without iterating above loop, in which case we might
1715                  * have no open file or the wrong one.  However, we do not need to
1716                  * fsync more than one file.
1717                  */
1718                 if (sync_method != SYNC_METHOD_OPEN &&
1719                         sync_method != SYNC_METHOD_OPEN_DSYNC)
1720                 {
1721                         if (openLogFile >= 0 &&
1722                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
1723                                 XLogFileClose();
1724                         if (openLogFile < 0)
1725                         {
1726                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
1727                                 openLogFile = XLogFileOpen(openLogSegNo);
1728                                 openLogOff = 0;
1729                         }
1730
1731                         issue_xlog_fsync(openLogFile, openLogSegNo);
1732                 }
1733
1734                 /* signal that we need to wakeup walsenders later */
1735                 WalSndWakeupRequest();
1736
1737                 LogwrtResult.Flush = LogwrtResult.Write;
1738         }
1739
1740         /*
1741          * Update shared-memory status
1742          *
1743          * We make sure that the shared 'request' values do not fall behind the
1744          * 'result' values.  This is not absolutely essential, but it saves some
1745          * code in a couple of places.
1746          */
1747         {
1748                 /* use volatile pointer to prevent code rearrangement */
1749                 volatile XLogCtlData *xlogctl = XLogCtl;
1750
1751                 SpinLockAcquire(&xlogctl->info_lck);
1752                 xlogctl->LogwrtResult = LogwrtResult;
1753                 if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
1754                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1755                 if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
1756                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1757                 SpinLockRelease(&xlogctl->info_lck);
1758         }
1759 }
1760
1761 /*
1762  * Record the LSN for an asynchronous transaction commit/abort
1763  * and nudge the WALWriter if there is work for it to do.
1764  * (This should not be called for synchronous commits.)
1765  */
1766 void
1767 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
1768 {
1769         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
1770         bool            sleeping;
1771
1772         /* use volatile pointer to prevent code rearrangement */
1773         volatile XLogCtlData *xlogctl = XLogCtl;
1774
1775         SpinLockAcquire(&xlogctl->info_lck);
1776         LogwrtResult = xlogctl->LogwrtResult;
1777         sleeping = xlogctl->WalWriterSleeping;
1778         if (xlogctl->asyncXactLSN < asyncXactLSN)
1779                 xlogctl->asyncXactLSN = asyncXactLSN;
1780         SpinLockRelease(&xlogctl->info_lck);
1781
1782         /*
1783          * If the WALWriter is sleeping, we should kick it to make it come out of
1784          * low-power mode.      Otherwise, determine whether there's a full page of
1785          * WAL available to write.
1786          */
1787         if (!sleeping)
1788         {
1789                 /* back off to last completed page boundary */
1790                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
1791
1792                 /* if we have already flushed that far, we're done */
1793                 if (WriteRqstPtr <= LogwrtResult.Flush)
1794                         return;
1795         }
1796
1797         /*
1798          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
1799          * to come out of low-power mode so that this async commit will reach disk
1800          * within the expected amount of time.
1801          */
1802         if (ProcGlobal->walwriterLatch)
1803                 SetLatch(ProcGlobal->walwriterLatch);
1804 }
1805
1806 /*
1807  * Advance minRecoveryPoint in control file.
1808  *
1809  * If we crash during recovery, we must reach this point again before the
1810  * database is consistent.
1811  *
1812  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
1813  * is only updated if it's not already greater than or equal to 'lsn'.
1814  */
1815 static void
1816 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
1817 {
1818         /* Quick check using our local copy of the variable */
1819         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
1820                 return;
1821
1822         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
1823
1824         /* update local copy */
1825         minRecoveryPoint = ControlFile->minRecoveryPoint;
1826         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
1827
1828         /*
1829          * An invalid minRecoveryPoint means that we need to recover all the WAL,
1830          * i.e., we're doing crash recovery.  We never modify the control file's
1831          * value in that case, so we can short-circuit future checks here too.
1832          */
1833         if (minRecoveryPoint == 0)
1834                 updateMinRecoveryPoint = false;
1835         else if (force || minRecoveryPoint < lsn)
1836         {
1837                 /* use volatile pointer to prevent code rearrangement */
1838                 volatile XLogCtlData *xlogctl = XLogCtl;
1839                 XLogRecPtr      newMinRecoveryPoint;
1840                 TimeLineID      newMinRecoveryPointTLI;
1841
1842                 /*
1843                  * To avoid having to update the control file too often, we update it
1844                  * all the way to the last record being replayed, even though 'lsn'
1845                  * would suffice for correctness.  This also allows the 'force' case
1846                  * to not need a valid 'lsn' value.
1847                  *
1848                  * Another important reason for doing it this way is that the passed
1849                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
1850                  * the caller got it from a corrupted heap page.  Accepting such a
1851                  * value as the min recovery point would prevent us from coming up at
1852                  * all.  Instead, we just log a warning and continue with recovery.
1853                  * (See also the comments about corrupt LSNs in XLogFlush.)
1854                  */
1855                 SpinLockAcquire(&xlogctl->info_lck);
1856                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
1857                 newMinRecoveryPointTLI = xlogctl->replayEndTLI;
1858                 SpinLockRelease(&xlogctl->info_lck);
1859
1860                 if (!force && newMinRecoveryPoint < lsn)
1861                         elog(WARNING,
1862                            "xlog min recovery request %X/%X is past current point %X/%X",
1863                                  (uint32) (lsn >> 32), (uint32) lsn,
1864                                  (uint32) (newMinRecoveryPoint >> 32),
1865                                  (uint32) newMinRecoveryPoint);
1866
1867                 /* update control file */
1868                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
1869                 {
1870                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
1871                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
1872                         UpdateControlFile();
1873                         minRecoveryPoint = newMinRecoveryPoint;
1874                         minRecoveryPointTLI = newMinRecoveryPointTLI;
1875
1876                         ereport(DEBUG2,
1877                                 (errmsg("updated min recovery point to %X/%X on timeline %u",
1878                                                 (uint32) (minRecoveryPoint >> 32),
1879                                                 (uint32) minRecoveryPoint,
1880                                                 newMinRecoveryPointTLI)));
1881                 }
1882         }
1883         LWLockRelease(ControlFileLock);
1884 }
1885
1886 /*
1887  * Ensure that all XLOG data through the given position is flushed to disk.
1888  *
1889  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
1890  * already held, and we try to avoid acquiring it if possible.
1891  */
1892 void
1893 XLogFlush(XLogRecPtr record)
1894 {
1895         XLogRecPtr      WriteRqstPtr;
1896         XLogwrtRqst WriteRqst;
1897
1898         /*
1899          * During REDO, we are reading not writing WAL.  Therefore, instead of
1900          * trying to flush the WAL, we should update minRecoveryPoint instead. We
1901          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
1902          * to act this way too, and because when it tries to write the
1903          * end-of-recovery checkpoint, it should indeed flush.
1904          */
1905         if (!XLogInsertAllowed())
1906         {
1907                 UpdateMinRecoveryPoint(record, false);
1908                 return;
1909         }
1910
1911         /* Quick exit if already known flushed */
1912         if (record <= LogwrtResult.Flush)
1913                 return;
1914
1915 #ifdef WAL_DEBUG
1916         if (XLOG_DEBUG)
1917                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
1918                          (uint32) (record >> 32), (uint32) record,
1919                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
1920                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
1921 #endif
1922
1923         START_CRIT_SECTION();
1924
1925         /*
1926          * Since fsync is usually a horribly expensive operation, we try to
1927          * piggyback as much data as we can on each fsync: if we see any more data
1928          * entered into the xlog buffer, we'll write and fsync that too, so that
1929          * the final value of LogwrtResult.Flush is as large as possible. This
1930          * gives us some chance of avoiding another fsync immediately after.
1931          */
1932
1933         /* initialize to given target; may increase below */
1934         WriteRqstPtr = record;
1935
1936         /*
1937          * Now wait until we get the write lock, or someone else does the flush
1938          * for us.
1939          */
1940         for (;;)
1941         {
1942                 /* use volatile pointer to prevent code rearrangement */
1943                 volatile XLogCtlData *xlogctl = XLogCtl;
1944
1945                 /* read LogwrtResult and update local state */
1946                 SpinLockAcquire(&xlogctl->info_lck);
1947                 if (WriteRqstPtr < xlogctl->LogwrtRqst.Write)
1948                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
1949                 LogwrtResult = xlogctl->LogwrtResult;
1950                 SpinLockRelease(&xlogctl->info_lck);
1951
1952                 /* done already? */
1953                 if (record <= LogwrtResult.Flush)
1954                         break;
1955
1956                 /*
1957                  * Try to get the write lock. If we can't get it immediately, wait
1958                  * until it's released, and recheck if we still need to do the flush
1959                  * or if the backend that held the lock did it for us already. This
1960                  * helps to maintain a good rate of group committing when the system
1961                  * is bottlenecked by the speed of fsyncing.
1962                  */
1963                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
1964                 {
1965                         /*
1966                          * The lock is now free, but we didn't acquire it yet. Before we
1967                          * do, loop back to check if someone else flushed the record for
1968                          * us already.
1969                          */
1970                         continue;
1971                 }
1972
1973                 /* Got the lock; recheck whether request is satisfied */
1974                 LogwrtResult = XLogCtl->LogwrtResult;
1975                 if (record <= LogwrtResult.Flush)
1976                 {
1977                         LWLockRelease(WALWriteLock);
1978                         break;
1979                 }
1980
1981                 /*
1982                  * Sleep before flush! By adding a delay here, we may give further
1983                  * backends the opportunity to join the backlog of group commit
1984                  * followers; this can significantly improve transaction throughput,
1985                  * at the risk of increasing transaction latency.
1986                  *
1987                  * We do not sleep if enableFsync is not turned on, nor if there are
1988                  * fewer than CommitSiblings other backends with active transactions.
1989                  */
1990                 if (CommitDelay > 0 && enableFsync &&
1991                         MinimumActiveBackends(CommitSiblings))
1992                         pg_usleep(CommitDelay);
1993
1994                 /* try to write/flush later additions to XLOG as well */
1995                 if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
1996                 {
1997                         XLogCtlInsert *Insert = &XLogCtl->Insert;
1998                         uint32          freespace = INSERT_FREESPACE(Insert);
1999
2000                         if (freespace == 0) /* buffer is full */
2001                                 WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
2002                         else
2003                         {
2004                                 WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
2005                                 WriteRqstPtr -= freespace;
2006                         }
2007                         LWLockRelease(WALInsertLock);
2008                         WriteRqst.Write = WriteRqstPtr;
2009                         WriteRqst.Flush = WriteRqstPtr;
2010                 }
2011                 else
2012                 {
2013                         WriteRqst.Write = WriteRqstPtr;
2014                         WriteRqst.Flush = record;
2015                 }
2016                 XLogWrite(WriteRqst, false, false);
2017
2018                 LWLockRelease(WALWriteLock);
2019                 /* done */
2020                 break;
2021         }
2022
2023         END_CRIT_SECTION();
2024
2025         /* wake up walsenders now that we've released heavily contended locks */
2026         WalSndWakeupProcessRequests();
2027
2028         /*
2029          * If we still haven't flushed to the request point then we have a
2030          * problem; most likely, the requested flush point is past end of XLOG.
2031          * This has been seen to occur when a disk page has a corrupted LSN.
2032          *
2033          * Formerly we treated this as a PANIC condition, but that hurts the
2034          * system's robustness rather than helping it: we do not want to take down
2035          * the whole system due to corruption on one data page.  In particular, if
2036          * the bad page is encountered again during recovery then we would be
2037          * unable to restart the database at all!  (This scenario actually
2038          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
2039          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2040          * the only time we can reach here during recovery is while flushing the
2041          * end-of-recovery checkpoint record, and we don't expect that to have a
2042          * bad LSN.
2043          *
2044          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2045          * since xact.c calls this routine inside a critical section.  However,
2046          * calls from bufmgr.c are not within critical sections and so we will not
2047          * force a restart for a bad LSN on a data page.
2048          */
2049         if (LogwrtResult.Flush < record)
2050                 elog(ERROR,
2051                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2052                          (uint32) (record >> 32), (uint32) record,
2053                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2054 }
2055
2056 /*
2057  * Flush xlog, but without specifying exactly where to flush to.
2058  *
2059  * We normally flush only completed blocks; but if there is nothing to do on
2060  * that basis, we check for unflushed async commits in the current incomplete
2061  * block, and flush through the latest one of those.  Thus, if async commits
2062  * are not being used, we will flush complete blocks only.      We can guarantee
2063  * that async commits reach disk after at most three cycles; normally only
2064  * one or two.  (When flushing complete blocks, we allow XLogWrite to write
2065  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
2066  * difference only with very high load or long wal_writer_delay, but imposes
2067  * one extra cycle for the worst case for async commits.)
2068  *
2069  * This routine is invoked periodically by the background walwriter process.
2070  *
2071  * Returns TRUE if we flushed anything.
2072  */
2073 bool
2074 XLogBackgroundFlush(void)
2075 {
2076         XLogRecPtr      WriteRqstPtr;
2077         bool            flexible = true;
2078         bool            wrote_something = false;
2079
2080         /* XLOG doesn't need flushing during recovery */
2081         if (RecoveryInProgress())
2082                 return false;
2083
2084         /* read LogwrtResult and update local state */
2085         {
2086                 /* use volatile pointer to prevent code rearrangement */
2087                 volatile XLogCtlData *xlogctl = XLogCtl;
2088
2089                 SpinLockAcquire(&xlogctl->info_lck);
2090                 LogwrtResult = xlogctl->LogwrtResult;
2091                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
2092                 SpinLockRelease(&xlogctl->info_lck);
2093         }
2094
2095         /* back off to last completed page boundary */
2096         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2097
2098         /* if we have already flushed that far, consider async commit records */
2099         if (WriteRqstPtr <= LogwrtResult.Flush)
2100         {
2101                 /* use volatile pointer to prevent code rearrangement */
2102                 volatile XLogCtlData *xlogctl = XLogCtl;
2103
2104                 SpinLockAcquire(&xlogctl->info_lck);
2105                 WriteRqstPtr = xlogctl->asyncXactLSN;
2106                 SpinLockRelease(&xlogctl->info_lck);
2107                 flexible = false;               /* ensure it all gets written */
2108         }
2109
2110         /*
2111          * If already known flushed, we're done. Just need to check if we are
2112          * holding an open file handle to a logfile that's no longer in use,
2113          * preventing the file from being deleted.
2114          */
2115         if (WriteRqstPtr <= LogwrtResult.Flush)
2116         {
2117                 if (openLogFile >= 0)
2118                 {
2119                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2120                         {
2121                                 XLogFileClose();
2122                         }
2123                 }
2124                 return false;
2125         }
2126
2127 #ifdef WAL_DEBUG
2128         if (XLOG_DEBUG)
2129                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
2130                          (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
2131                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2132                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2133 #endif
2134
2135         START_CRIT_SECTION();
2136
2137         /* now wait for the write lock */
2138         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2139         LogwrtResult = XLogCtl->LogwrtResult;
2140         if (WriteRqstPtr > LogwrtResult.Flush)
2141         {
2142                 XLogwrtRqst WriteRqst;
2143
2144                 WriteRqst.Write = WriteRqstPtr;
2145                 WriteRqst.Flush = WriteRqstPtr;
2146                 XLogWrite(WriteRqst, flexible, false);
2147                 wrote_something = true;
2148         }
2149         LWLockRelease(WALWriteLock);
2150
2151         END_CRIT_SECTION();
2152
2153         /* wake up walsenders now that we've released heavily contended locks */
2154         WalSndWakeupProcessRequests();
2155
2156         return wrote_something;
2157 }
2158
2159 /*
2160  * Test whether XLOG data has been flushed up to (at least) the given position.
2161  *
2162  * Returns true if a flush is still needed.  (It may be that someone else
2163  * is already in process of flushing that far, however.)
2164  */
2165 bool
2166 XLogNeedsFlush(XLogRecPtr record)
2167 {
2168         /*
2169          * During recovery, we don't flush WAL but update minRecoveryPoint
2170          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
2171          * would need to be updated.
2172          */
2173         if (RecoveryInProgress())
2174         {
2175                 /* Quick exit if already known updated */
2176                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
2177                         return false;
2178
2179                 /*
2180                  * Update local copy of minRecoveryPoint. But if the lock is busy,
2181                  * just return a conservative guess.
2182                  */
2183                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
2184                         return true;
2185                 minRecoveryPoint = ControlFile->minRecoveryPoint;
2186                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2187                 LWLockRelease(ControlFileLock);
2188
2189                 /*
2190                  * An invalid minRecoveryPoint means that we need to recover all the
2191                  * WAL, i.e., we're doing crash recovery.  We never modify the control
2192                  * file's value in that case, so we can short-circuit future checks
2193                  * here too.
2194                  */
2195                 if (minRecoveryPoint == 0)
2196                         updateMinRecoveryPoint = false;
2197
2198                 /* check again */
2199                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
2200                         return false;
2201                 else
2202                         return true;
2203         }
2204
2205         /* Quick exit if already known flushed */
2206         if (record <= LogwrtResult.Flush)
2207                 return false;
2208
2209         /* read LogwrtResult and update local state */
2210         {
2211                 /* use volatile pointer to prevent code rearrangement */
2212                 volatile XLogCtlData *xlogctl = XLogCtl;
2213
2214                 SpinLockAcquire(&xlogctl->info_lck);
2215                 LogwrtResult = xlogctl->LogwrtResult;
2216                 SpinLockRelease(&xlogctl->info_lck);
2217         }
2218
2219         /* check again */
2220         if (record <= LogwrtResult.Flush)
2221                 return false;
2222
2223         return true;
2224 }
2225
2226 /*
2227  * Create a new XLOG file segment, or open a pre-existing one.
2228  *
2229  * log, seg: identify segment to be created/opened.
2230  *
2231  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
2232  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
2233  * file was used.
2234  *
2235  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2236  * place.  This should be TRUE except during bootstrap log creation.  The
2237  * caller must *not* hold the lock at call.
2238  *
2239  * Returns FD of opened file.
2240  *
2241  * Note: errors here are ERROR not PANIC because we might or might not be
2242  * inside a critical section (eg, during checkpoint there is no reason to
2243  * take down the system on failure).  They will promote to PANIC if we are
2244  * in a critical section.
2245  */
2246 int
2247 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
2248 {
2249         char            path[MAXPGPATH];
2250         char            tmppath[MAXPGPATH];
2251         char       *zbuffer;
2252         XLogSegNo       installed_segno;
2253         int                     max_advance;
2254         int                     fd;
2255         int                     nbytes;
2256
2257         XLogFilePath(path, ThisTimeLineID, logsegno);
2258
2259         /*
2260          * Try to use existent file (checkpoint maker may have created it already)
2261          */
2262         if (*use_existent)
2263         {
2264                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2265                                                    S_IRUSR | S_IWUSR);
2266                 if (fd < 0)
2267                 {
2268                         if (errno != ENOENT)
2269                                 ereport(ERROR,
2270                                                 (errcode_for_file_access(),
2271                                                  errmsg("could not open file \"%s\": %m", path)));
2272                 }
2273                 else
2274                         return fd;
2275         }
2276
2277         /*
2278          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
2279          * another process is doing the same thing.  If so, we will end up
2280          * pre-creating an extra log segment.  That seems OK, and better than
2281          * holding the lock throughout this lengthy process.
2282          */
2283         elog(DEBUG2, "creating and filling new WAL file");
2284
2285         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2286
2287         unlink(tmppath);
2288
2289         /*
2290          * Allocate a buffer full of zeros. This is done before opening the file
2291          * so that we don't leak the file descriptor if palloc fails.
2292          *
2293          * Note: palloc zbuffer, instead of just using a local char array, to
2294          * ensure it is reasonably well-aligned; this may save a few cycles
2295          * transferring data to the kernel.
2296          */
2297         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
2298
2299         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2300         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2301                                            S_IRUSR | S_IWUSR);
2302         if (fd < 0)
2303                 ereport(ERROR,
2304                                 (errcode_for_file_access(),
2305                                  errmsg("could not create file \"%s\": %m", tmppath)));
2306
2307         /*
2308          * Zero-fill the file.  We have to do this the hard way to ensure that all
2309          * the file space has really been allocated --- on platforms that allow
2310          * "holes" in files, just seeking to the end doesn't allocate intermediate
2311          * space.  This way, we know that we have all the space and (after the
2312          * fsync below) that all the indirect blocks are down on disk.  Therefore,
2313          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
2314          * log file.
2315          */
2316         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2317         {
2318                 errno = 0;
2319                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
2320                 {
2321                         int                     save_errno = errno;
2322
2323                         /*
2324                          * If we fail to make the file, delete it to release disk space
2325                          */
2326                         unlink(tmppath);
2327
2328                         close(fd);
2329
2330                         /* if write didn't set errno, assume problem is no disk space */
2331                         errno = save_errno ? save_errno : ENOSPC;
2332
2333                         ereport(ERROR,
2334                                         (errcode_for_file_access(),
2335                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2336                 }
2337         }
2338         pfree(zbuffer);
2339
2340         if (pg_fsync(fd) != 0)
2341         {
2342                 close(fd);
2343                 ereport(ERROR,
2344                                 (errcode_for_file_access(),
2345                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2346         }
2347
2348         if (close(fd))
2349                 ereport(ERROR,
2350                                 (errcode_for_file_access(),
2351                                  errmsg("could not close file \"%s\": %m", tmppath)));
2352
2353         /*
2354          * Now move the segment into place with its final name.
2355          *
2356          * If caller didn't want to use a pre-existing file, get rid of any
2357          * pre-existing file.  Otherwise, cope with possibility that someone else
2358          * has created the file while we were filling ours: if so, use ours to
2359          * pre-create a future log segment.
2360          */
2361         installed_segno = logsegno;
2362         max_advance = XLOGfileslop;
2363         if (!InstallXLogFileSegment(&installed_segno, tmppath,
2364                                                                 *use_existent, &max_advance,
2365                                                                 use_lock))
2366         {
2367                 /*
2368                  * No need for any more future segments, or InstallXLogFileSegment()
2369                  * failed to rename the file into place. If the rename failed, opening
2370                  * the file below will fail.
2371                  */
2372                 unlink(tmppath);
2373         }
2374
2375         /* Set flag to tell caller there was no existent file */
2376         *use_existent = false;
2377
2378         /* Now open original target segment (might not be file I just made) */
2379         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2380                                            S_IRUSR | S_IWUSR);
2381         if (fd < 0)
2382                 ereport(ERROR,
2383                                 (errcode_for_file_access(),
2384                                  errmsg("could not open file \"%s\": %m", path)));
2385
2386         elog(DEBUG2, "done creating and filling new WAL file");
2387
2388         return fd;
2389 }
2390
2391 /*
2392  * Create a new XLOG file segment by copying a pre-existing one.
2393  *
2394  * destsegno: identify segment to be created.
2395  *
2396  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
2397  *              a different timeline)
2398  *
2399  * Currently this is only used during recovery, and so there are no locking
2400  * considerations.      But we should be just as tense as XLogFileInit to avoid
2401  * emplacing a bogus file.
2402  */
2403 static void
2404 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
2405 {
2406         char            path[MAXPGPATH];
2407         char            tmppath[MAXPGPATH];
2408         char            buffer[XLOG_BLCKSZ];
2409         int                     srcfd;
2410         int                     fd;
2411         int                     nbytes;
2412
2413         /*
2414          * Open the source file
2415          */
2416         XLogFilePath(path, srcTLI, srcsegno);
2417         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
2418         if (srcfd < 0)
2419                 ereport(ERROR,
2420                                 (errcode_for_file_access(),
2421                                  errmsg("could not open file \"%s\": %m", path)));
2422
2423         /*
2424          * Copy into a temp file name.
2425          */
2426         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2427
2428         unlink(tmppath);
2429
2430         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2431         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2432                                                    S_IRUSR | S_IWUSR);
2433         if (fd < 0)
2434                 ereport(ERROR,
2435                                 (errcode_for_file_access(),
2436                                  errmsg("could not create file \"%s\": %m", tmppath)));
2437
2438         /*
2439          * Do the data copying.
2440          */
2441         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
2442         {
2443                 errno = 0;
2444                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2445                 {
2446                         if (errno != 0)
2447                                 ereport(ERROR,
2448                                                 (errcode_for_file_access(),
2449                                                  errmsg("could not read file \"%s\": %m", path)));
2450                         else
2451                                 ereport(ERROR,
2452                                                 (errmsg("not enough data in file \"%s\"", path)));
2453                 }
2454                 errno = 0;
2455                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2456                 {
2457                         int                     save_errno = errno;
2458
2459                         /*
2460                          * If we fail to make the file, delete it to release disk space
2461                          */
2462                         unlink(tmppath);
2463                         /* if write didn't set errno, assume problem is no disk space */
2464                         errno = save_errno ? save_errno : ENOSPC;
2465
2466                         ereport(ERROR,
2467                                         (errcode_for_file_access(),
2468                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2469                 }
2470         }
2471
2472         if (pg_fsync(fd) != 0)
2473                 ereport(ERROR,
2474                                 (errcode_for_file_access(),
2475                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2476
2477         if (CloseTransientFile(fd))
2478                 ereport(ERROR,
2479                                 (errcode_for_file_access(),
2480                                  errmsg("could not close file \"%s\": %m", tmppath)));
2481
2482         CloseTransientFile(srcfd);
2483
2484         /*
2485          * Now move the segment into place with its final name.
2486          */
2487         if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
2488                 elog(ERROR, "InstallXLogFileSegment should not have failed");
2489 }
2490
2491 /*
2492  * Install a new XLOG segment file as a current or future log segment.
2493  *
2494  * This is used both to install a newly-created segment (which has a temp
2495  * filename while it's being created) and to recycle an old segment.
2496  *
2497  * *segno: identify segment to install as (or first possible target).
2498  * When find_free is TRUE, this is modified on return to indicate the
2499  * actual installation location or last segment searched.
2500  *
2501  * tmppath: initial name of file to install.  It will be renamed into place.
2502  *
2503  * find_free: if TRUE, install the new segment at the first empty segno
2504  * number at or after the passed numbers.  If FALSE, install the new segment
2505  * exactly where specified, deleting any existing segment file there.
2506  *
2507  * *max_advance: maximum number of segno slots to advance past the starting
2508  * point.  Fail if no free slot is found in this range.  On return, reduced
2509  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
2510  * when find_free is FALSE.)
2511  *
2512  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2513  * place.  This should be TRUE except during bootstrap log creation.  The
2514  * caller must *not* hold the lock at call.
2515  *
2516  * Returns TRUE if the file was installed successfully.  FALSE indicates that
2517  * max_advance limit was exceeded, or an error occurred while renaming the
2518  * file into place.
2519  */
2520 static bool
2521 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
2522                                            bool find_free, int *max_advance,
2523                                            bool use_lock)
2524 {
2525         char            path[MAXPGPATH];
2526         struct stat stat_buf;
2527
2528         XLogFilePath(path, ThisTimeLineID, *segno);
2529
2530         /*
2531          * We want to be sure that only one process does this at a time.
2532          */
2533         if (use_lock)
2534                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2535
2536         if (!find_free)
2537         {
2538                 /* Force installation: get rid of any pre-existing segment file */
2539                 unlink(path);
2540         }
2541         else
2542         {
2543                 /* Find a free slot to put it in */
2544                 while (stat(path, &stat_buf) == 0)
2545                 {
2546                         if (*max_advance <= 0)
2547                         {
2548                                 /* Failed to find a free slot within specified range */
2549                                 if (use_lock)
2550                                         LWLockRelease(ControlFileLock);
2551                                 return false;
2552                         }
2553                         (*segno)++;
2554                         (*max_advance)--;
2555                         XLogFilePath(path, ThisTimeLineID, *segno);
2556                 }
2557         }
2558
2559         /*
2560          * Prefer link() to rename() here just to be really sure that we don't
2561          * overwrite an existing logfile.  However, there shouldn't be one, so
2562          * rename() is an acceptable substitute except for the truly paranoid.
2563          */
2564 #if HAVE_WORKING_LINK
2565         if (link(tmppath, path) < 0)
2566         {
2567                 if (use_lock)
2568                         LWLockRelease(ControlFileLock);
2569                 ereport(LOG,
2570                                 (errcode_for_file_access(),
2571                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
2572                                                 tmppath, path)));
2573                 return false;
2574         }
2575         unlink(tmppath);
2576 #else
2577         if (rename(tmppath, path) < 0)
2578         {
2579                 if (use_lock)
2580                         LWLockRelease(ControlFileLock);
2581                 ereport(LOG,
2582                                 (errcode_for_file_access(),
2583                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
2584                                                 tmppath, path)));
2585                 return false;
2586         }
2587 #endif
2588
2589         if (use_lock)
2590                 LWLockRelease(ControlFileLock);
2591
2592         return true;
2593 }
2594
2595 /*
2596  * Open a pre-existing logfile segment for writing.
2597  */
2598 int
2599 XLogFileOpen(XLogSegNo segno)
2600 {
2601         char            path[MAXPGPATH];
2602         int                     fd;
2603
2604         XLogFilePath(path, ThisTimeLineID, segno);
2605
2606         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2607                                            S_IRUSR | S_IWUSR);
2608         if (fd < 0)
2609                 ereport(PANIC,
2610                                 (errcode_for_file_access(),
2611                                  errmsg("could not open xlog file \"%s\": %m", path)));
2612
2613         return fd;
2614 }
2615
2616 /*
2617  * Open a logfile segment for reading (during recovery).
2618  *
2619  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
2620  * Otherwise, it's assumed to be already available in pg_xlog.
2621  */
2622 static int
2623 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
2624                          int source, bool notfoundOk)
2625 {
2626         char            xlogfname[MAXFNAMELEN];
2627         char            activitymsg[MAXFNAMELEN + 16];
2628         char            path[MAXPGPATH];
2629         int                     fd;
2630
2631         XLogFileName(xlogfname, tli, segno);
2632
2633         switch (source)
2634         {
2635                 case XLOG_FROM_ARCHIVE:
2636                         /* Report recovery progress in PS display */
2637                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
2638                                          xlogfname);
2639                         set_ps_display(activitymsg, false);
2640
2641                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
2642                                                                                                           "RECOVERYXLOG",
2643                                                                                                           XLogSegSize,
2644                                                                                                           InRedo);
2645                         if (!restoredFromArchive)
2646                                 return -1;
2647                         break;
2648
2649                 case XLOG_FROM_PG_XLOG:
2650                 case XLOG_FROM_STREAM:
2651                         XLogFilePath(path, tli, segno);
2652                         restoredFromArchive = false;
2653                         break;
2654
2655                 default:
2656                         elog(ERROR, "invalid XLogFileRead source %d", source);
2657         }
2658
2659         /*
2660          * If the segment was fetched from archival storage, replace the existing
2661          * xlog segment (if any) with the archival version.
2662          */
2663         if (source == XLOG_FROM_ARCHIVE)
2664         {
2665                 KeepFileRestoredFromArchive(path, xlogfname);
2666
2667                 /*
2668                  * Set path to point at the new file in pg_xlog.
2669                  */
2670                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
2671         }
2672
2673         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2674         if (fd >= 0)
2675         {
2676                 /* Success! */
2677                 curFileTLI = tli;
2678
2679                 /* Report recovery progress in PS display */
2680                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
2681                                  xlogfname);
2682                 set_ps_display(activitymsg, false);
2683
2684                 /* Track source of data in assorted state variables */
2685                 readSource = source;
2686                 XLogReceiptSource = source;
2687                 /* In FROM_STREAM case, caller tracks receipt time, not me */
2688                 if (source != XLOG_FROM_STREAM)
2689                         XLogReceiptTime = GetCurrentTimestamp();
2690
2691                 return fd;
2692         }
2693         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
2694                 ereport(PANIC,
2695                                 (errcode_for_file_access(),
2696                                  errmsg("could not open file \"%s\": %m", path)));
2697         return -1;
2698 }
2699
2700 /*
2701  * Open a logfile segment for reading (during recovery).
2702  *
2703  * This version searches for the segment with any TLI listed in expectedTLEs.
2704  */
2705 static int
2706 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
2707 {
2708         char            path[MAXPGPATH];
2709         ListCell   *cell;
2710         int                     fd;
2711         List       *tles;
2712
2713         /*
2714          * Loop looking for a suitable timeline ID: we might need to read any of
2715          * the timelines listed in expectedTLEs.
2716          *
2717          * We expect curFileTLI on entry to be the TLI of the preceding file in
2718          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
2719          * to go backwards; this prevents us from picking up the wrong file when a
2720          * parent timeline extends to higher segment numbers than the child we
2721          * want to read.
2722          *
2723          * If we haven't read the timeline history file yet, read it now, so that
2724          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
2725          * however, unless we actually find a valid segment.  That way if there is
2726          * neither a timeline history file nor a WAL segment in the archive, and
2727          * streaming replication is set up, we'll read the timeline history file
2728          * streamed from the master when we start streaming, instead of recovering
2729          * with a dummy history generated here.
2730          */
2731         if (expectedTLEs)
2732                 tles = expectedTLEs;
2733         else
2734                 tles = readTimeLineHistory(recoveryTargetTLI);
2735
2736         foreach(cell, tles)
2737         {
2738                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
2739
2740                 if (tli < curFileTLI)
2741                         break;                          /* don't bother looking at too-old TLIs */
2742
2743                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
2744                 {
2745                         fd = XLogFileRead(segno, emode, tli,
2746                                                           XLOG_FROM_ARCHIVE, true);
2747                         if (fd != -1)
2748                         {
2749                                 elog(DEBUG1, "got WAL segment from archive");
2750                                 if (!expectedTLEs)
2751                                         expectedTLEs = tles;
2752                                 return fd;
2753                         }
2754                 }
2755
2756                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
2757                 {
2758                         fd = XLogFileRead(segno, emode, tli,
2759                                                           XLOG_FROM_PG_XLOG, true);
2760                         if (fd != -1)
2761                         {
2762                                 if (!expectedTLEs)
2763                                         expectedTLEs = tles;
2764                                 return fd;
2765                         }
2766                 }
2767         }
2768
2769         /* Couldn't find it.  For simplicity, complain about front timeline */
2770         XLogFilePath(path, recoveryTargetTLI, segno);
2771         errno = ENOENT;
2772         ereport(emode,
2773                         (errcode_for_file_access(),
2774                          errmsg("could not open file \"%s\": %m", path)));
2775         return -1;
2776 }
2777
2778 /*
2779  * Close the current logfile segment for writing.
2780  */
2781 static void
2782 XLogFileClose(void)
2783 {
2784         Assert(openLogFile >= 0);
2785
2786         /*
2787          * WAL segment files will not be re-read in normal operation, so we advise
2788          * the OS to release any cached pages.  But do not do so if WAL archiving
2789          * or streaming is active, because archiver and walsender process could
2790          * use the cache to read the WAL segment.
2791          */
2792 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2793         if (!XLogIsNeeded())
2794                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2795 #endif
2796
2797         if (close(openLogFile))
2798                 ereport(PANIC,
2799                                 (errcode_for_file_access(),
2800                                  errmsg("could not close log file %s: %m",
2801                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
2802         openLogFile = -1;
2803 }
2804
2805 /*
2806  * Preallocate log files beyond the specified log endpoint.
2807  *
2808  * XXX this is currently extremely conservative, since it forces only one
2809  * future log segment to exist, and even that only if we are 75% done with
2810  * the current one.  This is only appropriate for very low-WAL-volume systems.
2811  * High-volume systems will be OK once they've built up a sufficient set of
2812  * recycled log segments, but the startup transient is likely to include
2813  * a lot of segment creations by foreground processes, which is not so good.
2814  */
2815 static void
2816 PreallocXlogFiles(XLogRecPtr endptr)
2817 {
2818         XLogSegNo       _logSegNo;
2819         int                     lf;
2820         bool            use_existent;
2821
2822         XLByteToPrevSeg(endptr, _logSegNo);
2823         if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
2824         {
2825                 _logSegNo++;
2826                 use_existent = true;
2827                 lf = XLogFileInit(_logSegNo, &use_existent, true);
2828                 close(lf);
2829                 if (!use_existent)
2830                         CheckpointStats.ckpt_segs_added++;
2831         }
2832 }
2833
2834 /*
2835  * Throws an error if the given log segment has already been removed or
2836  * recycled. The caller should only pass a segment that it knows to have
2837  * existed while the server has been running, as this function always
2838  * succeeds if no WAL segments have been removed since startup.
2839  * 'tli' is only used in the error message.
2840  */
2841 void
2842 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
2843 {
2844         /* use volatile pointer to prevent code rearrangement */
2845         volatile XLogCtlData *xlogctl = XLogCtl;
2846         XLogSegNo       lastRemovedSegNo;
2847
2848         SpinLockAcquire(&xlogctl->info_lck);
2849         lastRemovedSegNo = xlogctl->lastRemovedSegNo;
2850         SpinLockRelease(&xlogctl->info_lck);
2851
2852         if (segno <= lastRemovedSegNo)
2853         {
2854                 char            filename[MAXFNAMELEN];
2855
2856                 XLogFileName(filename, tli, segno);
2857                 ereport(ERROR,
2858                                 (errcode_for_file_access(),
2859                                  errmsg("requested WAL segment %s has already been removed",
2860                                                 filename)));
2861         }
2862 }
2863
2864 /*
2865  * Update the last removed segno pointer in shared memory, to reflect
2866  * that the given XLOG file has been removed.
2867  */
2868 static void
2869 UpdateLastRemovedPtr(char *filename)
2870 {
2871         /* use volatile pointer to prevent code rearrangement */
2872         volatile XLogCtlData *xlogctl = XLogCtl;
2873         uint32          tli;
2874         XLogSegNo       segno;
2875
2876         XLogFromFileName(filename, &tli, &segno);
2877
2878         SpinLockAcquire(&xlogctl->info_lck);
2879         if (segno > xlogctl->lastRemovedSegNo)
2880                 xlogctl->lastRemovedSegNo = segno;
2881         SpinLockRelease(&xlogctl->info_lck);
2882 }
2883
2884 /*
2885  * Recycle or remove all log files older or equal to passed segno
2886  *
2887  * endptr is current (or recent) end of xlog; this is used to determine
2888  * whether we want to recycle rather than delete no-longer-wanted log files.
2889  */
2890 static void
2891 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
2892 {
2893         XLogSegNo       endlogSegNo;
2894         int                     max_advance;
2895         DIR                *xldir;
2896         struct dirent *xlde;
2897         char            lastoff[MAXFNAMELEN];
2898         char            path[MAXPGPATH];
2899
2900 #ifdef WIN32
2901         char            newpath[MAXPGPATH];
2902 #endif
2903         struct stat statbuf;
2904
2905         /*
2906          * Initialize info about where to try to recycle to.  We allow recycling
2907          * segments up to XLOGfileslop segments beyond the current XLOG location.
2908          */
2909         XLByteToPrevSeg(endptr, endlogSegNo);
2910         max_advance = XLOGfileslop;
2911
2912         xldir = AllocateDir(XLOGDIR);
2913         if (xldir == NULL)
2914                 ereport(ERROR,
2915                                 (errcode_for_file_access(),
2916                                  errmsg("could not open transaction log directory \"%s\": %m",
2917                                                 XLOGDIR)));
2918
2919         /*
2920          * Construct a filename of the last segment to be kept. The timeline ID
2921          * doesn't matter, we ignore that in the comparison. (During recovery,
2922          * ThisTimeLineID isn't set, so we can't use that.)
2923          */
2924         XLogFileName(lastoff, 0, segno);
2925
2926         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
2927                  lastoff);
2928
2929         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
2930         {
2931                 /*
2932                  * We ignore the timeline part of the XLOG segment identifiers in
2933                  * deciding whether a segment is still needed.  This ensures that we
2934                  * won't prematurely remove a segment from a parent timeline. We could
2935                  * probably be a little more proactive about removing segments of
2936                  * non-parent timelines, but that would be a whole lot more
2937                  * complicated.
2938                  *
2939                  * We use the alphanumeric sorting property of the filenames to decide
2940                  * which ones are earlier than the lastoff segment.
2941                  */
2942                 if (strlen(xlde->d_name) == 24 &&
2943                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
2944                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
2945                 {
2946                         if (XLogArchiveCheckDone(xlde->d_name))
2947                         {
2948                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
2949
2950                                 /* Update the last removed location in shared memory first */
2951                                 UpdateLastRemovedPtr(xlde->d_name);
2952
2953                                 /*
2954                                  * Before deleting the file, see if it can be recycled as a
2955                                  * future log segment. Only recycle normal files, pg_standby
2956                                  * for example can create symbolic links pointing to a
2957                                  * separate archive directory.
2958                                  */
2959                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
2960                                         InstallXLogFileSegment(&endlogSegNo, path,
2961                                                                                    true, &max_advance, true))
2962                                 {
2963                                         ereport(DEBUG2,
2964                                                         (errmsg("recycled transaction log file \"%s\"",
2965                                                                         xlde->d_name)));
2966                                         CheckpointStats.ckpt_segs_recycled++;
2967                                         /* Needn't recheck that slot on future iterations */
2968                                         if (max_advance > 0)
2969                                         {
2970                                                 endlogSegNo++;
2971                                                 max_advance--;
2972                                         }
2973                                 }
2974                                 else
2975                                 {
2976                                         /* No need for any more future segments... */
2977                                         int                     rc;
2978
2979                                         ereport(DEBUG2,
2980                                                         (errmsg("removing transaction log file \"%s\"",
2981                                                                         xlde->d_name)));
2982
2983 #ifdef WIN32
2984
2985                                         /*
2986                                          * On Windows, if another process (e.g another backend)
2987                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
2988                                          * will succeed, but the file will still show up in
2989                                          * directory listing until the last handle is closed. To
2990                                          * avoid confusing the lingering deleted file for a live
2991                                          * WAL file that needs to be archived, rename it before
2992                                          * deleting it.
2993                                          *
2994                                          * If another process holds the file open without
2995                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
2996                                          * again at the next checkpoint.
2997                                          */
2998                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
2999                                         if (rename(path, newpath) != 0)
3000                                         {
3001                                                 ereport(LOG,
3002                                                                 (errcode_for_file_access(),
3003                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
3004                                                                                 path)));
3005                                                 continue;
3006                                         }
3007                                         rc = unlink(newpath);
3008 #else
3009                                         rc = unlink(path);
3010 #endif
3011                                         if (rc != 0)
3012                                         {
3013                                                 ereport(LOG,
3014                                                                 (errcode_for_file_access(),
3015                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
3016                                                                                 path)));
3017                                                 continue;
3018                                         }
3019                                         CheckpointStats.ckpt_segs_removed++;
3020                                 }
3021
3022                                 XLogArchiveCleanup(xlde->d_name);
3023                         }
3024                 }
3025         }
3026
3027         FreeDir(xldir);
3028 }
3029
3030 /*
3031  * Verify whether pg_xlog and pg_xlog/archive_status exist.
3032  * If the latter does not exist, recreate it.
3033  *
3034  * It is not the goal of this function to verify the contents of these
3035  * directories, but to help in cases where someone has performed a cluster
3036  * copy for PITR purposes but omitted pg_xlog from the copy.
3037  *
3038  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
3039  * policy decision was made not to.  It is fairly common for pg_xlog to be
3040  * a symlink, and if that was the DBA's intent then automatically making a
3041  * plain directory would result in degraded performance with no notice.
3042  */
3043 static void
3044 ValidateXLOGDirectoryStructure(void)
3045 {
3046         char            path[MAXPGPATH];
3047         struct stat stat_buf;
3048
3049         /* Check for pg_xlog; if it doesn't exist, error out */
3050         if (stat(XLOGDIR, &stat_buf) != 0 ||
3051                 !S_ISDIR(stat_buf.st_mode))
3052                 ereport(FATAL,
3053                                 (errmsg("required WAL directory \"%s\" does not exist",
3054                                                 XLOGDIR)));
3055
3056         /* Check for archive_status */
3057         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
3058         if (stat(path, &stat_buf) == 0)
3059         {
3060                 /* Check for weird cases where it exists but isn't a directory */
3061                 if (!S_ISDIR(stat_buf.st_mode))
3062                         ereport(FATAL,
3063                                         (errmsg("required WAL directory \"%s\" does not exist",
3064                                                         path)));
3065         }
3066         else
3067         {
3068                 ereport(LOG,
3069                                 (errmsg("creating missing WAL directory \"%s\"", path)));
3070                 if (mkdir(path, S_IRWXU) < 0)
3071                         ereport(FATAL,
3072                                         (errmsg("could not create missing directory \"%s\": %m",
3073                                                         path)));
3074         }
3075 }
3076
3077 /*
3078  * Remove previous backup history files.  This also retries creation of
3079  * .ready files for any backup history files for which XLogArchiveNotify
3080  * failed earlier.
3081  */
3082 static void
3083 CleanupBackupHistory(void)
3084 {
3085         DIR                *xldir;
3086         struct dirent *xlde;
3087         char            path[MAXPGPATH];
3088
3089         xldir = AllocateDir(XLOGDIR);
3090         if (xldir == NULL)
3091                 ereport(ERROR,
3092                                 (errcode_for_file_access(),
3093                                  errmsg("could not open transaction log directory \"%s\": %m",
3094                                                 XLOGDIR)));
3095
3096         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3097         {
3098                 if (strlen(xlde->d_name) > 24 &&
3099                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3100                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
3101                                    ".backup") == 0)
3102                 {
3103                         if (XLogArchiveCheckDone(xlde->d_name))
3104                         {
3105                                 ereport(DEBUG2,
3106                                 (errmsg("removing transaction log backup history file \"%s\"",
3107                                                 xlde->d_name)));
3108                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3109                                 unlink(path);
3110                                 XLogArchiveCleanup(xlde->d_name);
3111                         }
3112                 }
3113         }
3114
3115         FreeDir(xldir);
3116 }
3117
3118 /*
3119  * Restore a full-page image from a backup block attached to an XLOG record.
3120  *
3121  * lsn: LSN of the XLOG record being replayed
3122  * record: the complete XLOG record
3123  * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1)
3124  * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock
3125  * keep_buffer: TRUE to return the buffer still locked and pinned
3126  *
3127  * Returns the buffer number containing the page.  Note this is not terribly
3128  * useful unless keep_buffer is specified as TRUE.
3129  *
3130  * Note: when a backup block is available in XLOG, we restore it
3131  * unconditionally, even if the page in the database appears newer.
3132  * This is to protect ourselves against database pages that were partially
3133  * or incorrectly written during a crash.  We assume that the XLOG data
3134  * must be good because it has passed a CRC check, while the database
3135  * page might not be.  This will force us to replay all subsequent
3136  * modifications of the page that appear in XLOG, rather than possibly
3137  * ignoring them as already applied, but that's not a huge drawback.
3138  *
3139  * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer,
3140  * else a normal exclusive lock is used.  During crash recovery, that's just
3141  * pro forma because there can't be any regular backends in the system, but
3142  * in hot standby mode the distinction is important.
3143  *
3144  * If 'keep_buffer' is true, return without releasing the buffer lock and pin;
3145  * then caller is responsible for doing UnlockReleaseBuffer() later.  This
3146  * is needed in some cases when replaying XLOG records that touch multiple
3147  * pages, to prevent inconsistent states from being visible to other backends.
3148  * (Again, that's only important in hot standby mode.)
3149  */
3150 Buffer
3151 RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
3152                                    bool get_cleanup_lock, bool keep_buffer)
3153 {
3154         BkpBlock        bkpb;
3155         char       *blk;
3156         int                     i;
3157
3158         /* Locate requested BkpBlock in the record */
3159         blk = (char *) XLogRecGetData(record) + record->xl_len;
3160         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3161         {
3162                 if (!(record->xl_info & XLR_BKP_BLOCK(i)))
3163                         continue;
3164
3165                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3166                 blk += sizeof(BkpBlock);
3167
3168                 if (i == block_index)
3169                 {
3170                         /* Found it, apply the update */
3171                         return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock,
3172                                                                                           keep_buffer);
3173                 }
3174
3175                 blk += BLCKSZ - bkpb.hole_length;
3176         }
3177
3178         /* Caller specified a bogus block_index */
3179         elog(ERROR, "failed to restore block_index %d", block_index);
3180         return InvalidBuffer;           /* keep compiler quiet */
3181 }
3182
3183 /*
3184  * Workhorse for RestoreBackupBlock usable without an xlog record
3185  *
3186  * Restores a full-page image from BkpBlock and a data pointer.
3187  */
3188 static Buffer
3189 RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
3190                                                    bool get_cleanup_lock, bool keep_buffer)
3191 {
3192         Buffer          buffer;
3193         Page            page;
3194
3195         buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
3196                                                                         RBM_ZERO);
3197         Assert(BufferIsValid(buffer));
3198         if (get_cleanup_lock)
3199                 LockBufferForCleanup(buffer);
3200         else
3201                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3202
3203         page = (Page) BufferGetPage(buffer);
3204
3205         if (bkpb.hole_length == 0)
3206         {
3207                 memcpy((char *) page, blk, BLCKSZ);
3208         }
3209         else
3210         {
3211                 memcpy((char *) page, blk, bkpb.hole_offset);
3212                 /* must zero-fill the hole */
3213                 MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
3214                 memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
3215                            blk + bkpb.hole_offset,
3216                            BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
3217         }
3218
3219         /*
3220          * The checksum value on this page is currently invalid. We don't need to
3221          * reset it here since it will be set before being written.
3222          */
3223
3224         PageSetLSN(page, lsn);
3225         MarkBufferDirty(buffer);
3226
3227         if (!keep_buffer)
3228                 UnlockReleaseBuffer(buffer);
3229
3230         return buffer;
3231 }
3232
3233 /*
3234  * Attempt to read an XLOG record.
3235  *
3236  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
3237  * try to read a record just after the last one previously read.
3238  *
3239  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3240  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3241  * record is available.
3242  *
3243  * The record is copied into readRecordBuf, so that on successful return,
3244  * the returned record pointer always points there.
3245  */
3246 static XLogRecord *
3247 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
3248                    bool fetching_ckpt)
3249 {
3250         XLogRecord *record;
3251         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3252
3253         /* Pass through parameters to XLogPageRead */
3254         private->fetching_ckpt = fetching_ckpt;
3255         private->emode = emode;
3256         private->randAccess = (RecPtr != InvalidXLogRecPtr);
3257
3258         /* This is the first attempt to read this page. */
3259         lastSourceFailed = false;
3260
3261         for (;;)
3262         {
3263                 char       *errormsg;
3264
3265                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
3266                 ReadRecPtr = xlogreader->ReadRecPtr;
3267                 EndRecPtr = xlogreader->EndRecPtr;
3268                 if (record == NULL)
3269                 {
3270                         if (readFile >= 0)
3271                         {
3272                                 close(readFile);
3273                                 readFile = -1;
3274                         }
3275
3276                         /*
3277                          * We only end up here without a message when XLogPageRead()
3278                          * failed - in that case we already logged something. In
3279                          * StandbyMode that only happens if we have been triggered, so we
3280                          * shouldn't loop anymore in that case.
3281                          */
3282                         if (errormsg)
3283                                 ereport(emode_for_corrupt_record(emode,
3284                                                                                                  RecPtr ? RecPtr : EndRecPtr),
3285                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
3286                 }
3287
3288                 /*
3289                  * Check page TLI is one of the expected values.
3290                  */
3291                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3292                 {
3293                         char            fname[MAXFNAMELEN];
3294                         XLogSegNo       segno;
3295                         int32           offset;
3296
3297                         XLByteToSeg(xlogreader->latestPagePtr, segno);
3298                         offset = xlogreader->latestPagePtr % XLogSegSize;
3299                         XLogFileName(fname, xlogreader->readPageTLI, segno);
3300                         ereport(emode_for_corrupt_record(emode,
3301                                                                                          RecPtr ? RecPtr : EndRecPtr),
3302                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
3303                                         xlogreader->latestPageTLI,
3304                                         fname,
3305                                         offset)));
3306                         record = NULL;
3307                 }
3308
3309                 if (record)
3310                 {
3311                         /* Great, got a record */
3312                         return record;
3313                 }
3314                 else
3315                 {
3316                         /* No valid record available from this source */
3317                         lastSourceFailed = true;
3318
3319                         /*
3320                          * If archive recovery was requested, but we were still doing
3321                          * crash recovery, switch to archive recovery and retry using the
3322                          * offline archive. We have now replayed all the valid WAL in
3323                          * pg_xlog, so we are presumably now consistent.
3324                          *
3325                          * We require that there's at least some valid WAL present in
3326                          * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
3327                          * from the archive, even if pg_xlog is completely empty, but we'd
3328                          * have no idea how far we'd have to replay to reach consistency.
3329                          * So err on the safe side and give up.
3330                          */
3331                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3332                                 !fetching_ckpt)
3333                         {
3334                                 ereport(DEBUG1,
3335                                                 (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
3336                                 InArchiveRecovery = true;
3337                                 if (StandbyModeRequested)
3338                                         StandbyMode = true;
3339
3340                                 /* initialize minRecoveryPoint to this record */
3341                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3342                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
3343                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
3344                                 {
3345                                         ControlFile->minRecoveryPoint = EndRecPtr;
3346                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
3347                                 }
3348                                 /* update local copy */
3349                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3350                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3351
3352                                 UpdateControlFile();
3353                                 LWLockRelease(ControlFileLock);
3354
3355                                 CheckRecoveryConsistency();
3356
3357                                 /*
3358                                  * Before we retry, reset lastSourceFailed and currentSource
3359                                  * so that we will check the archive next.
3360                                  */
3361                                 lastSourceFailed = false;
3362                                 currentSource = 0;
3363
3364                                 continue;
3365                         }
3366
3367                         /* In standby mode, loop back to retry. Otherwise, give up. */
3368                         if (StandbyMode && !CheckForStandbyTrigger())
3369                                 continue;
3370                         else
3371                                 return NULL;
3372                 }
3373         }
3374 }
3375
3376 /*
3377  * Scan for new timelines that might have appeared in the archive since we
3378  * started recovery.
3379  *
3380  * If there are any, the function changes recovery target TLI to the latest
3381  * one and returns 'true'.
3382  */
3383 static bool
3384 rescanLatestTimeLine(void)
3385 {
3386         List       *newExpectedTLEs;
3387         bool            found;
3388         ListCell   *cell;
3389         TimeLineID      newtarget;
3390         TimeLineID      oldtarget = recoveryTargetTLI;
3391         TimeLineHistoryEntry *currentTle = NULL;
3392
3393         newtarget = findNewestTimeLine(recoveryTargetTLI);
3394         if (newtarget == recoveryTargetTLI)
3395         {
3396                 /* No new timelines found */
3397                 return false;
3398         }
3399
3400         /*
3401          * Determine the list of expected TLIs for the new TLI
3402          */
3403
3404         newExpectedTLEs = readTimeLineHistory(newtarget);
3405
3406         /*
3407          * If the current timeline is not part of the history of the new timeline,
3408          * we cannot proceed to it.
3409          */
3410         found = false;
3411         foreach(cell, newExpectedTLEs)
3412         {
3413                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
3414
3415                 if (currentTle->tli == recoveryTargetTLI)
3416                 {
3417                         found = true;
3418                         break;
3419                 }
3420         }
3421         if (!found)
3422         {
3423                 ereport(LOG,
3424                                 (errmsg("new timeline %u is not a child of database system timeline %u",
3425                                                 newtarget,
3426                                                 ThisTimeLineID)));
3427                 return false;
3428         }
3429
3430         /*
3431          * The current timeline was found in the history file, but check that the
3432          * next timeline was forked off from it *after* the current recovery
3433          * location.
3434          */
3435         if (currentTle->end < EndRecPtr)
3436         {
3437                 ereport(LOG,
3438                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
3439                                                 newtarget,
3440                                                 ThisTimeLineID,
3441                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
3442                 return false;
3443         }
3444
3445         /* The new timeline history seems valid. Switch target */
3446         recoveryTargetTLI = newtarget;
3447         list_free_deep(expectedTLEs);
3448         expectedTLEs = newExpectedTLEs;
3449
3450         /*
3451          * As in StartupXLOG(), try to ensure we have all the history files
3452          * between the old target and new target in pg_xlog.
3453          */
3454         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
3455
3456         ereport(LOG,
3457                         (errmsg("new target timeline is %u",
3458                                         recoveryTargetTLI)));
3459
3460         return true;
3461 }
3462
3463 /*
3464  * I/O routines for pg_control
3465  *
3466  * *ControlFile is a buffer in shared memory that holds an image of the
3467  * contents of pg_control.      WriteControlFile() initializes pg_control
3468  * given a preloaded buffer, ReadControlFile() loads the buffer from
3469  * the pg_control file (during postmaster or standalone-backend startup),
3470  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
3471  *
3472  * For simplicity, WriteControlFile() initializes the fields of pg_control
3473  * that are related to checking backend/database compatibility, and
3474  * ReadControlFile() verifies they are correct.  We could split out the
3475  * I/O and compatibility-check functions, but there seems no need currently.
3476  */
3477 static void
3478 WriteControlFile(void)
3479 {
3480         int                     fd;
3481         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
3482
3483         /*
3484          * Initialize version and compatibility-check fields
3485          */
3486         ControlFile->pg_control_version = PG_CONTROL_VERSION;
3487         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
3488
3489         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
3490         ControlFile->floatFormat = FLOATFORMAT_VALUE;
3491
3492         ControlFile->blcksz = BLCKSZ;
3493         ControlFile->relseg_size = RELSEG_SIZE;
3494         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
3495         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
3496
3497         ControlFile->nameDataLen = NAMEDATALEN;
3498         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
3499
3500         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
3501
3502 #ifdef HAVE_INT64_TIMESTAMP
3503         ControlFile->enableIntTimes = true;
3504 #else
3505         ControlFile->enableIntTimes = false;
3506 #endif
3507         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
3508         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
3509
3510         /* Contents are protected with a CRC */
3511         INIT_CRC32(ControlFile->crc);
3512         COMP_CRC32(ControlFile->crc,
3513                            (char *) ControlFile,
3514                            offsetof(ControlFileData, crc));
3515         FIN_CRC32(ControlFile->crc);
3516
3517         /*
3518          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
3519          * excess over sizeof(ControlFileData).  This reduces the odds of
3520          * premature-EOF errors when reading pg_control.  We'll still fail when we
3521          * check the contents of the file, but hopefully with a more specific
3522          * error than "couldn't read pg_control".
3523          */
3524         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
3525                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
3526
3527         memset(buffer, 0, PG_CONTROL_SIZE);
3528         memcpy(buffer, ControlFile, sizeof(ControlFileData));
3529
3530         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3531                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3532                                            S_IRUSR | S_IWUSR);
3533         if (fd < 0)
3534                 ereport(PANIC,
3535                                 (errcode_for_file_access(),
3536                                  errmsg("could not create control file \"%s\": %m",
3537                                                 XLOG_CONTROL_FILE)));
3538
3539         errno = 0;
3540         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
3541         {
3542                 /* if write didn't set errno, assume problem is no disk space */
3543                 if (errno == 0)
3544                         errno = ENOSPC;
3545                 ereport(PANIC,
3546                                 (errcode_for_file_access(),
3547                                  errmsg("could not write to control file: %m")));
3548         }
3549
3550         if (pg_fsync(fd) != 0)
3551                 ereport(PANIC,
3552                                 (errcode_for_file_access(),
3553                                  errmsg("could not fsync control file: %m")));
3554
3555         if (close(fd))
3556                 ereport(PANIC,
3557                                 (errcode_for_file_access(),
3558                                  errmsg("could not close control file: %m")));
3559 }
3560
3561 static void
3562 ReadControlFile(void)
3563 {
3564         pg_crc32        crc;
3565         int                     fd;
3566
3567         /*
3568          * Read data...
3569          */
3570         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3571                                            O_RDWR | PG_BINARY,
3572                                            S_IRUSR | S_IWUSR);
3573         if (fd < 0)
3574                 ereport(PANIC,
3575                                 (errcode_for_file_access(),
3576                                  errmsg("could not open control file \"%s\": %m",
3577                                                 XLOG_CONTROL_FILE)));
3578
3579         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
3580                 ereport(PANIC,
3581                                 (errcode_for_file_access(),
3582                                  errmsg("could not read from control file: %m")));
3583
3584         close(fd);
3585
3586         /*
3587          * Check for expected pg_control format version.  If this is wrong, the
3588          * CRC check will likely fail because we'll be checking the wrong number
3589          * of bytes.  Complaining about wrong version will probably be more
3590          * enlightening than complaining about wrong CRC.
3591          */
3592
3593         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
3594                 ereport(FATAL,
3595                                 (errmsg("database files are incompatible with server"),
3596                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
3597                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
3598                         ControlFile->pg_control_version, ControlFile->pg_control_version,
3599                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
3600                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
3601
3602         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
3603                 ereport(FATAL,
3604                                 (errmsg("database files are incompatible with server"),
3605                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
3606                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
3607                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
3608                                  errhint("It looks like you need to initdb.")));
3609
3610         /* Now check the CRC. */
3611         INIT_CRC32(crc);
3612         COMP_CRC32(crc,
3613                            (char *) ControlFile,
3614                            offsetof(ControlFileData, crc));
3615         FIN_CRC32(crc);
3616
3617         if (!EQ_CRC32(crc, ControlFile->crc))
3618                 ereport(FATAL,
3619                                 (errmsg("incorrect checksum in control file")));
3620
3621         /*
3622          * Do compatibility checking immediately.  If the database isn't
3623          * compatible with the backend executable, we want to abort before we can
3624          * possibly do any damage.
3625          */
3626         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
3627                 ereport(FATAL,
3628                                 (errmsg("database files are incompatible with server"),
3629                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
3630                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
3631                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
3632                                  errhint("It looks like you need to initdb.")));
3633         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
3634                 ereport(FATAL,
3635                                 (errmsg("database files are incompatible with server"),
3636                    errdetail("The database cluster was initialized with MAXALIGN %d,"
3637                                          " but the server was compiled with MAXALIGN %d.",
3638                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
3639                                  errhint("It looks like you need to initdb.")));
3640         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
3641                 ereport(FATAL,
3642                                 (errmsg("database files are incompatible with server"),
3643                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
3644                                  errhint("It looks like you need to initdb.")));
3645         if (ControlFile->blcksz != BLCKSZ)
3646                 ereport(FATAL,
3647                                 (errmsg("database files are incompatible with server"),
3648                          errdetail("The database cluster was initialized with BLCKSZ %d,"
3649                                            " but the server was compiled with BLCKSZ %d.",
3650                                            ControlFile->blcksz, BLCKSZ),
3651                                  errhint("It looks like you need to recompile or initdb.")));
3652         if (ControlFile->relseg_size != RELSEG_SIZE)
3653                 ereport(FATAL,
3654                                 (errmsg("database files are incompatible with server"),
3655                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
3656                                   " but the server was compiled with RELSEG_SIZE %d.",
3657                                   ControlFile->relseg_size, RELSEG_SIZE),
3658                                  errhint("It looks like you need to recompile or initdb.")));
3659         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
3660                 ereport(FATAL,
3661                                 (errmsg("database files are incompatible with server"),
3662                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
3663                                   " but the server was compiled with XLOG_BLCKSZ %d.",
3664                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
3665                                  errhint("It looks like you need to recompile or initdb.")));
3666         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
3667                 ereport(FATAL,
3668                                 (errmsg("database files are incompatible with server"),
3669                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
3670                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
3671                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
3672                                  errhint("It looks like you need to recompile or initdb.")));
3673         if (ControlFile->nameDataLen != NAMEDATALEN)
3674                 ereport(FATAL,
3675                                 (errmsg("database files are incompatible with server"),
3676                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
3677                                   " but the server was compiled with NAMEDATALEN %d.",
3678                                   ControlFile->nameDataLen, NAMEDATALEN),
3679                                  errhint("It looks like you need to recompile or initdb.")));
3680         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
3681                 ereport(FATAL,
3682                                 (errmsg("database files are incompatible with server"),
3683                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
3684                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
3685                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
3686                                  errhint("It looks like you need to recompile or initdb.")));
3687         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
3688                 ereport(FATAL,
3689                                 (errmsg("database files are incompatible with server"),
3690                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
3691                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
3692                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
3693                                  errhint("It looks like you need to recompile or initdb.")));
3694
3695 #ifdef HAVE_INT64_TIMESTAMP
3696         if (ControlFile->enableIntTimes != true)
3697                 ereport(FATAL,
3698                                 (errmsg("database files are incompatible with server"),
3699                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
3700                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
3701                                  errhint("It looks like you need to recompile or initdb.")));
3702 #else
3703         if (ControlFile->enableIntTimes != false)
3704                 ereport(FATAL,
3705                                 (errmsg("database files are incompatible with server"),
3706                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
3707                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
3708                                  errhint("It looks like you need to recompile or initdb.")));
3709 #endif
3710
3711 #ifdef USE_FLOAT4_BYVAL
3712         if (ControlFile->float4ByVal != true)
3713                 ereport(FATAL,
3714                                 (errmsg("database files are incompatible with server"),
3715                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
3716                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
3717                                  errhint("It looks like you need to recompile or initdb.")));
3718 #else
3719         if (ControlFile->float4ByVal != false)
3720                 ereport(FATAL,
3721                                 (errmsg("database files are incompatible with server"),
3722                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
3723                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
3724                                  errhint("It looks like you need to recompile or initdb.")));
3725 #endif
3726
3727 #ifdef USE_FLOAT8_BYVAL
3728         if (ControlFile->float8ByVal != true)
3729                 ereport(FATAL,
3730                                 (errmsg("database files are incompatible with server"),
3731                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
3732                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
3733                                  errhint("It looks like you need to recompile or initdb.")));
3734 #else
3735         if (ControlFile->float8ByVal != false)
3736                 ereport(FATAL,
3737                                 (errmsg("database files are incompatible with server"),
3738                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
3739                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
3740                                  errhint("It looks like you need to recompile or initdb.")));
3741 #endif
3742 }
3743
3744 void
3745 UpdateControlFile(void)
3746 {
3747         int                     fd;
3748
3749         INIT_CRC32(ControlFile->crc);
3750         COMP_CRC32(ControlFile->crc,
3751                            (char *) ControlFile,
3752                            offsetof(ControlFileData, crc));
3753         FIN_CRC32(ControlFile->crc);
3754
3755         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3756                                            O_RDWR | PG_BINARY,
3757                                            S_IRUSR | S_IWUSR);
3758         if (fd < 0)
3759                 ereport(PANIC,
3760                                 (errcode_for_file_access(),
3761                                  errmsg("could not open control file \"%s\": %m",
3762                                                 XLOG_CONTROL_FILE)));
3763
3764         errno = 0;
3765         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
3766         {
3767                 /* if write didn't set errno, assume problem is no disk space */
3768                 if (errno == 0)
3769                         errno = ENOSPC;
3770                 ereport(PANIC,
3771                                 (errcode_for_file_access(),
3772                                  errmsg("could not write to control file: %m")));
3773         }
3774
3775         if (pg_fsync(fd) != 0)
3776                 ereport(PANIC,
3777                                 (errcode_for_file_access(),
3778                                  errmsg("could not fsync control file: %m")));
3779
3780         if (close(fd))
3781                 ereport(PANIC,
3782                                 (errcode_for_file_access(),
3783                                  errmsg("could not close control file: %m")));
3784 }
3785
3786 /*
3787  * Returns the unique system identifier from control file.
3788  */
3789 uint64
3790 GetSystemIdentifier(void)
3791 {
3792         Assert(ControlFile != NULL);
3793         return ControlFile->system_identifier;
3794 }
3795
3796 /*
3797  * Are checksums enabled for data pages?
3798  */
3799 bool
3800 DataChecksumsEnabled(void)
3801 {
3802         Assert(ControlFile != NULL);
3803         return (ControlFile->data_checksum_version > 0);
3804 }
3805
3806 /*
3807  * Returns a fake LSN for unlogged relations.
3808  *
3809  * Each call generates an LSN that is greater than any previous value
3810  * returned. The current counter value is saved and restored across clean
3811  * shutdowns, but like unlogged relations, does not survive a crash. This can
3812  * be used in lieu of real LSN values returned by XLogInsert, if you need an
3813  * LSN-like increasing sequence of numbers without writing any WAL.
3814  */
3815 XLogRecPtr
3816 GetFakeLSNForUnloggedRel(void)
3817 {
3818         XLogRecPtr      nextUnloggedLSN;
3819
3820         /* use volatile pointer to prevent code rearrangement */
3821         volatile XLogCtlData *xlogctl = XLogCtl;
3822
3823         /* increment the unloggedLSN counter, need SpinLock */
3824         SpinLockAcquire(&xlogctl->ulsn_lck);
3825         nextUnloggedLSN = xlogctl->unloggedLSN++;
3826         SpinLockRelease(&xlogctl->ulsn_lck);
3827
3828         return nextUnloggedLSN;
3829 }
3830
3831 /*
3832  * Auto-tune the number of XLOG buffers.
3833  *
3834  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
3835  * a maximum of one XLOG segment (there is little reason to think that more
3836  * is helpful, at least so long as we force an fsync when switching log files)
3837  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
3838  * 9.1, when auto-tuning was added).
3839  *
3840  * This should not be called until NBuffers has received its final value.
3841  */
3842 static int
3843 XLOGChooseNumBuffers(void)
3844 {
3845         int                     xbuffers;
3846
3847         xbuffers = NBuffers / 32;
3848         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
3849                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
3850         if (xbuffers < 8)
3851                 xbuffers = 8;
3852         return xbuffers;
3853 }
3854
3855 /*
3856  * GUC check_hook for wal_buffers
3857  */
3858 bool
3859 check_wal_buffers(int *newval, void **extra, GucSource source)
3860 {
3861         /*
3862          * -1 indicates a request for auto-tune.
3863          */
3864         if (*newval == -1)
3865         {
3866                 /*
3867                  * If we haven't yet changed the boot_val default of -1, just let it
3868                  * be.  We'll fix it when XLOGShmemSize is called.
3869                  */
3870                 if (XLOGbuffers == -1)
3871                         return true;
3872
3873                 /* Otherwise, substitute the auto-tune value */
3874                 *newval = XLOGChooseNumBuffers();
3875         }
3876
3877         /*
3878          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
3879          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
3880          * the case, we just silently treat such values as a request for the
3881          * minimum.  (We could throw an error instead, but that doesn't seem very
3882          * helpful.)
3883          */
3884         if (*newval < 4)
3885                 *newval = 4;
3886
3887         return true;
3888 }
3889
3890 /*
3891  * Initialization of shared memory for XLOG
3892  */
3893 Size
3894 XLOGShmemSize(void)
3895 {
3896         Size            size;
3897
3898         /*
3899          * If the value of wal_buffers is -1, use the preferred auto-tune value.
3900          * This isn't an amazingly clean place to do this, but we must wait till
3901          * NBuffers has received its final value, and must do it before using the
3902          * value of XLOGbuffers to do anything important.
3903          */
3904         if (XLOGbuffers == -1)
3905         {
3906                 char            buf[32];
3907
3908                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
3909                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
3910         }
3911         Assert(XLOGbuffers > 0);
3912
3913         /* XLogCtl */
3914         size = sizeof(XLogCtlData);
3915         /* xlblocks array */
3916         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
3917         /* extra alignment padding for XLOG I/O buffers */
3918         size = add_size(size, ALIGNOF_XLOG_BUFFER);
3919         /* and the buffers themselves */
3920         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
3921
3922         /*
3923          * Note: we don't count ControlFileData, it comes out of the "slop factor"
3924          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
3925          * routine again below to compute the actual allocation size.
3926          */
3927
3928         return size;
3929 }
3930
3931 void
3932 XLOGShmemInit(void)
3933 {
3934         bool            foundCFile,
3935                                 foundXLog;
3936         char       *allocptr;
3937
3938         ControlFile = (ControlFileData *)
3939                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
3940         XLogCtl = (XLogCtlData *)
3941                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
3942
3943         if (foundCFile || foundXLog)
3944         {
3945                 /* both should be present or neither */
3946                 Assert(foundCFile && foundXLog);
3947                 return;
3948         }
3949
3950         memset(XLogCtl, 0, sizeof(XLogCtlData));
3951
3952         /*
3953          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
3954          * multiple of the alignment for same, so no extra alignment padding is
3955          * needed here.
3956          */
3957         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
3958         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
3959         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
3960         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
3961
3962         /*
3963          * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
3964          */
3965         allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
3966         XLogCtl->pages = allocptr;
3967         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
3968
3969         /*
3970          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
3971          * in additional info.)
3972          */
3973         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
3974         XLogCtl->SharedRecoveryInProgress = true;
3975         XLogCtl->SharedHotStandbyActive = false;
3976         XLogCtl->WalWriterSleeping = false;
3977         XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
3978         SpinLockInit(&XLogCtl->info_lck);
3979         SpinLockInit(&XLogCtl->ulsn_lck);
3980         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
3981
3982         /*
3983          * If we are not in bootstrap mode, pg_control should already exist. Read
3984          * and validate it immediately (see comments in ReadControlFile() for the
3985          * reasons why).
3986          */
3987         if (!IsBootstrapProcessingMode())
3988                 ReadControlFile();
3989 }
3990
3991 /*
3992  * This func must be called ONCE on system install.  It creates pg_control
3993  * and the initial XLOG segment.
3994  */
3995 void
3996 BootStrapXLOG(void)
3997 {
3998         CheckPoint      checkPoint;
3999         char       *buffer;
4000         XLogPageHeader page;
4001         XLogLongPageHeader longpage;
4002         XLogRecord *record;
4003         bool            use_existent;
4004         uint64          sysidentifier;
4005         struct timeval tv;
4006         pg_crc32        crc;
4007
4008         /*
4009          * Select a hopefully-unique system identifier code for this installation.
4010          * We use the result of gettimeofday(), including the fractional seconds
4011          * field, as being about as unique as we can easily get.  (Think not to
4012          * use random(), since it hasn't been seeded and there's no portable way
4013          * to seed it other than the system clock value...)  The upper half of the
4014          * uint64 value is just the tv_sec part, while the lower half is the XOR
4015          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
4016          * unnecessarily if "uint64" is really only 32 bits wide.  A person
4017          * knowing this encoding can determine the initialization time of the
4018          * installation, which could perhaps be useful sometimes.
4019          */
4020         gettimeofday(&tv, NULL);
4021         sysidentifier = ((uint64) tv.tv_sec) << 32;
4022         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
4023
4024         /* First timeline ID is always 1 */
4025         ThisTimeLineID = 1;
4026
4027         /* page buffer must be aligned suitably for O_DIRECT */
4028         buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
4029         page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
4030         memset(page, 0, XLOG_BLCKSZ);
4031
4032         /*
4033          * Set up information for the initial checkpoint record
4034          *
4035          * The initial checkpoint record is written to the beginning of the WAL
4036          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
4037          * used, so that we can use 0/0 to mean "before any valid WAL segment".
4038          */
4039         checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
4040         checkPoint.ThisTimeLineID = ThisTimeLineID;
4041         checkPoint.PrevTimeLineID = ThisTimeLineID;
4042         checkPoint.fullPageWrites = fullPageWrites;
4043         checkPoint.nextXidEpoch = 0;
4044         checkPoint.nextXid = FirstNormalTransactionId;
4045         checkPoint.nextOid = FirstBootstrapObjectId;
4046         checkPoint.nextMulti = FirstMultiXactId;
4047         checkPoint.nextMultiOffset = 0;
4048         checkPoint.oldestXid = FirstNormalTransactionId;
4049         checkPoint.oldestXidDB = TemplateDbOid;
4050         checkPoint.oldestMulti = FirstMultiXactId;
4051         checkPoint.oldestMultiDB = TemplateDbOid;
4052         checkPoint.time = (pg_time_t) time(NULL);
4053         checkPoint.oldestActiveXid = InvalidTransactionId;
4054
4055         ShmemVariableCache->nextXid = checkPoint.nextXid;
4056         ShmemVariableCache->nextOid = checkPoint.nextOid;
4057         ShmemVariableCache->oidCount = 0;
4058         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4059         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
4060         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
4061
4062         /* Set up the XLOG page header */
4063         page->xlp_magic = XLOG_PAGE_MAGIC;
4064         page->xlp_info = XLP_LONG_HEADER;
4065         page->xlp_tli = ThisTimeLineID;
4066         page->xlp_pageaddr = XLogSegSize;
4067         longpage = (XLogLongPageHeader) page;
4068         longpage->xlp_sysid = sysidentifier;
4069         longpage->xlp_seg_size = XLogSegSize;
4070         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4071
4072         /* Insert the initial checkpoint record */
4073         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
4074         record->xl_prev = 0;
4075         record->xl_xid = InvalidTransactionId;
4076         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
4077         record->xl_len = sizeof(checkPoint);
4078         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4079         record->xl_rmid = RM_XLOG_ID;
4080         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
4081
4082         INIT_CRC32(crc);
4083         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
4084         COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
4085         FIN_CRC32(crc);
4086         record->xl_crc = crc;
4087
4088         /* Create first XLOG segment file */
4089         use_existent = false;
4090         openLogFile = XLogFileInit(1, &use_existent, false);
4091
4092         /* Write the first page with the initial record */
4093         errno = 0;
4094         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4095         {
4096                 /* if write didn't set errno, assume problem is no disk space */
4097                 if (errno == 0)
4098                         errno = ENOSPC;
4099                 ereport(PANIC,
4100                                 (errcode_for_file_access(),
4101                           errmsg("could not write bootstrap transaction log file: %m")));
4102         }
4103
4104         if (pg_fsync(openLogFile) != 0)
4105                 ereport(PANIC,
4106                                 (errcode_for_file_access(),
4107                           errmsg("could not fsync bootstrap transaction log file: %m")));
4108
4109         if (close(openLogFile))
4110                 ereport(PANIC,
4111                                 (errcode_for_file_access(),
4112                           errmsg("could not close bootstrap transaction log file: %m")));
4113
4114         openLogFile = -1;
4115
4116         /* Now create pg_control */
4117
4118         memset(ControlFile, 0, sizeof(ControlFileData));
4119         /* Initialize pg_control status fields */
4120         ControlFile->system_identifier = sysidentifier;
4121         ControlFile->state = DB_SHUTDOWNED;
4122         ControlFile->time = checkPoint.time;
4123         ControlFile->checkPoint = checkPoint.redo;
4124         ControlFile->checkPointCopy = checkPoint;
4125         ControlFile->unloggedLSN = 1;
4126
4127         /* Set important parameter values for use when replaying WAL */
4128         ControlFile->MaxConnections = MaxConnections;
4129         ControlFile->max_prepared_xacts = max_prepared_xacts;
4130         ControlFile->max_locks_per_xact = max_locks_per_xact;
4131         ControlFile->wal_level = wal_level;
4132         ControlFile->data_checksum_version = bootstrap_data_checksum_version;
4133
4134         /* some additional ControlFile fields are set in WriteControlFile() */
4135
4136         WriteControlFile();
4137
4138         /* Bootstrap the commit log, too */
4139         BootStrapCLOG();
4140         BootStrapSUBTRANS();
4141         BootStrapMultiXact();
4142
4143         pfree(buffer);
4144 }
4145
4146 static char *
4147 str_time(pg_time_t tnow)
4148 {
4149         static char buf[128];
4150
4151         pg_strftime(buf, sizeof(buf),
4152                                 "%Y-%m-%d %H:%M:%S %Z",
4153                                 pg_localtime(&tnow, log_timezone));
4154
4155         return buf;
4156 }
4157
4158 /*
4159  * See if there is a recovery command file (recovery.conf), and if so
4160  * read in parameters for archive recovery and XLOG streaming.
4161  *
4162  * The file is parsed using the main configuration parser.
4163  */
4164 static void
4165 readRecoveryCommandFile(void)
4166 {
4167         FILE       *fd;
4168         TimeLineID      rtli = 0;
4169         bool            rtliGiven = false;
4170         ConfigVariable *item,
4171                            *head = NULL,
4172                            *tail = NULL;
4173
4174         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
4175         if (fd == NULL)
4176         {
4177                 if (errno == ENOENT)
4178                         return;                         /* not there, so no archive recovery */
4179                 ereport(FATAL,
4180                                 (errcode_for_file_access(),
4181                                  errmsg("could not open recovery command file \"%s\": %m",
4182                                                 RECOVERY_COMMAND_FILE)));
4183         }
4184
4185         /*
4186          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
4187          * no need to check the return value.
4188          */
4189         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
4190
4191         FreeFile(fd);
4192
4193         for (item = head; item; item = item->next)
4194         {
4195                 if (strcmp(item->name, "restore_command") == 0)
4196                 {
4197                         recoveryRestoreCommand = pstrdup(item->value);
4198                         ereport(DEBUG2,
4199                                         (errmsg_internal("restore_command = '%s'",
4200                                                                          recoveryRestoreCommand)));
4201                 }
4202                 else if (strcmp(item->name, "recovery_end_command") == 0)
4203                 {
4204                         recoveryEndCommand = pstrdup(item->value);
4205                         ereport(DEBUG2,
4206                                         (errmsg_internal("recovery_end_command = '%s'",
4207                                                                          recoveryEndCommand)));
4208                 }
4209                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
4210                 {
4211                         archiveCleanupCommand = pstrdup(item->value);
4212                         ereport(DEBUG2,
4213                                         (errmsg_internal("archive_cleanup_command = '%s'",
4214                                                                          archiveCleanupCommand)));
4215                 }
4216                 else if (strcmp(item->name, "pause_at_recovery_target") == 0)
4217                 {
4218                         if (!parse_bool(item->value, &recoveryPauseAtTarget))
4219                                 ereport(ERROR,
4220                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4221                                                  errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
4222                         ereport(DEBUG2,
4223                                         (errmsg_internal("pause_at_recovery_target = '%s'",
4224                                                                          item->value)));
4225                 }
4226                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
4227                 {
4228                         rtliGiven = true;
4229                         if (strcmp(item->value, "latest") == 0)
4230                                 rtli = 0;
4231                         else
4232                         {
4233                                 errno = 0;
4234                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
4235                                 if (errno == EINVAL || errno == ERANGE)
4236                                         ereport(FATAL,
4237                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
4238                                                                         item->value)));
4239                         }
4240                         if (rtli)
4241                                 ereport(DEBUG2,
4242                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
4243                         else
4244                                 ereport(DEBUG2,
4245                                          (errmsg_internal("recovery_target_timeline = latest")));
4246                 }
4247                 else if (strcmp(item->name, "recovery_target_xid") == 0)
4248                 {
4249                         errno = 0;
4250                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
4251                         if (errno == EINVAL || errno == ERANGE)
4252                                 ereport(FATAL,
4253                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
4254                                                  item->value)));
4255                         ereport(DEBUG2,
4256                                         (errmsg_internal("recovery_target_xid = %u",
4257                                                                          recoveryTargetXid)));
4258                         recoveryTarget = RECOVERY_TARGET_XID;
4259                 }
4260                 else if (strcmp(item->name, "recovery_target_time") == 0)
4261                 {
4262                         /*
4263                          * if recovery_target_xid or recovery_target_name specified, then
4264                          * this overrides recovery_target_time
4265                          */
4266                         if (recoveryTarget == RECOVERY_TARGET_XID ||
4267                                 recoveryTarget == RECOVERY_TARGET_NAME)
4268                                 continue;
4269                         recoveryTarget = RECOVERY_TARGET_TIME;
4270
4271                         /*
4272                          * Convert the time string given by the user to TimestampTz form.
4273                          */
4274                         recoveryTargetTime =
4275                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
4276                                                                                                 CStringGetDatum(item->value),
4277                                                                                                 ObjectIdGetDatum(InvalidOid),
4278                                                                                                                 Int32GetDatum(-1)));
4279                         ereport(DEBUG2,
4280                                         (errmsg_internal("recovery_target_time = '%s'",
4281                                                                    timestamptz_to_str(recoveryTargetTime))));
4282                 }
4283                 else if (strcmp(item->name, "recovery_target_name") == 0)
4284                 {
4285                         /*
4286                          * if recovery_target_xid specified, then this overrides
4287                          * recovery_target_name
4288                          */
4289                         if (recoveryTarget == RECOVERY_TARGET_XID)
4290                                 continue;
4291                         recoveryTarget = RECOVERY_TARGET_NAME;
4292
4293                         recoveryTargetName = pstrdup(item->value);
4294                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
4295                                 ereport(FATAL,
4296                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4297                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
4298                                                                 MAXFNAMELEN - 1)));
4299
4300                         ereport(DEBUG2,
4301                                         (errmsg_internal("recovery_target_name = '%s'",
4302                                                                          recoveryTargetName)));
4303                 }
4304                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
4305                 {
4306                         /*
4307                          * does nothing if a recovery_target is not also set
4308                          */
4309                         if (!parse_bool(item->value, &recoveryTargetInclusive))
4310                                 ereport(ERROR,
4311                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4312                                                  errmsg("parameter \"%s\" requires a Boolean value",
4313                                                                 "recovery_target_inclusive")));
4314                         ereport(DEBUG2,
4315                                         (errmsg_internal("recovery_target_inclusive = %s",
4316                                                                          item->value)));
4317                 }
4318                 else if (strcmp(item->name, "standby_mode") == 0)
4319                 {
4320                         if (!parse_bool(item->value, &StandbyModeRequested))
4321                                 ereport(ERROR,
4322                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4323                                                  errmsg("parameter \"%s\" requires a Boolean value",
4324                                                                 "standby_mode")));
4325                         ereport(DEBUG2,
4326                                         (errmsg_internal("standby_mode = '%s'", item->value)));
4327                 }
4328                 else if (strcmp(item->name, "primary_conninfo") == 0)
4329                 {
4330                         PrimaryConnInfo = pstrdup(item->value);
4331                         ereport(DEBUG2,
4332                                         (errmsg_internal("primary_conninfo = '%s'",
4333                                                                          PrimaryConnInfo)));
4334                 }
4335                 else if (strcmp(item->name, "trigger_file") == 0)
4336                 {
4337                         TriggerFile = pstrdup(item->value);
4338                         ereport(DEBUG2,
4339                                         (errmsg_internal("trigger_file = '%s'",
4340                                                                          TriggerFile)));
4341                 }
4342                 else
4343                         ereport(FATAL,
4344                                         (errmsg("unrecognized recovery parameter \"%s\"",
4345                                                         item->name)));
4346         }
4347
4348         /*
4349          * Check for compulsory parameters
4350          */
4351         if (StandbyModeRequested)
4352         {
4353                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
4354                         ereport(WARNING,
4355                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
4356                                                         RECOVERY_COMMAND_FILE),
4357                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
4358         }
4359         else
4360         {
4361                 if (recoveryRestoreCommand == NULL)
4362                         ereport(FATAL,
4363                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
4364                                                         RECOVERY_COMMAND_FILE)));
4365         }
4366
4367         /* Enable fetching from archive recovery area */
4368         ArchiveRecoveryRequested = true;
4369
4370         /*
4371          * If user specified recovery_target_timeline, validate it or compute the
4372          * "latest" value.      We can't do this until after we've gotten the restore
4373          * command and set InArchiveRecovery, because we need to fetch timeline
4374          * history files from the archive.
4375          */
4376         if (rtliGiven)
4377         {
4378                 if (rtli)
4379                 {
4380                         /* Timeline 1 does not have a history file, all else should */
4381                         if (rtli != 1 && !existsTimeLineHistory(rtli))
4382                                 ereport(FATAL,
4383                                                 (errmsg("recovery target timeline %u does not exist",
4384                                                                 rtli)));
4385                         recoveryTargetTLI = rtli;
4386                         recoveryTargetIsLatest = false;
4387                 }
4388                 else
4389                 {
4390                         /* We start the "latest" search from pg_control's timeline */
4391                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
4392                         recoveryTargetIsLatest = true;
4393                 }
4394         }
4395
4396         FreeConfigVariables(head);
4397 }
4398
4399 /*
4400  * Exit archive-recovery state
4401  */
4402 static void
4403 exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
4404 {
4405         char            recoveryPath[MAXPGPATH];
4406         char            xlogpath[MAXPGPATH];
4407
4408         /*
4409          * We are no longer in archive recovery state.
4410          */
4411         InArchiveRecovery = false;
4412
4413         /*
4414          * Update min recovery point one last time.
4415          */
4416         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
4417
4418         /*
4419          * If the ending log segment is still open, close it (to avoid problems on
4420          * Windows with trying to rename or delete an open file).
4421          */
4422         if (readFile >= 0)
4423         {
4424                 close(readFile);
4425                 readFile = -1;
4426         }
4427
4428         /*
4429          * If we are establishing a new timeline, we have to copy data from the
4430          * last WAL segment of the old timeline to create a starting WAL segment
4431          * for the new timeline.
4432          *
4433          * Notify the archiver that the last WAL segment of the old timeline is
4434          * ready to copy to archival storage. Otherwise, it is not archived for a
4435          * while.
4436          */
4437         if (endTLI != ThisTimeLineID)
4438         {
4439                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo);
4440
4441                 if (XLogArchivingActive())
4442                 {
4443                         XLogFileName(xlogpath, endTLI, endLogSegNo);
4444                         XLogArchiveNotify(xlogpath);
4445                 }
4446         }
4447
4448         /*
4449          * Let's just make real sure there are not .ready or .done flags posted
4450          * for the new segment.
4451          */
4452         XLogFileName(xlogpath, ThisTimeLineID, endLogSegNo);
4453         XLogArchiveCleanup(xlogpath);
4454
4455         /*
4456          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
4457          * of it.
4458          */
4459         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
4460         unlink(recoveryPath);           /* ignore any error */
4461
4462         /* Get rid of any remaining recovered timeline-history file, too */
4463         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
4464         unlink(recoveryPath);           /* ignore any error */
4465
4466         /*
4467          * Rename the config file out of the way, so that we don't accidentally
4468          * re-enter archive recovery mode in a subsequent crash.
4469          */
4470         unlink(RECOVERY_COMMAND_DONE);
4471         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
4472                 ereport(FATAL,
4473                                 (errcode_for_file_access(),
4474                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
4475                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
4476
4477         ereport(LOG,
4478                         (errmsg("archive recovery complete")));
4479 }
4480
4481 /*
4482  * For point-in-time recovery, this function decides whether we want to
4483  * stop applying the XLOG at or after the current record.
4484  *
4485  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
4486  * *includeThis is set TRUE if we should apply this record before stopping.
4487  *
4488  * We also track the timestamp of the latest applied COMMIT/ABORT
4489  * record in XLogCtl->recoveryLastXTime, for logging purposes.
4490  * Also, some information is saved in recoveryStopXid et al for use in
4491  * annotating the new timeline's history file.
4492  */
4493 static bool
4494 recoveryStopsHere(XLogRecord *record, bool *includeThis)
4495 {
4496         bool            stopsHere;
4497         uint8           record_info;
4498         TimestampTz recordXtime;
4499         char            recordRPName[MAXFNAMELEN];
4500
4501         /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
4502         if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
4503                 return false;
4504         record_info = record->xl_info & ~XLR_INFO_MASK;
4505         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
4506         {
4507                 xl_xact_commit_compact *recordXactCommitData;
4508
4509                 recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
4510                 recordXtime = recordXactCommitData->xact_time;
4511         }
4512         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
4513         {
4514                 xl_xact_commit *recordXactCommitData;
4515
4516                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
4517                 recordXtime = recordXactCommitData->xact_time;
4518         }
4519         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
4520         {
4521                 xl_xact_abort *recordXactAbortData;
4522
4523                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
4524                 recordXtime = recordXactAbortData->xact_time;
4525         }
4526         else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
4527         {
4528                 xl_restore_point *recordRestorePointData;
4529
4530                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
4531                 recordXtime = recordRestorePointData->rp_time;
4532                 strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
4533         }
4534         else
4535                 return false;
4536
4537         /* Do we have a PITR target at all? */
4538         if (recoveryTarget == RECOVERY_TARGET_UNSET)
4539         {
4540                 /*
4541                  * Save timestamp of latest transaction commit/abort if this is a
4542                  * transaction record
4543                  */
4544                 if (record->xl_rmid == RM_XACT_ID)
4545                         SetLatestXTime(recordXtime);
4546                 return false;
4547         }
4548
4549         if (recoveryTarget == RECOVERY_TARGET_XID)
4550         {
4551                 /*
4552                  * There can be only one transaction end record with this exact
4553                  * transactionid
4554                  *
4555                  * when testing for an xid, we MUST test for equality only, since
4556                  * transactions are numbered in the order they start, not the order
4557                  * they complete. A higher numbered xid will complete before you about
4558                  * 50% of the time...
4559                  */
4560                 stopsHere = (record->xl_xid == recoveryTargetXid);
4561                 if (stopsHere)
4562                         *includeThis = recoveryTargetInclusive;
4563         }
4564         else if (recoveryTarget == RECOVERY_TARGET_NAME)
4565         {
4566                 /*
4567                  * There can be many restore points that share the same name, so we
4568                  * stop at the first one
4569                  */
4570                 stopsHere = (strcmp(recordRPName, recoveryTargetName) == 0);
4571
4572                 /*
4573                  * Ignore recoveryTargetInclusive because this is not a transaction
4574                  * record
4575                  */
4576                 *includeThis = false;
4577         }
4578         else
4579         {
4580                 /*
4581                  * There can be many transactions that share the same commit time, so
4582                  * we stop after the last one, if we are inclusive, or stop at the
4583                  * first one if we are exclusive
4584                  */
4585                 if (recoveryTargetInclusive)
4586                         stopsHere = (recordXtime > recoveryTargetTime);
4587                 else
4588                         stopsHere = (recordXtime >= recoveryTargetTime);
4589                 if (stopsHere)
4590                         *includeThis = false;
4591         }
4592
4593         if (stopsHere)
4594         {
4595                 recoveryStopXid = record->xl_xid;
4596                 recoveryStopTime = recordXtime;
4597                 recoveryStopAfter = *includeThis;
4598
4599                 if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
4600                 {
4601                         if (recoveryStopAfter)
4602                                 ereport(LOG,
4603                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
4604                                                                 recoveryStopXid,
4605                                                                 timestamptz_to_str(recoveryStopTime))));
4606                         else
4607                                 ereport(LOG,
4608                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
4609                                                                 recoveryStopXid,
4610                                                                 timestamptz_to_str(recoveryStopTime))));
4611                 }
4612                 else if (record_info == XLOG_XACT_ABORT)
4613                 {
4614                         if (recoveryStopAfter)
4615                                 ereport(LOG,
4616                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
4617                                                                 recoveryStopXid,
4618                                                                 timestamptz_to_str(recoveryStopTime))));
4619                         else
4620                                 ereport(LOG,
4621                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
4622                                                                 recoveryStopXid,
4623                                                                 timestamptz_to_str(recoveryStopTime))));
4624                 }
4625                 else
4626                 {
4627                         strncpy(recoveryStopName, recordRPName, MAXFNAMELEN);
4628
4629                         ereport(LOG,
4630                                 (errmsg("recovery stopping at restore point \"%s\", time %s",
4631                                                 recoveryStopName,
4632                                                 timestamptz_to_str(recoveryStopTime))));
4633                 }
4634
4635                 /*
4636                  * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
4637                  * restore point since they are timestamped, though the latest
4638                  * transaction time is not updated.
4639                  */
4640                 if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
4641                         SetLatestXTime(recordXtime);
4642         }
4643         else if (record->xl_rmid == RM_XACT_ID)
4644                 SetLatestXTime(recordXtime);
4645
4646         return stopsHere;
4647 }
4648
4649 /*
4650  * Wait until shared recoveryPause flag is cleared.
4651  *
4652  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
4653  * Probably not worth the trouble though.  This state shouldn't be one that
4654  * anyone cares about server power consumption in.
4655  */
4656 static void
4657 recoveryPausesHere(void)
4658 {
4659         /* Don't pause unless users can connect! */
4660         if (!LocalHotStandbyActive)
4661                 return;
4662
4663         ereport(LOG,
4664                         (errmsg("recovery has paused"),
4665                          errhint("Execute pg_xlog_replay_resume() to continue.")));
4666
4667         while (RecoveryIsPaused())
4668         {
4669                 pg_usleep(1000000L);    /* 1000 ms */
4670                 HandleStartupProcInterrupts();
4671         }
4672 }
4673
4674 bool
4675 RecoveryIsPaused(void)
4676 {
4677         /* use volatile pointer to prevent code rearrangement */
4678         volatile XLogCtlData *xlogctl = XLogCtl;
4679         bool            recoveryPause;
4680
4681         SpinLockAcquire(&xlogctl->info_lck);
4682         recoveryPause = xlogctl->recoveryPause;
4683         SpinLockRelease(&xlogctl->info_lck);
4684
4685         return recoveryPause;
4686 }
4687
4688 void
4689 SetRecoveryPause(bool recoveryPause)
4690 {
4691         /* use volatile pointer to prevent code rearrangement */
4692         volatile XLogCtlData *xlogctl = XLogCtl;
4693
4694         SpinLockAcquire(&xlogctl->info_lck);
4695         xlogctl->recoveryPause = recoveryPause;
4696         SpinLockRelease(&xlogctl->info_lck);
4697 }
4698
4699 /*
4700  * Save timestamp of latest processed commit/abort record.
4701  *
4702  * We keep this in XLogCtl, not a simple static variable, so that it can be
4703  * seen by processes other than the startup process.  Note in particular
4704  * that CreateRestartPoint is executed in the checkpointer.
4705  */
4706 static void
4707 SetLatestXTime(TimestampTz xtime)
4708 {
4709         /* use volatile pointer to prevent code rearrangement */
4710         volatile XLogCtlData *xlogctl = XLogCtl;
4711
4712         SpinLockAcquire(&xlogctl->info_lck);
4713         xlogctl->recoveryLastXTime = xtime;
4714         SpinLockRelease(&xlogctl->info_lck);
4715 }
4716
4717 /*
4718  * Fetch timestamp of latest processed commit/abort record.
4719  */
4720 TimestampTz
4721 GetLatestXTime(void)
4722 {
4723         /* use volatile pointer to prevent code rearrangement */
4724         volatile XLogCtlData *xlogctl = XLogCtl;
4725         TimestampTz xtime;
4726
4727         SpinLockAcquire(&xlogctl->info_lck);
4728         xtime = xlogctl->recoveryLastXTime;
4729         SpinLockRelease(&xlogctl->info_lck);
4730
4731         return xtime;
4732 }
4733
4734 /*
4735  * Save timestamp of the next chunk of WAL records to apply.
4736  *
4737  * We keep this in XLogCtl, not a simple static variable, so that it can be
4738  * seen by all backends.
4739  */
4740 static void
4741 SetCurrentChunkStartTime(TimestampTz xtime)
4742 {
4743         /* use volatile pointer to prevent code rearrangement */
4744         volatile XLogCtlData *xlogctl = XLogCtl;
4745
4746         SpinLockAcquire(&xlogctl->info_lck);
4747         xlogctl->currentChunkStartTime = xtime;
4748         SpinLockRelease(&xlogctl->info_lck);
4749 }
4750
4751 /*
4752  * Fetch timestamp of latest processed commit/abort record.
4753  * Startup process maintains an accurate local copy in XLogReceiptTime
4754  */
4755 TimestampTz
4756 GetCurrentChunkReplayStartTime(void)
4757 {
4758         /* use volatile pointer to prevent code rearrangement */
4759         volatile XLogCtlData *xlogctl = XLogCtl;
4760         TimestampTz xtime;
4761
4762         SpinLockAcquire(&xlogctl->info_lck);
4763         xtime = xlogctl->currentChunkStartTime;
4764         SpinLockRelease(&xlogctl->info_lck);
4765
4766         return xtime;
4767 }
4768
4769 /*
4770  * Returns time of receipt of current chunk of XLOG data, as well as
4771  * whether it was received from streaming replication or from archives.
4772  */
4773 void
4774 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4775 {
4776         /*
4777          * This must be executed in the startup process, since we don't export the
4778          * relevant state to shared memory.
4779          */
4780         Assert(InRecovery);
4781
4782         *rtime = XLogReceiptTime;
4783         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4784 }
4785
/*
 * Raise an ERROR if currValue is below minValue.
 *
 * param_name is a parameter name inserted verbatim into the message, so it
 * does not require translation.  currValue and minValue are each evaluated
 * twice when the check fails; pass side-effect-free expressions.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((minValue) > (currValue)) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						(param_name), \
						(currValue), \
						(minValue)))); \
} while (0)
4802
4803 /*
4804  * Check to see if required parameters are set high enough on this server
4805  * for various aspects of recovery operation.
4806  */
4807 static void
4808 CheckRequiredParameterValues(void)
4809 {
4810         /*
4811          * For archive recovery, the WAL must be generated with at least 'archive'
4812          * wal_level.
4813          */
4814         if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
4815         {
4816                 ereport(WARNING,
4817                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
4818                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
4819         }
4820
4821         /*
4822          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
4823          * we must have at least as many backend slots as the primary.
4824          */
4825         if (InArchiveRecovery && EnableHotStandby)
4826         {
4827                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
4828                         ereport(ERROR,
4829                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
4830                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
4831
4832                 /* We ignore autovacuum_max_workers when we make this test. */
4833                 RecoveryRequiresIntParameter("max_connections",
4834                                                                          MaxConnections,
4835                                                                          ControlFile->MaxConnections);
4836                 RecoveryRequiresIntParameter("max_prepared_transactions",
4837                                                                          max_prepared_xacts,
4838                                                                          ControlFile->max_prepared_xacts);
4839                 RecoveryRequiresIntParameter("max_locks_per_transaction",
4840                                                                          max_locks_per_xact,
4841                                                                          ControlFile->max_locks_per_xact);
4842         }
4843 }
4844
4845 /*
4846  * This must be called ONCE during postmaster or standalone-backend startup
4847  */
4848 void
4849 StartupXLOG(void)
4850 {
4851         XLogCtlInsert *Insert;
4852         CheckPoint      checkPoint;
4853         bool            wasShutdown;
4854         bool            reachedStopPoint = false;
4855         bool            haveBackupLabel = false;
4856         XLogRecPtr      RecPtr,
4857                                 checkPointLoc,
4858                                 EndOfLog;
4859         XLogSegNo       endLogSegNo;
4860         TimeLineID      PrevTimeLineID;
4861         XLogRecord *record;
4862         uint32          freespace;
4863         TransactionId oldestActiveXID;
4864         bool            backupEndRequired = false;
4865         bool            backupFromStandby = false;
4866         DBState         dbstate_at_startup;
4867         XLogReaderState *xlogreader;
4868         XLogPageReadPrivate private;
4869         bool            fast_promoted = false;
4870
4871         /*
4872          * Read control file and check XLOG status looks valid.
4873          *
4874          * Note: in most control paths, *ControlFile is already valid and we need
4875          * not do ReadControlFile() here, but might as well do it to be sure.
4876          */
4877         ReadControlFile();
4878
4879         if (ControlFile->state < DB_SHUTDOWNED ||
4880                 ControlFile->state > DB_IN_PRODUCTION ||
4881                 !XRecOffIsValid(ControlFile->checkPoint))
4882                 ereport(FATAL,
4883                                 (errmsg("control file contains invalid data")));
4884
4885         if (ControlFile->state == DB_SHUTDOWNED)
4886                 ereport(LOG,
4887                                 (errmsg("database system was shut down at %s",
4888                                                 str_time(ControlFile->time))));
4889         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
4890                 ereport(LOG,
4891                                 (errmsg("database system was shut down in recovery at %s",
4892                                                 str_time(ControlFile->time))));
4893         else if (ControlFile->state == DB_SHUTDOWNING)
4894                 ereport(LOG,
4895                                 (errmsg("database system shutdown was interrupted; last known up at %s",
4896                                                 str_time(ControlFile->time))));
4897         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
4898                 ereport(LOG,
4899                    (errmsg("database system was interrupted while in recovery at %s",
4900                                    str_time(ControlFile->time)),
4901                         errhint("This probably means that some data is corrupted and"
4902                                         " you will have to use the last backup for recovery.")));
4903         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
4904                 ereport(LOG,
4905                                 (errmsg("database system was interrupted while in recovery at log time %s",
4906                                                 str_time(ControlFile->checkPointCopy.time)),
4907                                  errhint("If this has occurred more than once some data might be corrupted"
4908                           " and you might need to choose an earlier recovery target.")));
4909         else if (ControlFile->state == DB_IN_PRODUCTION)
4910                 ereport(LOG,
4911                           (errmsg("database system was interrupted; last known up at %s",
4912                                           str_time(ControlFile->time))));
4913
4914         /* This is just to allow attaching to startup process with a debugger */
4915 #ifdef XLOG_REPLAY_DELAY
4916         if (ControlFile->state != DB_SHUTDOWNED)
4917                 pg_usleep(60000000L);
4918 #endif
4919
4920         /*
4921          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
4922          * someone has performed a copy for PITR, these directories may have been
4923          * excluded and need to be re-created.
4924          */
4925         ValidateXLOGDirectoryStructure();
4926
4927         /*
4928          * Clear out any old relcache cache files.      This is *necessary* if we do
4929          * any WAL replay, since that would probably result in the cache files
4930          * being out of sync with database reality.  In theory we could leave them
4931          * in place if the database had been cleanly shut down, but it seems
4932          * safest to just remove them always and let them be rebuilt during the
4933          * first backend startup.
4934          */
4935         RelationCacheInitFileRemove();
4936
4937         /*
4938          * Initialize on the assumption we want to recover to the latest timeline
4939          * that's active according to pg_control.
4940          */
4941         if (ControlFile->minRecoveryPointTLI >
4942                 ControlFile->checkPointCopy.ThisTimeLineID)
4943                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
4944         else
4945                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
4946
4947         /*
4948          * Check for recovery control file, and if so set up state for offline
4949          * recovery
4950          */
4951         readRecoveryCommandFile();
4952
4953         /*
4954          * Save archive_cleanup_command in shared memory so that other processes
4955          * can see it.
4956          */
4957         strncpy(XLogCtl->archiveCleanupCommand,
4958                         archiveCleanupCommand ? archiveCleanupCommand : "",
4959                         sizeof(XLogCtl->archiveCleanupCommand));
4960
4961         if (ArchiveRecoveryRequested)
4962         {
4963                 if (StandbyModeRequested)
4964                         ereport(LOG,
4965                                         (errmsg("entering standby mode")));
4966                 else if (recoveryTarget == RECOVERY_TARGET_XID)
4967                         ereport(LOG,
4968                                         (errmsg("starting point-in-time recovery to XID %u",
4969                                                         recoveryTargetXid)));
4970                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
4971                         ereport(LOG,
4972                                         (errmsg("starting point-in-time recovery to %s",
4973                                                         timestamptz_to_str(recoveryTargetTime))));
4974                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
4975                         ereport(LOG,
4976                                         (errmsg("starting point-in-time recovery to \"%s\"",
4977                                                         recoveryTargetName)));
4978                 else
4979                         ereport(LOG,
4980                                         (errmsg("starting archive recovery")));
4981         }
4982
4983         /*
4984          * Take ownership of the wakeup latch if we're going to sleep during
4985          * recovery.
4986          */
4987         if (StandbyModeRequested)
4988                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
4989
4990         /* Set up XLOG reader facility */
4991         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
4992         xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
4993         if (!xlogreader)
4994                 ereport(ERROR,
4995                                 (errcode(ERRCODE_OUT_OF_MEMORY),
4996                                  errmsg("out of memory"),
4997                         errdetail("Failed while allocating an XLog reading processor")));
4998         xlogreader->system_identifier = ControlFile->system_identifier;
4999
5000         if (read_backup_label(&checkPointLoc, &backupEndRequired,
5001                                                   &backupFromStandby))
5002         {
5003                 /*
5004                  * Archive recovery was requested, and thanks to the backup label
5005                  * file, we know how far we need to replay to reach consistency. Enter
5006                  * archive recovery directly.
5007                  */
5008                 InArchiveRecovery = true;
5009                 if (StandbyModeRequested)
5010                         StandbyMode = true;
5011
5012                 /*
5013                  * When a backup_label file is present, we want to roll forward from
5014                  * the checkpoint it identifies, rather than using pg_control.
5015                  */
5016                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
5017                 if (record != NULL)
5018                 {
5019                         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5020                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5021                         ereport(DEBUG1,
5022                                         (errmsg("checkpoint record is at %X/%X",
5023                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5024                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
5025
5026                         /*
5027                          * Make sure that REDO location exists. This may not be the case
5028                          * if there was a crash during an online backup, which left a
5029                          * backup_label around that references a WAL segment that's
5030                          * already been archived.
5031                          */
5032                         if (checkPoint.redo < checkPointLoc)
5033                         {
5034                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
5035                                         ereport(FATAL,
5036                                                         (errmsg("could not find redo location referenced by checkpoint record"),
5037                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5038                         }
5039                 }
5040                 else
5041                 {
5042                         ereport(FATAL,
5043                                         (errmsg("could not locate required checkpoint record"),
5044                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5045                         wasShutdown = false;    /* keep compiler quiet */
5046                 }
5047                 /* set flag to delete it later */
5048                 haveBackupLabel = true;
5049         }
5050         else
5051         {
5052                 /*
5053                  * It's possible that archive recovery was requested, but we don't
5054                  * know how far we need to replay the WAL before we reach consistency.
5055                  * This can happen for example if a base backup is taken from a
5056                  * running server using an atomic filesystem snapshot, without calling
5057                  * pg_start/stop_backup. Or if you just kill a running master server
5058                  * and put it into archive recovery by creating a recovery.conf file.
5059                  *
5060                  * Our strategy in that case is to perform crash recovery first,
5061                  * replaying all the WAL present in pg_xlog, and only enter archive
5062                  * recovery after that.
5063                  *
5064                  * But usually we already know how far we need to replay the WAL (up
5065                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
5066                  * end-of-backup record), and we can enter archive recovery directly.
5067                  */
5068                 if (ArchiveRecoveryRequested &&
5069                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
5070                          ControlFile->backupEndRequired ||
5071                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
5072                          ControlFile->state == DB_SHUTDOWNED))
5073                 {
5074                         InArchiveRecovery = true;
5075                         if (StandbyModeRequested)
5076                                 StandbyMode = true;
5077                 }
5078
5079                 /*
5080                  * Get the last valid checkpoint record.  If the latest one according
5081                  * to pg_control is broken, try the next-to-last one.
5082                  */
5083                 checkPointLoc = ControlFile->checkPoint;
5084                 RedoStartLSN = ControlFile->checkPointCopy.redo;
5085                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
5086                 if (record != NULL)
5087                 {
5088                         ereport(DEBUG1,
5089                                         (errmsg("checkpoint record is at %X/%X",
5090                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5091                 }
5092                 else if (StandbyMode)
5093                 {
5094                         /*
5095                          * The last valid checkpoint record required for a streaming
5096                          * recovery exists in neither standby nor the primary.
5097                          */
5098                         ereport(PANIC,
5099                                         (errmsg("could not locate a valid checkpoint record")));
5100                 }
5101                 else
5102                 {
5103                         checkPointLoc = ControlFile->prevCheckPoint;
5104                         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
5105                         if (record != NULL)
5106                         {
5107                                 ereport(LOG,
5108                                                 (errmsg("using previous checkpoint record at %X/%X",
5109                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5110                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
5111                         }
5112                         else
5113                                 ereport(PANIC,
5114                                          (errmsg("could not locate a valid checkpoint record")));
5115                 }
5116                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5117                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5118         }
5119
5120         /*
5121          * If the location of the checkpoint record is not on the expected
5122          * timeline in the history of the requested timeline, we cannot proceed:
5123          * the backup is not part of the history of the requested timeline.
5124          */
5125         Assert(expectedTLEs);           /* was initialized by reading checkpoint
5126                                                                  * record */
5127         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
5128                 checkPoint.ThisTimeLineID)
5129         {
5130                 XLogRecPtr      switchpoint;
5131
5132                 /*
5133                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
5134                  * not in expectedTLEs at all.
5135                  */
5136                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
5137                 ereport(FATAL,
5138                                 (errmsg("requested timeline %u is not a child of this server's history",
5139                                                 recoveryTargetTLI),
5140                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X",
5141                                                    (uint32) (ControlFile->checkPoint >> 32),
5142                                                    (uint32) ControlFile->checkPoint,
5143                                                    ControlFile->checkPointCopy.ThisTimeLineID,
5144                                                    (uint32) (switchpoint >> 32),
5145                                                    (uint32) switchpoint)));
5146         }
5147
5148         /*
5149          * The min recovery point should be part of the requested timeline's
5150          * history, too.
5151          */
5152         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
5153           tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
5154                 ControlFile->minRecoveryPointTLI)
5155                 ereport(FATAL,
5156                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
5157                                                 recoveryTargetTLI,
5158                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
5159                                                 (uint32) ControlFile->minRecoveryPoint,
5160                                                 ControlFile->minRecoveryPointTLI)));
5161
5162         LastRec = RecPtr = checkPointLoc;
5163
5164         ereport(DEBUG1,
5165                         (errmsg("redo record is at %X/%X; shutdown %s",
5166                                   (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
5167                                         wasShutdown ? "TRUE" : "FALSE")));
5168         ereport(DEBUG1,
5169                         (errmsg("next transaction ID: %u/%u; next OID: %u",
5170                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
5171                                         checkPoint.nextOid)));
5172         ereport(DEBUG1,
5173                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
5174                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
5175         ereport(DEBUG1,
5176                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
5177                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
5178         ereport(DEBUG1,
5179                         (errmsg("oldest MultiXactId: %u, in database %u",
5180                                         checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
5181         if (!TransactionIdIsNormal(checkPoint.nextXid))
5182                 ereport(PANIC,
5183                                 (errmsg("invalid next transaction ID")));
5184
5185         /* initialize shared memory variables from the checkpoint record */
5186         ShmemVariableCache->nextXid = checkPoint.nextXid;
5187         ShmemVariableCache->nextOid = checkPoint.nextOid;
5188         ShmemVariableCache->oidCount = 0;
5189         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5190         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5191         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
5192         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
5193         XLogCtl->ckptXid = checkPoint.nextXid;
5194
5195         /*
5196          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
5197          * control file. On recovery, all unlogged relations are blown away, so
5198          * the unlogged LSN counter can be reset too.
5199          */
5200         if (ControlFile->state == DB_SHUTDOWNED)
5201                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
5202         else
5203                 XLogCtl->unloggedLSN = 1;
5204
5205         /*
5206          * We must replay WAL entries using the same TimeLineID they were created
5207          * under, so temporarily adopt the TLI indicated by the checkpoint (see
5208          * also xlog_redo()).
5209          */
5210         ThisTimeLineID = checkPoint.ThisTimeLineID;
5211
5212         /*
5213          * Copy any missing timeline history files between 'now' and the recovery
5214          * target timeline from archive to pg_xlog. While we don't need those
5215          * files ourselves - the history file of the recovery target timeline
5216          * covers all the previous timelines in the history too - a cascading
5217          * standby server might be interested in them. Or, if you archive the WAL
5218          * from this server to a different archive than the master, it'd be good
5219          * for all the history files to get archived there after failover, so that
5220          * you can use one of the old timelines as a PITR target. Timeline history
5221          * files are small, so it's better to copy them unnecessarily than not
5222          * copy them and regret later.
5223          */
5224         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
5225
5226         lastFullPageWrites = checkPoint.fullPageWrites;
5227
5228         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
5229
5230         if (RecPtr < checkPoint.redo)
5231                 ereport(PANIC,
5232                                 (errmsg("invalid redo in checkpoint record")));
5233
5234         /*
5235          * Check whether we need to force recovery from WAL.  If it appears to
5236          * have been a clean shutdown and we did not have a recovery.conf file,
5237          * then assume no recovery needed.
5238          */
5239         if (checkPoint.redo < RecPtr)
5240         {
5241                 if (wasShutdown)
5242                         ereport(PANIC,
5243                                         (errmsg("invalid redo record in shutdown checkpoint")));
5244                 InRecovery = true;
5245         }
5246         else if (ControlFile->state != DB_SHUTDOWNED)
5247                 InRecovery = true;
5248         else if (ArchiveRecoveryRequested)
5249         {
5250                 /* force recovery due to presence of recovery.conf */
5251                 InRecovery = true;
5252         }
5253
5254         /* REDO */
5255         if (InRecovery)
5256         {
5257                 int                     rmid;
5258
5259                 /* use volatile pointer to prevent code rearrangement */
5260                 volatile XLogCtlData *xlogctl = XLogCtl;
5261
5262                 /*
5263                  * Update pg_control to show that we are recovering and to show the
5264                  * selected checkpoint as the place we are starting from. We also mark
5265                  * pg_control with any minimum recovery stop point obtained from a
5266                  * backup history file.
5267                  */
5268                 dbstate_at_startup = ControlFile->state;
5269                 if (InArchiveRecovery)
5270                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
5271                 else
5272                 {
5273                         ereport(LOG,
5274                                         (errmsg("database system was not properly shut down; "
5275                                                         "automatic recovery in progress")));
5276                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
5277                                 ereport(LOG,
5278                                                 (errmsg("crash recovery starts in timeline %u "
5279                                                                 "and has target timeline %u",
5280                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
5281                                                                 recoveryTargetTLI)));
5282                         ControlFile->state = DB_IN_CRASH_RECOVERY;
5283                 }
5284                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
5285                 ControlFile->checkPoint = checkPointLoc;
5286                 ControlFile->checkPointCopy = checkPoint;
5287                 if (InArchiveRecovery)
5288                 {
5289                         /* initialize minRecoveryPoint if not set yet */
5290                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
5291                         {
5292                                 ControlFile->minRecoveryPoint = checkPoint.redo;
5293                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
5294                         }
5295                 }
5296
5297                 /*
5298                  * Set backupStartPoint if we're starting recovery from a base backup.
5299                  *
5300                  * Set backupEndPoint and use minRecoveryPoint as the backup end
5301                  * location if we're starting recovery from a base backup which was
5302                  * taken from the standby. In this case, the database system status in
5303                  * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If not, which
5304                  * means that backup is corrupted, so we cancel recovery.
5305                  */
5306                 if (haveBackupLabel)
5307                 {
5308                         ControlFile->backupStartPoint = checkPoint.redo;
5309                         ControlFile->backupEndRequired = backupEndRequired;
5310
5311                         if (backupFromStandby)
5312                         {
5313                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
5314                                         ereport(FATAL,
5315                                                         (errmsg("backup_label contains data inconsistent with control file"),
5316                                                          errhint("This means that the backup is corrupted and you will "
5317                                                            "have to use another backup for recovery.")));
5318                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
5319                         }
5320                 }
5321                 ControlFile->time = (pg_time_t) time(NULL);
5322                 /* No need to hold ControlFileLock yet, we aren't up far enough */
5323                 UpdateControlFile();
5324
5325                 /* initialize our local copy of minRecoveryPoint */
5326                 minRecoveryPoint = ControlFile->minRecoveryPoint;
5327                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
5328
5329                 /*
5330                  * Reset pgstat data, because it may be invalid after recovery.
5331                  */
5332                 pgstat_reset_all();
5333
5334                 /*
5335                  * If there was a backup label file, it's done its job and the info
5336                  * has now been propagated into pg_control.  We must get rid of the
5337                  * label file so that if we crash during recovery, we'll pick up at
5338                  * the latest recovery restartpoint instead of going all the way back
5339                  * to the backup start point.  It seems prudent though to just rename
5340                  * the file out of the way rather than delete it completely.
5341                  */
5342                 if (haveBackupLabel)
5343                 {
5344                         unlink(BACKUP_LABEL_OLD);
5345                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
5346                                 ereport(FATAL,
5347                                                 (errcode_for_file_access(),
5348                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5349                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
5350                 }
5351
5352                 /* Check that the GUCs used to generate the WAL allow recovery */
5353                 CheckRequiredParameterValues();
5354
5355                 /*
5356                  * We're in recovery, so unlogged relations may be trashed and must be
5357                  * reset.  This should be done BEFORE allowing Hot Standby
5358                  * connections, so that read-only backends don't try to read whatever
5359                  * garbage is left over from before.
5360                  */
5361                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
5362
5363                 /*
5364                  * Likewise, delete any saved transaction snapshot files that got left
5365                  * behind by crashed backends.
5366                  */
5367                 DeleteAllExportedSnapshotFiles();
5368
5369                 /*
5370                  * Initialize for Hot Standby, if enabled. We won't let backends in
5371                  * yet, not until we've reached the min recovery point specified in
5372                  * control file and we've established a recovery snapshot from a
5373                  * running-xacts WAL record.
5374                  */
5375                 if (ArchiveRecoveryRequested && EnableHotStandby)
5376                 {
5377                         TransactionId *xids;
5378                         int                     nxids;
5379
5380                         ereport(DEBUG1,
5381                                         (errmsg("initializing for hot standby")));
5382
5383                         InitRecoveryTransactionEnvironment();
5384
5385                         if (wasShutdown)
5386                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
5387                         else
5388                                 oldestActiveXID = checkPoint.oldestActiveXid;
5389                         Assert(TransactionIdIsValid(oldestActiveXID));
5390
5391                         /*
5392                          * Startup commit log and subtrans only. Other SLRUs are not
5393                          * maintained during recovery and need not be started yet.
5394                          */
5395                         StartupCLOG();
5396                         StartupSUBTRANS(oldestActiveXID);
5397
5398                         /*
5399                          * If we're beginning at a shutdown checkpoint, we know that
5400                          * nothing was running on the master at this point. So fake-up an
5401                          * empty running-xacts record and use that here and now. Recover
5402                          * additional standby state for prepared transactions.
5403                          */
5404                         if (wasShutdown)
5405                         {
5406                                 RunningTransactionsData running;
5407                                 TransactionId latestCompletedXid;
5408
5409                                 /*
5410                                  * Construct a RunningTransactions snapshot representing a
5411                                  * shut down server, with only prepared transactions still
5412                                  * alive. We're never overflowed at this point because all
5413                                  * subxids are listed with their parent prepared transactions.
5414                                  */
5415                                 running.xcnt = nxids;
5416                                 running.subxcnt = 0;
5417                                 running.subxid_overflow = false;
5418                                 running.nextXid = checkPoint.nextXid;
5419                                 running.oldestRunningXid = oldestActiveXID;
5420                                 latestCompletedXid = checkPoint.nextXid;
5421                                 TransactionIdRetreat(latestCompletedXid);
5422                                 Assert(TransactionIdIsNormal(latestCompletedXid));
5423                                 running.latestCompletedXid = latestCompletedXid;
5424                                 running.xids = xids;
5425
5426                                 ProcArrayApplyRecoveryInfo(&running);
5427
5428                                 StandbyRecoverPreparedTransactions(false);
5429                         }
5430                 }
5431
5432                 /* Initialize resource managers */
5433                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5434                 {
5435                         if (RmgrTable[rmid].rm_startup != NULL)
5436                                 RmgrTable[rmid].rm_startup();
5437                 }
5438
5439                 /*
5440                  * Initialize shared replayEndRecPtr, lastReplayedEndRecPtr, and
5441                  * recoveryLastXTime.
5442                  *
5443                  * This is slightly confusing if we're starting from an online
5444                  * checkpoint; we've just read and replayed the checkpoint record, but
5445                  * we're going to start replay from its redo pointer, which precedes
5446                  * the location of the checkpoint record itself. So even though the
5447                  * last record we've replayed is indeed ReadRecPtr, we haven't
5448                  * replayed all the preceding records yet. That's OK for the current
5449                  * use of these variables.
5450                  */
5451                 SpinLockAcquire(&xlogctl->info_lck);
5452                 xlogctl->replayEndRecPtr = ReadRecPtr;
5453                 xlogctl->replayEndTLI = ThisTimeLineID;
5454                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
5455                 xlogctl->lastReplayedTLI = ThisTimeLineID;
5456                 xlogctl->recoveryLastXTime = 0;
5457                 xlogctl->currentChunkStartTime = 0;
5458                 xlogctl->recoveryPause = false;
5459                 SpinLockRelease(&xlogctl->info_lck);
5460
5461                 /* Also ensure XLogReceiptTime has a sane value */
5462                 XLogReceiptTime = GetCurrentTimestamp();
5463
5464                 /*
5465                  * Let postmaster know we've started redo now, so that it can launch
5466                  * checkpointer to perform restartpoints.  We don't bother during
5467                  * crash recovery as restartpoints can only be performed during
5468                  * archive recovery.  And we'd like to keep crash recovery simple, to
5469                  * avoid introducing bugs that could affect you when recovering after
5470                  * crash.
5471                  *
5472                  * After this point, we can no longer assume that we're the only
5473                  * process in addition to postmaster!  Also, fsync requests are
5474                  * subsequently to be handled by the checkpointer, not locally.
5475                  */
5476                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
5477                 {
5478                         PublishStartupProcessInformation();
5479                         SetForwardFsyncRequests();
5480                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
5481                         bgwriterLaunched = true;
5482                 }
5483
5484                 /*
5485                  * Allow read-only connections immediately if we're consistent
5486                  * already.
5487                  */
5488                 CheckRecoveryConsistency();
5489
5490                 /*
5491                  * Find the first record that logically follows the checkpoint --- it
5492                  * might physically precede it, though.
5493                  */
5494                 if (checkPoint.redo < RecPtr)
5495                 {
5496                         /* back up to find the record */
5497                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
5498                 }
5499                 else
5500                 {
5501                         /* just have to read next record after CheckPoint */
5502                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
5503                 }
5504
5505                 if (record != NULL)
5506                 {
5507                         bool            recoveryContinue = true;
5508                         bool            recoveryApply = true;
5509                         ErrorContextCallback errcallback;
5510                         TimestampTz xtime;
5511
5512                         InRedo = true;
5513
5514                         ereport(LOG,
5515                                         (errmsg("redo starts at %X/%X",
5516                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
5517
5518                         /*
5519                          * main redo apply loop
5520                          */
5521                         do
5522                         {
5523                                 bool            switchedTLI = false;
5524
5525 #ifdef WAL_DEBUG
5526                                 if (XLOG_DEBUG ||
5527                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
5528                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
5529                                 {
5530                                         StringInfoData buf;
5531
5532                                         initStringInfo(&buf);
5533                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
5534                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
5535                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
5536                                         xlog_outrec(&buf, record);
5537                                         appendStringInfo(&buf, " - ");
5538                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
5539                                                                                                            record->xl_info,
5540                                                                                                          XLogRecGetData(record));
5541                                         elog(LOG, "%s", buf.data);
5542                                         pfree(buf.data);
5543                                 }
5544 #endif
5545
5546                                 /* Handle interrupt signals of startup process */
5547                                 HandleStartupProcInterrupts();
5548
5549                                 /*
5550                                  * Pause WAL replay, if requested by a hot-standby session via
5551                                  * SetRecoveryPause().
5552                                  *
5553                                  * Note that we intentionally don't take the info_lck spinlock
5554                                  * here.  We might therefore read a slightly stale value of
5555                                  * the recoveryPause flag, but it can't be very stale (no
5556                                  * worse than the last spinlock we did acquire).  Since a
5557                                  * pause request is a pretty asynchronous thing anyway,
5558                                  * possibly responding to it one WAL record later than we
5559                                  * otherwise would is a minor issue, so it doesn't seem worth
5560                                  * adding another spinlock cycle to prevent that.
5561                                  */
5562                                 if (xlogctl->recoveryPause)
5563                                         recoveryPausesHere();
5564
5565                                 /*
5566                                  * Have we reached our recovery target?
5567                                  */
5568                                 if (recoveryStopsHere(record, &recoveryApply))
5569                                 {
5570                                         if (recoveryPauseAtTarget)
5571                                         {
5572                                                 SetRecoveryPause(true);
5573                                                 recoveryPausesHere();
5574                                         }
5575                                         reachedStopPoint = true;        /* see below */
5576                                         recoveryContinue = false;
5577
5578                                         /* Exit loop if we reached non-inclusive recovery target */
5579                                         if (!recoveryApply)
5580                                                 break;
5581                                 }
5582
5583                                 /* Setup error traceback support for ereport() */
5584                                 errcallback.callback = rm_redo_error_callback;
5585                                 errcallback.arg = (void *) record;
5586                                 errcallback.previous = error_context_stack;
5587                                 error_context_stack = &errcallback;
5588
5589                                 /*
5590                                  * ShmemVariableCache->nextXid must be beyond record's xid.
5591                                  *
5592                                  * We don't expect anyone else to modify nextXid, hence we
5593                                  * don't need to hold a lock while examining it.  We still
5594                                  * acquire the lock to modify it, though.
5595                                  */
5596                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
5597                                                                                                  ShmemVariableCache->nextXid))
5598                                 {
5599                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
5600                                         ShmemVariableCache->nextXid = record->xl_xid;
5601                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
5602                                         LWLockRelease(XidGenLock);
5603                                 }
5604
5605                                 /*
5606                                  * Before replaying this record, check if this record causes
5607                                  * the current timeline to change. The record is already
5608                                  * considered to be part of the new timeline, so we update
5609                                  * ThisTimeLineID before replaying it. That's important so
5610                                  * that replayEndTLI, which is recorded as the minimum
5611                                  * recovery point's TLI if recovery stops after this record,
5612                                  * is set correctly.
5613                                  */
5614                                 if (record->xl_rmid == RM_XLOG_ID)
5615                                 {
5616                                         TimeLineID      newTLI = ThisTimeLineID;
5617                                         TimeLineID      prevTLI = ThisTimeLineID;
5618                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
5619
5620                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
5621                                         {
5622                                                 CheckPoint      checkPoint;
5623
5624                                                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5625                                                 newTLI = checkPoint.ThisTimeLineID;
5626                                                 prevTLI = checkPoint.PrevTimeLineID;
5627                                         }
5628                                         else if (info == XLOG_END_OF_RECOVERY)
5629                                         {
5630                                                 xl_end_of_recovery xlrec;
5631
5632                                                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
5633                                                 newTLI = xlrec.ThisTimeLineID;
5634                                                 prevTLI = xlrec.PrevTimeLineID;
5635                                         }
5636
5637                                         if (newTLI != ThisTimeLineID)
5638                                         {
5639                                                 /* Check that it's OK to switch to this TLI */
5640                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
5641
5642                                                 /* Following WAL records should be run with new TLI */
5643                                                 ThisTimeLineID = newTLI;
5644                                                 switchedTLI = true;
5645                                         }
5646                                 }
5647
5648                                 /*
5649                                  * Update shared replayEndRecPtr before replaying this record,
5650                                  * so that XLogFlush will update minRecoveryPoint correctly.
5651                                  */
5652                                 SpinLockAcquire(&xlogctl->info_lck);
5653                                 xlogctl->replayEndRecPtr = EndRecPtr;
5654                                 xlogctl->replayEndTLI = ThisTimeLineID;
5655                                 SpinLockRelease(&xlogctl->info_lck);
5656
5657                                 /*
5658                                  * If we are attempting to enter Hot Standby mode, process
5659                                  * XIDs we see
5660                                  */
5661                                 if (standbyState >= STANDBY_INITIALIZED &&
5662                                         TransactionIdIsValid(record->xl_xid))
5663                                         RecordKnownAssignedTransactionIds(record->xl_xid);
5664
5665                                 /* Now apply the WAL record itself */
5666                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
5667
5668                                 /* Pop the error context stack */
5669                                 error_context_stack = errcallback.previous;
5670
5671                                 /*
5672                                  * Update lastReplayedEndRecPtr after this record has been
5673                                  * successfully replayed.
5674                                  */
5675                                 SpinLockAcquire(&xlogctl->info_lck);
5676                                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
5677                                 xlogctl->lastReplayedTLI = ThisTimeLineID;
5678                                 SpinLockRelease(&xlogctl->info_lck);
5679
5680                                 /* Remember this record as the last-applied one */
5681                                 LastRec = ReadRecPtr;
5682
5683                                 /* Allow read-only connections if we're consistent now */
5684                                 CheckRecoveryConsistency();
5685
5686                                 /*
5687                                  * If this record was a timeline switch, wake up any
5688                                  * walsenders to notice that we are on a new timeline.
5689                                  */
5690                                 if (switchedTLI && AllowCascadeReplication())
5691                                         WalSndWakeup();
5692
5693                                 /* Exit loop if we reached inclusive recovery target */
5694                                 if (!recoveryContinue)
5695                                         break;
5696
5697                                 /* Else, try to fetch the next WAL record */
5698                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
5699                         } while (record != NULL);
5700
5701                         /*
5702                          * end of main redo apply loop
5703                          */
5704
5705                         ereport(LOG,
5706                                         (errmsg("redo done at %X/%X",
5707                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
5708                         xtime = GetLatestXTime();
5709                         if (xtime)
5710                                 ereport(LOG,
5711                                          (errmsg("last completed transaction was at log time %s",
5712                                                          timestamptz_to_str(xtime))));
5713                         InRedo = false;
5714                 }
5715                 else
5716                 {
5717                         /* there are no WAL records following the checkpoint */
5718                         ereport(LOG,
5719                                         (errmsg("redo is not required")));
5720                 }
5721         }
5722
5723         /*
5724          * Kill WAL receiver, if it's still running, before we continue to write
5725          * the startup checkpoint record. It will trump over the checkpoint and
5726          * subsequent records if it's still alive when we start writing WAL.
5727          */
5728         ShutdownWalRcv();
5729
5730         /*
5731          * We don't need the latch anymore. It's not strictly necessary to disown
5732          * it, but let's do it for the sake of tidiness.
5733          */
5734         if (StandbyModeRequested)
5735                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
5736
5737         /*
5738          * We are now done reading the xlog from stream. Turn off streaming
5739          * recovery to force fetching the files (which would be required at end of
5740          * recovery, e.g., timeline history file) from archive or pg_xlog.
5741          */
5742         StandbyMode = false;
5743
5744         /*
5745          * Re-fetch the last valid or last applied record, so we can identify the
5746          * exact endpoint of what we consider the valid portion of WAL.
5747          */
5748         record = ReadRecord(xlogreader, LastRec, PANIC, false);
5749         EndOfLog = EndRecPtr;
5750         XLByteToPrevSeg(EndOfLog, endLogSegNo);
5751
5752         /*
5753          * Complain if we did not roll forward far enough to render the backup
5754          * dump consistent.  Note: it is indeed okay to look at the local variable
5755          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
5756          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
5757          * advanced beyond the WAL we processed.
5758          */
5759         if (InRecovery &&
5760                 (EndOfLog < minRecoveryPoint ||
5761                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
5762         {
5763                 if (reachedStopPoint)
5764                 {
5765                         /* stopped because of stop request */
5766                         ereport(FATAL,
5767                                         (errmsg("requested recovery stop point is before consistent recovery point")));
5768                 }
5769
5770                 /*
5771                  * Ran off end of WAL before reaching end-of-backup WAL record, or
5772                  * minRecoveryPoint. That's usually a bad sign, indicating that you
5773                  * tried to recover from an online backup but never called
5774                  * pg_stop_backup(), or you didn't archive all the WAL up to that
5775                  * point. However, this also happens in crash recovery, if the system
5776                  * crashes while an online backup is in progress. We must not treat
5777                  * that as an error, or the database will refuse to start up.
5778                  */
5779                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
5780                 {
5781                         if (ControlFile->backupEndRequired)
5782                                 ereport(FATAL,
5783                                                 (errmsg("WAL ends before end of online backup"),
5784                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
5785                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
5786                                 ereport(FATAL,
5787                                                 (errmsg("WAL ends before end of online backup"),
5788                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
5789                         else
5790                                 ereport(FATAL,
5791                                           (errmsg("WAL ends before consistent recovery point")));
5792                 }
5793         }
5794
5795         /*
5796          * Consider whether we need to assign a new timeline ID.
5797          *
5798          * If we are doing an archive recovery, we always assign a new ID.      This
5799          * handles a couple of issues.  If we stopped short of the end of WAL
5800          * during recovery, then we are clearly generating a new timeline and must
5801          * assign it a unique new ID.  Even if we ran to the end, modifying the
5802          * current last segment is problematic because it may result in trying to
5803          * overwrite an already-archived copy of that segment, and we encourage
5804          * DBAs to make their archive_commands reject that.  We can dodge the
5805          * problem by making the new active segment have a new timeline ID.
5806          *
5807          * In a normal crash recovery, we can just extend the timeline we were in.
5808          */
5809         PrevTimeLineID = ThisTimeLineID;
5810         if (ArchiveRecoveryRequested)
5811         {
5812                 char            reason[200];
5813
5814                 Assert(InArchiveRecovery);
5815
5816                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
5817                 ereport(LOG,
5818                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
5819
5820                 /*
5821                  * Create a comment for the history file to explain why and where
5822                  * timeline changed.
5823                  */
5824                 if (recoveryTarget == RECOVERY_TARGET_XID)
5825                         snprintf(reason, sizeof(reason),
5826                                          "%s transaction %u",
5827                                          recoveryStopAfter ? "after" : "before",
5828                                          recoveryStopXid);
5829                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
5830                         snprintf(reason, sizeof(reason),
5831                                          "%s %s\n",
5832                                          recoveryStopAfter ? "after" : "before",
5833                                          timestamptz_to_str(recoveryStopTime));
5834                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
5835                         snprintf(reason, sizeof(reason),
5836                                          "at restore point \"%s\"",
5837                                          recoveryStopName);
5838                 else
5839                         snprintf(reason, sizeof(reason), "no recovery target specified");
5840
5841                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
5842                                                          EndRecPtr, reason);
5843         }
5844
5845         /* Save the selected TimeLineID in shared memory, too */
5846         XLogCtl->ThisTimeLineID = ThisTimeLineID;
5847         XLogCtl->PrevTimeLineID = PrevTimeLineID;
5848
5849         /*
5850          * We are now done reading the old WAL.  Turn off archive fetching if it
5851          * was active, and make a writable copy of the last WAL segment. (Note
5852          * that we also have a copy of the last block of the old WAL in readBuf;
5853          * we will use that below.)
5854          */
5855         if (ArchiveRecoveryRequested)
5856                 exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
5857
5858         /*
5859          * Prepare to write WAL starting at EndOfLog position, and init xlog
5860          * buffer cache using the block containing the last record from the
5861          * previous incarnation.
5862          */
5863         openLogSegNo = endLogSegNo;
5864         openLogFile = XLogFileOpen(openLogSegNo);
5865         openLogOff = 0;
5866         Insert = &XLogCtl->Insert;
5867         Insert->PrevRecord = LastRec;
5868         XLogCtl->xlblocks[0] = ((EndOfLog - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
5869
5870         /*
5871          * Tricky point here: readBuf contains the *last* block that the LastRec
5872          * record spans, not the one it starts in.      The last block is indeed the
5873          * one we want to use.
5874          */
5875         if (EndOfLog % XLOG_BLCKSZ == 0)
5876         {
5877                 memset(Insert->currpage, 0, XLOG_BLCKSZ);
5878         }
5879         else
5880         {
5881                 Assert(readOff == (XLogCtl->xlblocks[0] - XLOG_BLCKSZ) % XLogSegSize);
5882                 memcpy((char *) Insert->currpage, xlogreader->readBuf, XLOG_BLCKSZ);
5883         }
5884         Insert->currpos = (char *) Insert->currpage +
5885                 (EndOfLog + XLOG_BLCKSZ - XLogCtl->xlblocks[0]);
5886
5887         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
5888
5889         XLogCtl->LogwrtResult = LogwrtResult;
5890
5891         XLogCtl->LogwrtRqst.Write = EndOfLog;
5892         XLogCtl->LogwrtRqst.Flush = EndOfLog;
5893
5894         freespace = INSERT_FREESPACE(Insert);
5895         if (freespace > 0)
5896         {
5897                 /* Make sure rest of page is zero */
5898                 MemSet(Insert->currpos, 0, freespace);
5899                 XLogCtl->Write.curridx = 0;
5900         }
5901         else
5902         {
5903                 /*
5904                  * Whenever LogwrtResult points to exactly the end of a page,
5905                  * Write.curridx must point to the *next* page (see XLogWrite()).
5906                  *
5907                  * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
5908                  * this is sufficient.  The first actual attempt to insert a log
5909                  * record will advance the insert state.
5910                  */
5911                 XLogCtl->Write.curridx = NextBufIdx(0);
5912         }
5913
5914         /* Pre-scan prepared transactions to find out the range of XIDs present */
5915         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
5916
5917         /*
5918          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
5919          * record before resource manager writes cleanup WAL records or checkpoint
5920          * record is written.
5921          */
5922         Insert->fullPageWrites = lastFullPageWrites;
5923         LocalSetXLogInsertAllowed();
5924         UpdateFullPageWrites();
5925         LocalXLogInsertAllowed = -1;
5926
5927         if (InRecovery)
5928         {
5929                 int                     rmid;
5930
5931                 /*
5932                  * Resource managers might need to write WAL records, eg, to record
5933                  * index cleanup actions.  So temporarily enable XLogInsertAllowed in
5934                  * this process only.
5935                  */
5936                 LocalSetXLogInsertAllowed();
5937
5938                 /*
5939                  * Allow resource managers to do any required cleanup.
5940                  */
5941                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5942                 {
5943                         if (RmgrTable[rmid].rm_cleanup != NULL)
5944                                 RmgrTable[rmid].rm_cleanup();
5945                 }
5946
5947                 /* Disallow XLogInsert again */
5948                 LocalXLogInsertAllowed = -1;
5949
5950                 /*
5951                  * Perform a checkpoint to update all our recovery activity to disk.
5952                  *
5953                  * Note that we write a shutdown checkpoint rather than an on-line
5954                  * one. This is not particularly critical, but since we may be
5955                  * assigning a new TLI, using a shutdown checkpoint allows us to have
5956                  * the rule that TLI only changes in shutdown checkpoints, which
5957                  * allows some extra error checking in xlog_redo.
5958                  *
5959                  * In fast promotion, only create a lightweight end-of-recovery record
5960                  * instead of a full checkpoint. A checkpoint is requested later,
5961                  * after we're fully out of recovery mode and already accepting
5962                  * queries.
5963                  */
5964                 if (bgwriterLaunched)
5965                 {
5966                         if (fast_promote)
5967                         {
5968                                 checkPointLoc = ControlFile->prevCheckPoint;
5969
5970                                 /*
5971                                  * Confirm the last checkpoint is available for us to recover
5972                                  * from if we fail. Note that we don't check for the secondary
5973                                  * checkpoint since that isn't available in most base backups.
5974                                  */
5975                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
5976                                 if (record != NULL)
5977                                 {
5978                                         fast_promoted = true;
5979
5980                                         /*
5981                                          * Insert a special WAL record to mark the end of
5982                                          * recovery, since we aren't doing a checkpoint. That
5983                                          * means that the checkpointer process may likely be in
5984                                          * the middle of a time-smoothed restartpoint and could
5985                                          * continue to be for minutes after this. That sounds
5986                                          * strange, but the effect is roughly the same and it
5987                                          * would be stranger to try to come out of the
5988                                          * restartpoint and then checkpoint. We request a
5989                                          * checkpoint later anyway, just for safety.
5990                                          */
5991                                         CreateEndOfRecoveryRecord();
5992                                 }
5993                         }
5994
5995                         if (!fast_promoted)
5996                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
5997                                                                   CHECKPOINT_IMMEDIATE |
5998                                                                   CHECKPOINT_WAIT);
5999                 }
6000                 else
6001                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
6002
6003                 /*
6004                  * And finally, execute the recovery_end_command, if any.
6005                  */
6006                 if (recoveryEndCommand)
6007                         ExecuteRecoveryCommand(recoveryEndCommand,
6008                                                                    "recovery_end_command",
6009                                                                    true);
6010         }
6011
6012         /*
6013          * Preallocate additional log files, if wanted.
6014          */
6015         PreallocXlogFiles(EndOfLog);
6016
6017         /*
6018          * Reset initial contents of unlogged relations.  This has to be done
6019          * AFTER recovery is complete so that any unlogged relations created
6020          * during recovery also get picked up.
6021          */
6022         if (InRecovery)
6023                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
6024
6025         /*
6026          * Okay, we're officially UP.
6027          */
6028         InRecovery = false;
6029
6030         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6031         ControlFile->state = DB_IN_PRODUCTION;
6032         ControlFile->time = (pg_time_t) time(NULL);
6033         UpdateControlFile();
6034         LWLockRelease(ControlFileLock);
6035
6036         /* start the archive_timeout timer running */
6037         XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
6038
6039         /* also initialize latestCompletedXid, to nextXid - 1 */
6040         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
6041         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
6042         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
6043         LWLockRelease(ProcArrayLock);
6044
6045         /*
6046          * Start up the commit log and subtrans, if not already done for hot
6047          * standby.
6048          */
6049         if (standbyState == STANDBY_DISABLED)
6050         {
6051                 StartupCLOG();
6052                 StartupSUBTRANS(oldestActiveXID);
6053         }
6054
6055         /*
6056          * Perform end of recovery actions for any SLRUs that need it.
6057          */
6058         StartupMultiXact();
6059         TrimCLOG();
6060
6061         /* Reload shared-memory state for prepared transactions */
6062         RecoverPreparedTransactions();
6063
6064         /*
6065          * Shutdown the recovery environment. This must occur after
6066          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
6067          */
6068         if (standbyState != STANDBY_DISABLED)
6069                 ShutdownRecoveryTransactionEnvironment();
6070
6071         /* Shut down xlogreader */
6072         if (readFile >= 0)
6073         {
6074                 close(readFile);
6075                 readFile = -1;
6076         }
6077         XLogReaderFree(xlogreader);
6078
6079         /*
6080          * If any of the critical GUCs have changed, log them before we allow
6081          * backends to write WAL.
6082          */
6083         LocalSetXLogInsertAllowed();
6084         XLogReportParameters();
6085
6086         /*
6087          * All done.  Allow backends to write WAL.      (Although the bool flag is
6088          * probably atomic in itself, we use the info_lck here to ensure that
6089          * there are no race conditions concerning visibility of other recent
6090          * updates to shared memory.)
6091          */
6092         {
6093                 /* use volatile pointer to prevent code rearrangement */
6094                 volatile XLogCtlData *xlogctl = XLogCtl;
6095
6096                 SpinLockAcquire(&xlogctl->info_lck);
6097                 xlogctl->SharedRecoveryInProgress = false;
6098                 SpinLockRelease(&xlogctl->info_lck);
6099         }
6100
6101         /*
6102          * If there were cascading standby servers connected to us, nudge any wal
6103          * sender processes to notice that we've been promoted.
6104          */
6105         WalSndWakeup();
6106
6107         /*
6108          * If this was a fast promotion, request an (online) checkpoint now. This
6109          * isn't required for consistency, but the last restartpoint might be far
6110          * back, and in case of a crash, recovering from it might take longer
6111          * than is appropriate now that we're not in standby mode anymore.
6112          */
6113         if (fast_promoted)
6114                 RequestCheckpoint(CHECKPOINT_FORCE);
6115 }
6116
6117 /*
6118  * Checks if recovery has reached a consistent state. When consistency is
6119  * reached and we have a valid starting standby snapshot, tell postmaster
6120  * that it can start accepting read-only connections.
6121  */
6122 static void
6123 CheckRecoveryConsistency(void)
6124 {
6125         /*
6126          * During crash recovery, we don't reach a consistent state until we've
6127          * replayed all the WAL.
6128          */
6129         if (XLogRecPtrIsInvalid(minRecoveryPoint))
6130                 return;
6131
6132         /*
6133          * Have we reached the point where our base backup was completed?
6134          */
6135         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
6136                 ControlFile->backupEndPoint <= EndRecPtr)
6137         {
6138                 /*
6139                  * We have reached the end of base backup, as indicated by pg_control.
6140                  * The data on disk is now consistent. Reset backupStartPoint and
6141                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
6142                  * allow starting up at an earlier point even if recovery is stopped
6143                  * and restarted soon after this.
6144                  */
6145                 elog(DEBUG1, "end of backup reached");
6146
6147                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6148
6149                 if (ControlFile->minRecoveryPoint < EndRecPtr)
6150                         ControlFile->minRecoveryPoint = EndRecPtr;
6151
6152                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
6153                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
6154                 ControlFile->backupEndRequired = false;
6155                 UpdateControlFile();
6156
6157                 LWLockRelease(ControlFileLock);
6158         }
6159
6160         /*
6161          * Have we passed our safe starting point? Note that minRecoveryPoint is
6162          * known to be incorrectly set if ControlFile->backupEndRequired, until
6163          * the XLOG_BACKUP_RECORD arrives to advise us of the correct
6164          * minRecoveryPoint. All we know prior to that is that we're not
6165          * consistent yet.
6166          */
6167         if (!reachedConsistency && !ControlFile->backupEndRequired &&
6168                 minRecoveryPoint <= XLogCtl->lastReplayedEndRecPtr &&
6169                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
6170         {
6171                 /*
6172                  * Check to see if the XLOG sequence contained any unresolved
6173                  * references to uninitialized pages.
6174                  */
6175                 XLogCheckInvalidPages();
6176
6177                 reachedConsistency = true;
6178                 ereport(LOG,
6179                                 (errmsg("consistent recovery state reached at %X/%X",
6180                                                 (uint32) (XLogCtl->lastReplayedEndRecPtr >> 32),
6181                                                 (uint32) XLogCtl->lastReplayedEndRecPtr)));
6182         }
6183
6184         /*
6185          * Have we got a valid starting snapshot that will allow queries to be
6186          * run? If so, we can tell postmaster that the database is consistent now,
6187          * enabling connections.
6188          */
6189         if (standbyState == STANDBY_SNAPSHOT_READY &&
6190                 !LocalHotStandbyActive &&
6191                 reachedConsistency &&
6192                 IsUnderPostmaster)
6193         {
6194                 /* use volatile pointer to prevent code rearrangement */
6195                 volatile XLogCtlData *xlogctl = XLogCtl;
6196
6197                 SpinLockAcquire(&xlogctl->info_lck);
6198                 xlogctl->SharedHotStandbyActive = true;
6199                 SpinLockRelease(&xlogctl->info_lck);
6200
6201                 LocalHotStandbyActive = true;
6202
6203                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
6204         }
6205 }
6206
6207 /*
6208  * Is the system still in recovery?
6209  *
6210  * Unlike testing InRecovery, this works in any process that's connected to
6211  * shared memory.
6212  *
6213  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
6214  * variables the first time we see that recovery is finished.
6215  */
6216 bool
6217 RecoveryInProgress(void)
6218 {
6219         /*
6220          * We check shared state each time only until we leave recovery mode. We
6221          * can't re-enter recovery, so there's no need to keep checking after the
6222          * shared variable has once been seen false.
6223          */
6224         if (!LocalRecoveryInProgress)
6225                 return false;
6226         else
6227         {
6228                 /* use volatile pointer to prevent code rearrangement */
6229                 volatile XLogCtlData *xlogctl = XLogCtl;
6230
6231                 /* spinlock is essential on machines with weak memory ordering! */
6232                 SpinLockAcquire(&xlogctl->info_lck);
6233                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
6234                 SpinLockRelease(&xlogctl->info_lck);
6235
6236                 /*
6237                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
6238                  * is finished. InitPostgres() relies upon this behaviour to ensure
6239                  * that InitXLOGAccess() is called at backend startup.  (If you change
6240                  * this, see also LocalSetXLogInsertAllowed.)
6241                  */
6242                 if (!LocalRecoveryInProgress)
6243                         InitXLOGAccess();
6244
6245                 return LocalRecoveryInProgress;
6246         }
6247 }
6248
6249 /*
6250  * Is HotStandby active yet? This is only important in special backends
6251  * since normal backends won't ever be able to connect until this returns
6252  * true. Postmaster knows this by way of signal, not via shared memory.
6253  *
6254  * Unlike testing standbyState, this works in any process that's connected to
6255  * shared memory.
6256  */
6257 bool
6258 HotStandbyActive(void)
6259 {
6260         /*
6261          * We check shared state each time only until Hot Standby is active. We
6262          * can't de-activate Hot Standby, so there's no need to keep checking
6263          * after the shared variable has once been seen true.
6264          */
6265         if (LocalHotStandbyActive)
6266                 return true;
6267         else
6268         {
6269                 /* use volatile pointer to prevent code rearrangement */
6270                 volatile XLogCtlData *xlogctl = XLogCtl;
6271
6272                 /* spinlock is essential on machines with weak memory ordering! */
6273                 SpinLockAcquire(&xlogctl->info_lck);
6274                 LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
6275                 SpinLockRelease(&xlogctl->info_lck);
6276
6277                 return LocalHotStandbyActive;
6278         }
6279 }
6280
6281 /*
6282  * Is this process allowed to insert new WAL records?
6283  *
6284  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
6285  * But we also have provisions for forcing the result "true" or "false"
6286  * within specific processes regardless of the global state.
6287  */
6288 bool
6289 XLogInsertAllowed(void)
6290 {
6291         /*
6292          * If value is "unconditionally true" or "unconditionally false", just
6293          * return it.  This provides the normal fast path once recovery is known
6294          * done.
6295          */
6296         if (LocalXLogInsertAllowed >= 0)
6297                 return (bool) LocalXLogInsertAllowed;
6298
6299         /*
6300          * Else, must check to see if we're still in recovery.
6301          */
6302         if (RecoveryInProgress())
6303                 return false;
6304
6305         /*
6306          * On exit from recovery, reset to "unconditionally true", since there is
6307          * no need to keep checking.
6308          */
6309         LocalXLogInsertAllowed = 1;
6310         return true;
6311 }
6312
6313 /*
6314  * Make XLogInsertAllowed() return true in the current process only.
6315  *
6316  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
6317  * and even call LocalSetXLogInsertAllowed() again after that.
6318  */
6319 static void
6320 LocalSetXLogInsertAllowed(void)
6321 {
6322         Assert(LocalXLogInsertAllowed == -1);
6323         LocalXLogInsertAllowed = 1;
6324
6325         /* Initialize as RecoveryInProgress() would do when switching state */
6326         InitXLOGAccess();
6327 }
6328
6329 /*
6330  * Subroutine to try to fetch and validate a prior checkpoint record.
6331  *
6332  * whichChkpt identifies the checkpoint (merely for reporting purposes).
6333  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
6334  */
6335 static XLogRecord *
6336 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
6337                                          int whichChkpt, bool report)
6338 {
6339         XLogRecord *record;
6340
6341         if (!XRecOffIsValid(RecPtr))
6342         {
6343                 if (!report)
6344                         return NULL;
6345
6346                 switch (whichChkpt)
6347                 {
6348                         case 1:
6349                                 ereport(LOG,
6350                                 (errmsg("invalid primary checkpoint link in control file")));
6351                                 break;
6352                         case 2:
6353                                 ereport(LOG,
6354                                                 (errmsg("invalid secondary checkpoint link in control file")));
6355                                 break;
6356                         default:
6357                                 ereport(LOG,
6358                                    (errmsg("invalid checkpoint link in backup_label file")));
6359                                 break;
6360                 }
6361                 return NULL;
6362         }
6363
6364         record = ReadRecord(xlogreader, RecPtr, LOG, true);
6365
6366         if (record == NULL)
6367         {
6368                 if (!report)
6369                         return NULL;
6370
6371                 switch (whichChkpt)
6372                 {
6373                         case 1:
6374                                 ereport(LOG,
6375                                                 (errmsg("invalid primary checkpoint record")));
6376                                 break;
6377                         case 2:
6378                                 ereport(LOG,
6379                                                 (errmsg("invalid secondary checkpoint record")));
6380                                 break;
6381                         default:
6382                                 ereport(LOG,
6383                                                 (errmsg("invalid checkpoint record")));
6384                                 break;
6385                 }
6386                 return NULL;
6387         }
6388         if (record->xl_rmid != RM_XLOG_ID)
6389         {
6390                 switch (whichChkpt)
6391                 {
6392                         case 1:
6393                                 ereport(LOG,
6394                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
6395                                 break;
6396                         case 2:
6397                                 ereport(LOG,
6398                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
6399                                 break;
6400                         default:
6401                                 ereport(LOG,
6402                                 (errmsg("invalid resource manager ID in checkpoint record")));
6403                                 break;
6404                 }
6405                 return NULL;
6406         }
6407         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
6408                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
6409         {
6410                 switch (whichChkpt)
6411                 {
6412                         case 1:
6413                                 ereport(LOG,
6414                                    (errmsg("invalid xl_info in primary checkpoint record")));
6415                                 break;
6416                         case 2:
6417                                 ereport(LOG,
6418                                  (errmsg("invalid xl_info in secondary checkpoint record")));
6419                                 break;
6420                         default:
6421                                 ereport(LOG,
6422                                                 (errmsg("invalid xl_info in checkpoint record")));
6423                                 break;
6424                 }
6425                 return NULL;
6426         }
6427         if (record->xl_len != sizeof(CheckPoint) ||
6428                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
6429         {
6430                 switch (whichChkpt)
6431                 {
6432                         case 1:
6433                                 ereport(LOG,
6434                                         (errmsg("invalid length of primary checkpoint record")));
6435                                 break;
6436                         case 2:
6437                                 ereport(LOG,
6438                                   (errmsg("invalid length of secondary checkpoint record")));
6439                                 break;
6440                         default:
6441                                 ereport(LOG,
6442                                                 (errmsg("invalid length of checkpoint record")));
6443                                 break;
6444                 }
6445                 return NULL;
6446         }
6447         return record;
6448 }
6449
6450 /*
6451  * This must be called during startup of a backend process, except that
6452  * it need not be called in a standalone backend (which does StartupXLOG
6453  * instead).  We need to initialize the local copies of ThisTimeLineID and
6454  * RedoRecPtr.
6455  *
6456  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
6457  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
6458  * unnecessary however, since the postmaster itself never touches XLOG anyway.
6459  */
6460 void
6461 InitXLOGAccess(void)
6462 {
6463         /* ThisTimeLineID doesn't change so we need no lock to copy it */
6464         ThisTimeLineID = XLogCtl->ThisTimeLineID;
6465         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
6466
6467         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
6468         (void) GetRedoRecPtr();
6469 }
6470
6471 /*
6472  * Once spawned, a backend may update its local RedoRecPtr from
6473  * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
6474  * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
6475  */
6476 XLogRecPtr
6477 GetRedoRecPtr(void)
6478 {
6479         /* use volatile pointer to prevent code rearrangement */
6480         volatile XLogCtlData *xlogctl = XLogCtl;
6481
6482         SpinLockAcquire(&xlogctl->info_lck);
6483         Assert(RedoRecPtr <= xlogctl->Insert.RedoRecPtr);
6484         RedoRecPtr = xlogctl->Insert.RedoRecPtr;
6485         SpinLockRelease(&xlogctl->info_lck);
6486
6487         return RedoRecPtr;
6488 }
6489
6490 /*
6491  * GetInsertRecPtr -- Returns the current insert position.
6492  *
6493  * NOTE: The value *actually* returned is the position of the last full
6494  * xlog page. It lags behind the real insert position by at most 1 page.
6495  * For that, we don't need to acquire WALInsertLock which can be quite
6496  * heavily contended, and an approximation is enough for the current
6497  * usage of this function.
6498  */
6499 XLogRecPtr
6500 GetInsertRecPtr(void)
6501 {
6502         /* use volatile pointer to prevent code rearrangement */
6503         volatile XLogCtlData *xlogctl = XLogCtl;
6504         XLogRecPtr      recptr;
6505
6506         SpinLockAcquire(&xlogctl->info_lck);
6507         recptr = xlogctl->LogwrtRqst.Write;
6508         SpinLockRelease(&xlogctl->info_lck);
6509
6510         return recptr;
6511 }
6512
6513 /*
6514  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
6515  * position known to be fsync'd to disk.
6516  */
6517 XLogRecPtr
6518 GetFlushRecPtr(void)
6519 {
6520         /* use volatile pointer to prevent code rearrangement */
6521         volatile XLogCtlData *xlogctl = XLogCtl;
6522         XLogRecPtr      recptr;
6523
6524         SpinLockAcquire(&xlogctl->info_lck);
6525         recptr = xlogctl->LogwrtResult.Flush;
6526         SpinLockRelease(&xlogctl->info_lck);
6527
6528         return recptr;
6529 }
6530
6531 /*
6532  * Get the time of the last xlog segment switch
6533  */
6534 pg_time_t
6535 GetLastSegSwitchTime(void)
6536 {
6537         pg_time_t       result;
6538
6539         /* Need WALWriteLock, but shared lock is sufficient */
6540         LWLockAcquire(WALWriteLock, LW_SHARED);
6541         result = XLogCtl->Write.lastSegSwitchTime;
6542         LWLockRelease(WALWriteLock);
6543
6544         return result;
6545 }
6546
6547 /*
6548  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
6549  *
6550  * This is exported for use by code that would like to have 64-bit XIDs.
6551  * We don't really support such things, but all XIDs within the system
6552  * can be presumed "close to" the result, and thus the epoch associated
6553  * with them can be determined.
6554  */
6555 void
6556 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
6557 {
6558         uint32          ckptXidEpoch;
6559         TransactionId ckptXid;
6560         TransactionId nextXid;
6561
6562         /* Must read checkpoint info first, else have race condition */
6563         {
6564                 /* use volatile pointer to prevent code rearrangement */
6565                 volatile XLogCtlData *xlogctl = XLogCtl;
6566
6567                 SpinLockAcquire(&xlogctl->info_lck);
6568                 ckptXidEpoch = xlogctl->ckptXidEpoch;
6569                 ckptXid = xlogctl->ckptXid;
6570                 SpinLockRelease(&xlogctl->info_lck);
6571         }
6572
6573         /* Now fetch current nextXid */
6574         nextXid = ReadNewTransactionId();
6575
6576         /*
6577          * nextXid is certainly logically later than ckptXid.  So if it's
6578          * numerically less, it must have wrapped into the next epoch.
6579          */
6580         if (nextXid < ckptXid)
6581                 ckptXidEpoch++;
6582
6583         *xid = nextXid;
6584         *epoch = ckptXidEpoch;
6585 }
6586
6587 /*
6588  * This must be called ONCE during postmaster or standalone-backend shutdown
6589  */
6590 void
6591 ShutdownXLOG(int code, Datum arg)
6592 {
6593         ereport(LOG,
6594                         (errmsg("shutting down")));
6595
6596         if (RecoveryInProgress())
6597                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6598         else
6599         {
6600                 /*
6601                  * If archiving is enabled, rotate the last XLOG file so that all the
6602                  * remaining records are archived (postmaster wakes up the archiver
6603                  * process one more time at the end of shutdown). The checkpoint
6604                  * record will go to the next XLOG file and won't be archived (yet).
6605                  */
6606                 if (XLogArchivingActive() && XLogArchiveCommandSet())
6607                         RequestXLogSwitch();
6608
6609                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6610         }
6611         ShutdownCLOG();
6612         ShutdownSUBTRANS();
6613         ShutdownMultiXact();
6614
6615         ereport(LOG,
6616                         (errmsg("database system is shut down")));
6617 }
6618
6619 /*
6620  * Log start of a checkpoint.
6621  */
6622 static void
6623 LogCheckpointStart(int flags, bool restartpoint)
6624 {
6625         const char *msg;
6626
6627         /*
6628          * XXX: This is hopelessly untranslatable. We could call gettext_noop for
6629          * the main message, but what about all the flags?
6630          */
6631         if (restartpoint)
6632                 msg = "restartpoint starting:%s%s%s%s%s%s%s";
6633         else
6634                 msg = "checkpoint starting:%s%s%s%s%s%s%s";
6635
6636         elog(LOG, msg,
6637                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6638                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6639                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
6640                  (flags & CHECKPOINT_FORCE) ? " force" : "",
6641                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
6642                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
6643                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
6644 }
6645
6646 /*
6647  * Log end of a checkpoint.
6648  */
6649 static void
6650 LogCheckpointEnd(bool restartpoint)
6651 {
6652         long            write_secs,
6653                                 sync_secs,
6654                                 total_secs,
6655                                 longest_secs,
6656                                 average_secs;
6657         int                     write_usecs,
6658                                 sync_usecs,
6659                                 total_usecs,
6660                                 longest_usecs,
6661                                 average_usecs;
6662         uint64          average_sync_time;
6663
6664         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
6665
6666         TimestampDifference(CheckpointStats.ckpt_write_t,
6667                                                 CheckpointStats.ckpt_sync_t,
6668                                                 &write_secs, &write_usecs);
6669
6670         TimestampDifference(CheckpointStats.ckpt_sync_t,
6671                                                 CheckpointStats.ckpt_sync_end_t,
6672                                                 &sync_secs, &sync_usecs);
6673
6674         /* Accumulate checkpoint timing summary data, in milliseconds. */
6675         BgWriterStats.m_checkpoint_write_time +=
6676                 write_secs * 1000 + write_usecs / 1000;
6677         BgWriterStats.m_checkpoint_sync_time +=
6678                 sync_secs * 1000 + sync_usecs / 1000;
6679
6680         /*
6681          * All of the published timing statistics are accounted for.  Only
6682          * continue if a log message is to be written.
6683          */
6684         if (!log_checkpoints)
6685                 return;
6686
6687         TimestampDifference(CheckpointStats.ckpt_start_t,
6688                                                 CheckpointStats.ckpt_end_t,
6689                                                 &total_secs, &total_usecs);
6690
6691         /*
6692          * Timing values returned from CheckpointStats are in microseconds.
6693          * Convert to the second plus microsecond form that TimestampDifference
6694          * returns for homogeneous printing.
6695          */
6696         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
6697         longest_usecs = CheckpointStats.ckpt_longest_sync -
6698                 (uint64) longest_secs *1000000;
6699
6700         average_sync_time = 0;
6701         if (CheckpointStats.ckpt_sync_rels > 0)
6702                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
6703                         CheckpointStats.ckpt_sync_rels;
6704         average_secs = (long) (average_sync_time / 1000000);
6705         average_usecs = average_sync_time - (uint64) average_secs *1000000;
6706
6707         if (restartpoint)
6708                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
6709                          "%d transaction log file(s) added, %d removed, %d recycled; "
6710                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
6711                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
6712                          CheckpointStats.ckpt_bufs_written,
6713                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6714                          CheckpointStats.ckpt_segs_added,
6715                          CheckpointStats.ckpt_segs_removed,
6716                          CheckpointStats.ckpt_segs_recycled,
6717                          write_secs, write_usecs / 1000,
6718                          sync_secs, sync_usecs / 1000,
6719                          total_secs, total_usecs / 1000,
6720                          CheckpointStats.ckpt_sync_rels,
6721                          longest_secs, longest_usecs / 1000,
6722                          average_secs, average_usecs / 1000);
6723         else
6724                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
6725                          "%d transaction log file(s) added, %d removed, %d recycled; "
6726                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
6727                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
6728                          CheckpointStats.ckpt_bufs_written,
6729                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6730                          CheckpointStats.ckpt_segs_added,
6731                          CheckpointStats.ckpt_segs_removed,
6732                          CheckpointStats.ckpt_segs_recycled,
6733                          write_secs, write_usecs / 1000,
6734                          sync_secs, sync_usecs / 1000,
6735                          total_secs, total_usecs / 1000,
6736                          CheckpointStats.ckpt_sync_rels,
6737                          longest_secs, longest_usecs / 1000,
6738                          average_secs, average_usecs / 1000);
6739 }
6740
6741 /*
6742  * Perform a checkpoint --- either during shutdown, or on-the-fly
6743  *
6744  * flags is a bitwise OR of the following:
6745  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
6746  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
6747  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
6748  *              ignoring checkpoint_completion_target parameter.
6749  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
6750  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
6751  *              CHECKPOINT_END_OF_RECOVERY).
6752  *
6753  * Note: flags contains other bits, of interest here only for logging purposes.
6754  * In particular note that this routine is synchronous and does not pay
6755  * attention to CHECKPOINT_WAIT.
6756  *
6757  * If !shutdown then we are writing an online checkpoint. This is a very special
6758  * kind of operation and WAL record because the checkpoint action occurs over
6759  * a period of time yet logically occurs at just a single LSN. The logical
6760  * position of the WAL record (redo ptr) is the same or earlier than the
6761  * physical position. When we replay WAL we locate the checkpoint via its
6762  * physical position then read the redo ptr and actually start replay at the
6763  * earlier logical position. Note that we don't write *anything* to WAL at
6764  * the logical position, so that location could be any other kind of WAL record.
6765  * All of this mechanism allows us to continue working while we checkpoint.
6766  * As a result, timing of actions is critical here and be careful to note that
6767  * this function will likely take minutes to execute on a busy system.
6768  */
6769 void
6770 CreateCheckPoint(int flags)
6771 {
6772         bool            shutdown;
6773         CheckPoint      checkPoint;
6774         XLogRecPtr      recptr;
6775         XLogCtlInsert *Insert = &XLogCtl->Insert;
6776         XLogRecData rdata;
6777         uint32          freespace;
6778         XLogSegNo       _logSegNo;
6779         VirtualTransactionId *vxids;
6780         int                     nvxids;
6781
6782         /*
6783          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
6784          * issued at a different time.
6785          */
6786         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
6787                 shutdown = true;
6788         else
6789                 shutdown = false;
6790
6791         /* sanity check */
6792         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
6793                 elog(ERROR, "can't create a checkpoint during recovery");
6794
6795         /*
6796          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
6797          * (This is just pro forma, since in the present system structure there is
6798          * only one process that is allowed to issue checkpoints at any given
6799          * time.)
6800          */
6801         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
6802
6803         /*
6804          * Prepare to accumulate statistics.
6805          *
6806          * Note: because it is possible for log_checkpoints to change while a
6807          * checkpoint proceeds, we always accumulate stats, even if
6808          * log_checkpoints is currently off.
6809          */
6810         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
6811         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
6812
6813         /*
6814          * Use a critical section to force system panic if we have trouble.
6815          */
6816         START_CRIT_SECTION();
6817
6818         if (shutdown)
6819         {
6820                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6821                 ControlFile->state = DB_SHUTDOWNING;
6822                 ControlFile->time = (pg_time_t) time(NULL);
6823                 UpdateControlFile();
6824                 LWLockRelease(ControlFileLock);
6825         }
6826
6827         /*
6828          * Let smgr prepare for checkpoint; this has to happen before we determine
6829          * the REDO pointer.  Note that smgr must not do anything that'd have to
6830          * be undone if we decide no checkpoint is needed.
6831          */
6832         smgrpreckpt();
6833
6834         /* Begin filling in the checkpoint WAL record */
6835         MemSet(&checkPoint, 0, sizeof(checkPoint));
6836         checkPoint.time = (pg_time_t) time(NULL);
6837
6838         /*
6839          * For Hot Standby, derive the oldestActiveXid before we fix the redo
6840          * pointer. This allows us to begin accumulating changes to assemble our
6841          * starting snapshot of locks and transactions.
6842          */
6843         if (!shutdown && XLogStandbyInfoActive())
6844                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
6845         else
6846                 checkPoint.oldestActiveXid = InvalidTransactionId;
6847
6848         /*
6849          * We must hold WALInsertLock while examining insert state to determine
6850          * the checkpoint REDO pointer.
6851          */
6852         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
6853
6854         /*
6855          * If this isn't a shutdown or forced checkpoint, and we have not inserted
6856          * any XLOG records since the start of the last checkpoint, skip the
6857          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
6858          * when the system is idle. That wastes log space, and more importantly it
6859          * exposes us to possible loss of both current and previous checkpoint
6860          * records if the machine crashes just as we're writing the update.
6861          * (Perhaps it'd make even more sense to checkpoint only when the previous
6862          * checkpoint record is in a different xlog page?)
6863          *
6864          * We have to make two tests to determine that nothing has happened since
6865          * the start of the last checkpoint: current insertion point must match
6866          * the end of the last checkpoint record, and its redo pointer must point
6867          * to itself.
6868          */
6869         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
6870                                   CHECKPOINT_FORCE)) == 0)
6871         {
6872                 XLogRecPtr      curInsert;
6873
6874                 INSERT_RECPTR(curInsert, Insert, Insert->curridx);
6875                 if (curInsert == ControlFile->checkPoint +
6876                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
6877                         ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
6878                 {
6879                         LWLockRelease(WALInsertLock);
6880                         LWLockRelease(CheckpointLock);
6881                         END_CRIT_SECTION();
6882                         return;
6883                 }
6884         }
6885
6886         /*
6887          * An end-of-recovery checkpoint is created before anyone is allowed to
6888          * write WAL. To allow us to write the checkpoint record, temporarily
6889          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
6890          * initialized, which we need here and in AdvanceXLInsertBuffer.)
6891          */
6892         if (flags & CHECKPOINT_END_OF_RECOVERY)
6893                 LocalSetXLogInsertAllowed();
6894
6895         checkPoint.ThisTimeLineID = ThisTimeLineID;
6896         if (flags & CHECKPOINT_END_OF_RECOVERY)
6897                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
6898         else
6899                 checkPoint.PrevTimeLineID = ThisTimeLineID;
6900
6901         checkPoint.fullPageWrites = Insert->fullPageWrites;
6902
6903         /*
6904          * Compute new REDO record ptr = location of next XLOG record.
6905          *
6906          * NB: this is NOT necessarily where the checkpoint record itself will be,
6907          * since other backends may insert more XLOG records while we're off doing
6908          * the buffer flush work.  Those XLOG records are logically after the
6909          * checkpoint, even though physically before it.  Got that?
6910          */
6911         freespace = INSERT_FREESPACE(Insert);
6912         if (freespace == 0)
6913         {
6914                 (void) AdvanceXLInsertBuffer(false);
6915                 /* OK to ignore update return flag, since we will do flush anyway */
6916                 freespace = INSERT_FREESPACE(Insert);
6917         }
6918         INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
6919
6920         /*
6921          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
6922          * must be done while holding the insert lock AND the info_lck.
6923          *
6924          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
6925          * pointing past where it really needs to point.  This is okay; the only
6926          * consequence is that XLogInsert might back up whole buffers that it
6927          * didn't really need to.  We can't postpone advancing RedoRecPtr because
6928          * XLogInserts that happen while we are dumping buffers must assume that
6929          * their buffer changes are not included in the checkpoint.
6930          */
6931         {
6932                 /* use volatile pointer to prevent code rearrangement */
6933                 volatile XLogCtlData *xlogctl = XLogCtl;
6934
6935                 SpinLockAcquire(&xlogctl->info_lck);
6936                 RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
6937                 SpinLockRelease(&xlogctl->info_lck);
6938         }
6939
6940         /*
6941          * Now we can release WAL insert lock, allowing other xacts to proceed
6942          * while we are flushing disk buffers.
6943          */
6944         LWLockRelease(WALInsertLock);
6945
6946         /*
6947          * If enabled, log checkpoint start.  We postpone this until now so as not
6948          * to log anything if we decided to skip the checkpoint.
6949          */
6950         if (log_checkpoints)
6951                 LogCheckpointStart(flags, false);
6952
6953         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
6954
6955         /*
6956          * In some cases there are groups of actions that must all occur on one
6957          * side or the other of a checkpoint record. Before flushing the
6958          * checkpoint record we must explicitly wait for any backend currently
6959          * performing those groups of actions.
6960          *
6961          * One example is end of transaction, so we must wait for any transactions
6962          * that are currently in commit critical sections.      If an xact inserted
6963          * its commit record into XLOG just before the REDO point, then a crash
6964          * restart from the REDO point would not replay that record, which means
6965          * that our flushing had better include the xact's update of pg_clog.  So
6966          * we wait till he's out of his commit critical section before proceeding.
6967          * See notes in RecordTransactionCommit().
6968          *
6969          * Because we've already released WALInsertLock, this test is a bit fuzzy:
6970          * it is possible that we will wait for xacts we didn't really need to
6971          * wait for.  But the delay should be short and it seems better to make
6972          * checkpoint take a bit longer than to hold locks longer than necessary.
6973          * (In fact, the whole reason we have this issue is that xact.c does
6974          * commit record XLOG insertion and clog update as two separate steps
6975          * protected by different locks, but again that seems best on grounds of
6976          * minimizing lock contention.)
6977          *
6978          * A transaction that has not yet set delayChkpt when we look cannot be at
6979          * risk, since he's not inserted his commit record yet; and one that's
6980          * already cleared it is not at risk either, since he's done fixing clog
6981          * and we will correctly flush the update below.  So we cannot miss any
6982          * xacts we need to wait for.
6983          */
6984         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
6985         if (nvxids > 0)
6986         {
6987                 uint32          nwaits = 0;
6988
6989                 do
6990                 {
6991                         pg_usleep(10000L);      /* wait for 10 msec */
6992                         nwaits++;
6993                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
6994         }
6995         pfree(vxids);
6996
6997         /*
6998          * Get the other info we need for the checkpoint record.
6999          */
7000         LWLockAcquire(XidGenLock, LW_SHARED);
7001         checkPoint.nextXid = ShmemVariableCache->nextXid;
7002         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
7003         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
7004         LWLockRelease(XidGenLock);
7005
7006         /* Increase XID epoch if we've wrapped around since last checkpoint */
7007         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
7008         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
7009                 checkPoint.nextXidEpoch++;
7010
7011         LWLockAcquire(OidGenLock, LW_SHARED);
7012         checkPoint.nextOid = ShmemVariableCache->nextOid;
7013         if (!shutdown)
7014                 checkPoint.nextOid += ShmemVariableCache->oidCount;
7015         LWLockRelease(OidGenLock);
7016
7017         MultiXactGetCheckptMulti(shutdown,
7018                                                          &checkPoint.nextMulti,
7019                                                          &checkPoint.nextMultiOffset,
7020                                                          &checkPoint.oldestMulti,
7021                                                          &checkPoint.oldestMultiDB);
7022
7023         /*
7024          * Having constructed the checkpoint record, ensure all shmem disk buffers
7025          * and commit-log buffers are flushed to disk.
7026          *
7027          * This I/O could fail for various reasons.  If so, we will fail to
7028          * complete the checkpoint, but there is no reason to force a system
7029          * panic. Accordingly, exit critical section while doing it.
7030          */
7031         END_CRIT_SECTION();
7032
7033         CheckPointGuts(checkPoint.redo, flags);
7034
7035         /*
7036          * Take a snapshot of running transactions and write this to WAL. This
7037          * allows us to reconstruct the state of running transactions during
7038          * archive recovery, if required. Skip, if this info disabled.
7039          *
7040          * If we are shutting down, or Startup process is completing crash
7041          * recovery we don't need to write running xact data.
7042          */
7043         if (!shutdown && XLogStandbyInfoActive())
7044                 LogStandbySnapshot();
7045
7046         START_CRIT_SECTION();
7047
7048         /*
7049          * Now insert the checkpoint record into XLOG.
7050          */
7051         rdata.data = (char *) (&checkPoint);
7052         rdata.len = sizeof(checkPoint);
7053         rdata.buffer = InvalidBuffer;
7054         rdata.next = NULL;
7055
7056         recptr = XLogInsert(RM_XLOG_ID,
7057                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
7058                                                 XLOG_CHECKPOINT_ONLINE,
7059                                                 &rdata);
7060
7061         XLogFlush(recptr);
7062
7063         /*
7064          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
7065          * overwritten at next startup.  No-one should even try, this just allows
7066          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
7067          * to just temporarily disable writing until the system has exited
7068          * recovery.
7069          */
7070         if (shutdown)
7071         {
7072                 if (flags & CHECKPOINT_END_OF_RECOVERY)
7073                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
7074                 else
7075                         LocalXLogInsertAllowed = 0; /* never again write WAL */
7076         }
7077
7078         /*
7079          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
7080          * = end of actual checkpoint record.
7081          */
7082         if (shutdown && checkPoint.redo != ProcLastRecPtr)
7083                 ereport(PANIC,
7084                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
7085
7086         /*
7087          * Select point at which we can truncate the log, which we base on the
7088          * prior checkpoint's earliest info.
7089          */
7090         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
7091
7092         /*
7093          * Update the control file.
7094          */
7095         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7096         if (shutdown)
7097                 ControlFile->state = DB_SHUTDOWNED;
7098         ControlFile->prevCheckPoint = ControlFile->checkPoint;
7099         ControlFile->checkPoint = ProcLastRecPtr;
7100         ControlFile->checkPointCopy = checkPoint;
7101         ControlFile->time = (pg_time_t) time(NULL);
7102         /* crash recovery should always recover to the end of WAL */
7103         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
7104         ControlFile->minRecoveryPointTLI = 0;
7105
7106         /*
7107          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
7108          * unused on non-shutdown checkpoints, but seems useful to store it always
7109          * for debugging purposes.
7110          */
7111         SpinLockAcquire(&XLogCtl->ulsn_lck);
7112         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
7113         SpinLockRelease(&XLogCtl->ulsn_lck);
7114
7115         UpdateControlFile();
7116         LWLockRelease(ControlFileLock);
7117
7118         /* Update shared-memory copy of checkpoint XID/epoch */
7119         {
7120                 /* use volatile pointer to prevent code rearrangement */
7121                 volatile XLogCtlData *xlogctl = XLogCtl;
7122
7123                 SpinLockAcquire(&xlogctl->info_lck);
7124                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
7125                 xlogctl->ckptXid = checkPoint.nextXid;
7126                 SpinLockRelease(&xlogctl->info_lck);
7127         }
7128
7129         /*
7130          * We are now done with critical updates; no need for system panic if we
7131          * have trouble while fooling with old log segments.
7132          */
7133         END_CRIT_SECTION();
7134
7135         /*
7136          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
7137          */
7138         smgrpostckpt();
7139
7140         /*
7141          * Delete old log files (those no longer needed even for previous
7142          * checkpoint or the standbys in XLOG streaming).
7143          */
7144         if (_logSegNo)
7145         {
7146                 KeepLogSeg(recptr, &_logSegNo);
7147                 _logSegNo--;
7148                 RemoveOldXlogFiles(_logSegNo, recptr);
7149         }
7150
7151         /*
7152          * Make more log segments if needed.  (Do this after recycling old log
7153          * segments, since that may supply some of the needed files.)
7154          */
7155         if (!shutdown)
7156                 PreallocXlogFiles(recptr);
7157
7158         /*
7159          * Truncate pg_subtrans if possible.  We can throw away all data before
7160          * the oldest XMIN of any running transaction.  No future transaction will
7161          * attempt to reference any pg_subtrans entry older than that (see Asserts
7162          * in subtrans.c).      During recovery, though, we mustn't do this because
7163          * StartupSUBTRANS hasn't been called yet.
7164          */
7165         if (!RecoveryInProgress())
7166                 TruncateSUBTRANS(GetOldestXmin(true, false));
7167
7168         /* Real work is done, but log and update stats before releasing lock. */
7169         LogCheckpointEnd(false);
7170
7171         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
7172                                                                          NBuffers,
7173                                                                          CheckpointStats.ckpt_segs_added,
7174                                                                          CheckpointStats.ckpt_segs_removed,
7175                                                                          CheckpointStats.ckpt_segs_recycled);
7176
7177         LWLockRelease(CheckpointLock);
7178 }
7179
7180 /*
7181  * Mark the end of recovery in WAL though without running a full checkpoint.
7182  * We can expect that a restartpoint is likely to be in progress as we
7183  * do this, though we are unwilling to wait for it to complete. So be
7184  * careful to avoid taking the CheckpointLock anywhere here.
7185  *
7186  * CreateRestartPoint() allows for the case where recovery may end before
7187  * the restartpoint completes so there is no concern of concurrent behaviour.
7188  */
7189 void
7190 CreateEndOfRecoveryRecord(void)
7191 {
7192         xl_end_of_recovery xlrec;
7193         XLogRecData rdata;
7194         XLogRecPtr      recptr;
7195
7196         /* sanity check */
7197         if (!RecoveryInProgress())
7198                 elog(ERROR, "can only be used to end recovery");
7199
7200         xlrec.end_time = time(NULL);
7201
7202         LWLockAcquire(WALInsertLock, LW_SHARED);
7203         xlrec.ThisTimeLineID = ThisTimeLineID;
7204         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
7205         LWLockRelease(WALInsertLock);
7206
7207         LocalSetXLogInsertAllowed();
7208
7209         START_CRIT_SECTION();
7210
7211         rdata.data = (char *) &xlrec;
7212         rdata.len = sizeof(xl_end_of_recovery);
7213         rdata.buffer = InvalidBuffer;
7214         rdata.next = NULL;
7215
7216         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
7217
7218         XLogFlush(recptr);
7219
7220         /*
7221          * Update the control file so that crash recovery can follow the timeline
7222          * changes to this point.
7223          */
7224         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7225         ControlFile->time = (pg_time_t) xlrec.end_time;
7226         ControlFile->minRecoveryPoint = recptr;
7227         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
7228         UpdateControlFile();
7229         LWLockRelease(ControlFileLock);
7230
7231         END_CRIT_SECTION();
7232
7233         LocalXLogInsertAllowed = -1;    /* return to "check" state */
7234 }
7235
/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints.
 *
 * checkPointRedo is handed to CheckPointTwoPhase() only; flags is handed to
 * CheckPointBuffers() only.  The remaining per-module checkpoint routines
 * take no arguments.  The calls are made strictly in the order written.
 */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
	/* Per-module checkpoint routines that need no arguments. */
	CheckPointCLOG();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
	CheckPointPredicate();
	CheckPointRelationMap();
	CheckPointBuffers(flags);	/* performs all required fsyncs */
	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}
7254
7255 /*
7256  * Save a checkpoint for recovery restart if appropriate
7257  *
7258  * This function is called each time a checkpoint record is read from XLOG.
7259  * It must determine whether the checkpoint represents a safe restartpoint or
7260  * not.  If so, the checkpoint record is stashed in shared memory so that
7261  * CreateRestartPoint can consult it.  (Note that the latter function is
7262  * executed by the checkpointer, while this one will be executed by the
7263  * startup process.)
7264  */
7265 static void
7266 RecoveryRestartPoint(const CheckPoint *checkPoint)
7267 {
7268         int                     rmid;
7269
7270         /* use volatile pointer to prevent code rearrangement */
7271         volatile XLogCtlData *xlogctl = XLogCtl;
7272
7273         /*
7274          * Is it safe to restartpoint?  We must ask each of the resource managers
7275          * whether they have any partial state information that might prevent a
7276          * correct restart from this point.  If so, we skip this opportunity, but
7277          * return at the next checkpoint record for another try.
7278          */
7279         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7280         {
7281                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
7282                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
7283                         {
7284                                 elog(trace_recovery(DEBUG2),
7285                                          "RM %d not safe to record restart point at %X/%X",
7286                                          rmid,
7287                                          (uint32) (checkPoint->redo >> 32),
7288                                          (uint32) checkPoint->redo);
7289                                 return;
7290                         }
7291         }
7292
7293         /*
7294          * Also refrain from creating a restartpoint if we have seen any
7295          * references to non-existent pages. Restarting recovery from the
7296          * restartpoint would not see the references, so we would lose the
7297          * cross-check that the pages belonged to a relation that was dropped
7298          * later.
7299          */
7300         if (XLogHaveInvalidPages())
7301         {
7302                 elog(trace_recovery(DEBUG2),
7303                          "could not record restart point at %X/%X because there "
7304                          "are unresolved references to invalid pages",
7305                          (uint32) (checkPoint->redo >> 32),
7306                          (uint32) checkPoint->redo);
7307                 return;
7308         }
7309
7310         /*
7311          * Copy the checkpoint record to shared memory, so that checkpointer can
7312          * work out the next time it wants to perform a restartpoint.
7313          */
7314         SpinLockAcquire(&xlogctl->info_lck);
7315         xlogctl->lastCheckPointRecPtr = ReadRecPtr;
7316         xlogctl->lastCheckPoint = *checkPoint;
7317         SpinLockRelease(&xlogctl->info_lck);
7318 }
7319
7320 /*
7321  * Establish a restartpoint if possible.
7322  *
7323  * This is similar to CreateCheckPoint, but is used during WAL recovery
7324  * to establish a point from which recovery can roll forward without
7325  * replaying the entire recovery log.
7326  *
7327  * Returns true if a new restartpoint was established. We can only establish
7328  * a restartpoint if we have replayed a safe checkpoint record since last
7329  * restartpoint.
7330  */
7331 bool
7332 CreateRestartPoint(int flags)
7333 {
7334         XLogRecPtr      lastCheckPointRecPtr;
7335         CheckPoint      lastCheckPoint;
7336         XLogSegNo       _logSegNo;
7337         TimestampTz xtime;
7338
7339         /* use volatile pointer to prevent code rearrangement */
7340         volatile XLogCtlData *xlogctl = XLogCtl;
7341
7342         /*
7343          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
7344          * happens at a time.
7345          */
7346         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
7347
7348         /* Get a local copy of the last safe checkpoint record. */
7349         SpinLockAcquire(&xlogctl->info_lck);
7350         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
7351         lastCheckPoint = xlogctl->lastCheckPoint;
7352         SpinLockRelease(&xlogctl->info_lck);
7353
7354         /*
7355          * Check that we're still in recovery mode. It's ok if we exit recovery
7356          * mode after this check, the restart point is valid anyway.
7357          */
7358         if (!RecoveryInProgress())
7359         {
7360                 ereport(DEBUG2,
7361                           (errmsg("skipping restartpoint, recovery has already ended")));
7362                 LWLockRelease(CheckpointLock);
7363                 return false;
7364         }
7365
7366         /*
7367          * If the last checkpoint record we've replayed is already our last
7368          * restartpoint, we can't perform a new restart point. We still update
7369          * minRecoveryPoint in that case, so that if this is a shutdown restart
7370          * point, we won't start up earlier than before. That's not strictly
7371          * necessary, but when hot standby is enabled, it would be rather weird if
7372          * the database opened up for read-only connections at a point-in-time
7373          * before the last shutdown. Such time travel is still possible in case of
7374          * immediate shutdown, though.
7375          *
7376          * We don't explicitly advance minRecoveryPoint when we do create a
7377          * restartpoint. It's assumed that flushing the buffers will do that as a
7378          * side-effect.
7379          */
7380         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
7381                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
7382         {
7383                 ereport(DEBUG2,
7384                                 (errmsg("skipping restartpoint, already performed at %X/%X",
7385                                                 (uint32) (lastCheckPoint.redo >> 32),
7386                                                 (uint32) lastCheckPoint.redo)));
7387
7388                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
7389                 if (flags & CHECKPOINT_IS_SHUTDOWN)
7390                 {
7391                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7392                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7393                         ControlFile->time = (pg_time_t) time(NULL);
7394                         UpdateControlFile();
7395                         LWLockRelease(ControlFileLock);
7396                 }
7397                 LWLockRelease(CheckpointLock);
7398                 return false;
7399         }
7400
7401         /*
7402          * Update the shared RedoRecPtr so that the startup process can calculate
7403          * the number of segments replayed since last restartpoint, and request a
7404          * restartpoint if it exceeds checkpoint_segments.
7405          *
7406          * You need to hold WALInsertLock and info_lck to update it, although
7407          * during recovery acquiring WALInsertLock is just pro forma, because
7408          * there are no other processes updating Insert.RedoRecPtr.
7409          */
7410         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7411         SpinLockAcquire(&xlogctl->info_lck);
7412         xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
7413         SpinLockRelease(&xlogctl->info_lck);
7414         LWLockRelease(WALInsertLock);
7415
7416         /*
7417          * Prepare to accumulate statistics.
7418          *
7419          * Note: because it is possible for log_checkpoints to change while a
7420          * checkpoint proceeds, we always accumulate stats, even if
7421          * log_checkpoints is currently off.
7422          */
7423         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7424         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7425
7426         if (log_checkpoints)
7427                 LogCheckpointStart(flags, true);
7428
7429         CheckPointGuts(lastCheckPoint.redo, flags);
7430
7431         /*
7432          * Select point at which we can truncate the xlog, which we base on the
7433          * prior checkpoint's earliest info.
7434          */
7435         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
7436
7437         /*
7438          * Update pg_control, using current time.  Check that it still shows
7439          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
7440          * this is a quick hack to make sure nothing really bad happens if somehow
7441          * we get here after the end-of-recovery checkpoint.
7442          */
7443         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7444         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
7445                 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
7446         {
7447                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
7448                 ControlFile->checkPoint = lastCheckPointRecPtr;
7449                 ControlFile->checkPointCopy = lastCheckPoint;
7450                 ControlFile->time = (pg_time_t) time(NULL);
7451                 if (flags & CHECKPOINT_IS_SHUTDOWN)
7452                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7453                 UpdateControlFile();
7454         }
7455         LWLockRelease(ControlFileLock);
7456
7457         /*
7458          * Delete old log files (those no longer needed even for previous
7459          * checkpoint/restartpoint) to prevent the disk holding the xlog from
7460          * growing full.
7461          */
7462         if (_logSegNo)
7463         {
7464                 XLogRecPtr      receivePtr;
7465                 XLogRecPtr      replayPtr;
7466                 TimeLineID      replayTLI;
7467                 XLogRecPtr      endptr;
7468
7469                 /*
7470                  * Get the current end of xlog replayed or received, whichever is
7471                  * later.
7472                  */
7473                 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
7474                 replayPtr = GetXLogReplayRecPtr(&replayTLI);
7475                 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
7476
7477                 KeepLogSeg(endptr, &_logSegNo);
7478                 _logSegNo--;
7479
7480                 /*
7481                  * Try to recycle segments on a useful timeline. If we've been promoted
7482                  * since the beginning of this restartpoint, use the new timeline
7483                  * chosen at end of recovery (RecoveryInProgress() sets ThisTimeLineID
7484                  * in that case). If we're still in recovery, use the timeline we're
7485                  * currently replaying.
7486                  *
7487                  * There is no guarantee that the WAL segments will be useful on the
7488                  * current timeline; if recovery proceeds to a new timeline right
7489                  * after this, the pre-allocated WAL segments on this timeline will
7490                  * not be used, and will go wasted until recycled on the next
7491                  * restartpoint. We'll live with that.
7492                  */
7493                 if (RecoveryInProgress())
7494                         ThisTimeLineID = replayTLI;
7495
7496                 RemoveOldXlogFiles(_logSegNo, endptr);
7497
7498                 /*
7499                  * Make more log segments if needed.  (Do this after recycling old log
7500                  * segments, since that may supply some of the needed files.)
7501                  */
7502                 PreallocXlogFiles(endptr);
7503
7504                 /*
7505                  * ThisTimeLineID is normally not set when we're still in recovery.
7506                  * However, recycling/preallocating segments above needed
7507                  * ThisTimeLineID to determine which timeline to install the segments
7508                  * on. Reset it now, to restore the normal state of affairs for
7509                  * debugging purposes.
7510                  */
7511                 if (RecoveryInProgress())
7512                         ThisTimeLineID = 0;
7513         }
7514
7515         /*
7516          * Truncate pg_subtrans if possible.  We can throw away all data before
7517          * the oldest XMIN of any running transaction.  No future transaction will
7518          * attempt to reference any pg_subtrans entry older than that (see Asserts
7519          * in subtrans.c).      When hot standby is disabled, though, we mustn't do
7520          * this because StartupSUBTRANS hasn't been called yet.
7521          */
7522         if (EnableHotStandby)
7523                 TruncateSUBTRANS(GetOldestXmin(true, false));
7524
7525         /* Real work is done, but log and update before releasing lock. */
7526         LogCheckpointEnd(true);
7527
7528         xtime = GetLatestXTime();
7529         ereport((log_checkpoints ? LOG : DEBUG2),
7530                         (errmsg("recovery restart point at %X/%X",
7531                  (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
7532                    xtime ? errdetail("last completed transaction was at log time %s",
7533                                                          timestamptz_to_str(xtime)) : 0));
7534
7535         LWLockRelease(CheckpointLock);
7536
7537         /*
7538          * Finally, execute archive_cleanup_command, if any.
7539          */
7540         if (XLogCtl->archiveCleanupCommand[0])
7541                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
7542                                                            "archive_cleanup_command",
7543                                                            false);
7544
7545         return true;
7546 }
7547
7548 /*
7549  * Retreat *logSegNo to the last segment that we need to retain because of
7550  * wal_keep_segments. This is calculated by subtracting wal_keep_segments
7551  * from the given xlog location, recptr.
7552  */
7553 static void
7554 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
7555 {
7556         XLogSegNo       segno;
7557
7558         if (wal_keep_segments == 0)
7559                 return;
7560
7561         XLByteToSeg(recptr, segno);
7562
7563         /* avoid underflow, don't go below 1 */
7564         if (segno <= wal_keep_segments)
7565                 segno = 1;
7566         else
7567                 segno = segno - wal_keep_segments;
7568
7569         /* don't delete WAL segments newer than the calculated segment */
7570         if (segno < *logSegNo)
7571                 *logSegNo = segno;
7572 }
7573
7574 /*
7575  * Write a NEXTOID log record
7576  */
7577 void
7578 XLogPutNextOid(Oid nextOid)
7579 {
7580         XLogRecData rdata;
7581
7582         rdata.data = (char *) (&nextOid);
7583         rdata.len = sizeof(Oid);
7584         rdata.buffer = InvalidBuffer;
7585         rdata.next = NULL;
7586         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
7587
7588         /*
7589          * We need not flush the NEXTOID record immediately, because any of the
7590          * just-allocated OIDs could only reach disk as part of a tuple insert or
7591          * update that would have its own XLOG record that must follow the NEXTOID
7592          * record.      Therefore, the standard buffer LSN interlock applied to those
7593          * records will ensure no such OID reaches disk before the NEXTOID record
7594          * does.
7595          *
7596          * Note, however, that the above statement only covers state "within" the
7597          * database.  When we use a generated OID as a file or directory name, we
7598          * are in a sense violating the basic WAL rule, because that filesystem
7599          * change may reach disk before the NEXTOID WAL record does.  The impact
7600          * of this is that if a database crash occurs immediately afterward, we
7601          * might after restart re-generate the same OID and find that it conflicts
7602          * with the leftover file or directory.  But since for safety's sake we
7603          * always loop until finding a nonconflicting filename, this poses no real
7604          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
7605          */
7606 }
7607
7608 /*
7609  * Write an XLOG SWITCH record.
7610  *
7611  * Here we just blindly issue an XLogInsert request for the record.
7612  * All the magic happens inside XLogInsert.
7613  *
7614  * The return value is either the end+1 address of the switch record,
7615  * or the end+1 address of the prior segment if we did not need to
7616  * write a switch record because we are already at segment start.
7617  */
7618 XLogRecPtr
7619 RequestXLogSwitch(void)
7620 {
7621         XLogRecPtr      RecPtr;
7622         XLogRecData rdata;
7623
7624         /* XLOG SWITCH, alone among xlog record types, has no data */
7625         rdata.buffer = InvalidBuffer;
7626         rdata.data = NULL;
7627         rdata.len = 0;
7628         rdata.next = NULL;
7629
7630         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
7631
7632         return RecPtr;
7633 }
7634
7635 /*
7636  * Write a RESTORE POINT record
7637  */
7638 XLogRecPtr
7639 XLogRestorePoint(const char *rpName)
7640 {
7641         XLogRecPtr      RecPtr;
7642         XLogRecData rdata;
7643         xl_restore_point xlrec;
7644
7645         xlrec.rp_time = GetCurrentTimestamp();
7646         strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
7647
7648         rdata.buffer = InvalidBuffer;
7649         rdata.data = (char *) &xlrec;
7650         rdata.len = sizeof(xl_restore_point);
7651         rdata.next = NULL;
7652
7653         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
7654
7655         ereport(LOG,
7656                         (errmsg("restore point \"%s\" created at %X/%X",
7657                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
7658
7659         return RecPtr;
7660 }
7661
7662 /*
7663  * Write a backup block if needed when we are setting a hint. Note that
7664  * this may be called for a variety of page types, not just heaps.
7665  *
7666  * Callable while holding just share lock on the buffer content.
7667  *
7668  * We can't use the plain backup block mechanism since that relies on the
7669  * Buffer being exclusively locked. Since some modifications (setting LSN, hint
7670  * bits) are allowed in a sharelocked buffer that can lead to wal checksum
7671  * failures. So instead we copy the page and insert the copied data as normal
7672  * record data.
7673  *
7674  * We only need to do something if page has not yet been full page written in
7675  * this checkpoint round. The LSN of the inserted wal record is returned if we
7676  * had to write, InvalidXLogRecPtr otherwise.
7677  *
7678  * It is possible that multiple concurrent backends could attempt to write WAL
7679  * records. In that case, multiple copies of the same block would be recorded
7680  * in separate WAL records by different backends, though that is still OK from
7681  * a correctness perspective.
7682  *
7683  * Note that this only works for buffers that fit the standard page model,
7684  * i.e. those for which buffer_std == true
7685  */
7686 XLogRecPtr
7687 XLogSaveBufferForHint(Buffer buffer)
7688 {
7689         XLogRecPtr      recptr = InvalidXLogRecPtr;
7690         XLogRecPtr      lsn;
7691         XLogRecData rdata[2];
7692         BkpBlock        bkpb;
7693
7694         /*
7695          * Ensure no checkpoint can change our view of RedoRecPtr.
7696          */
7697         Assert(MyPgXact->delayChkpt);
7698
7699         /*
7700          * Update RedoRecPtr so XLogCheckBuffer can make the right decision
7701          */
7702         GetRedoRecPtr();
7703
7704         /*
7705          * Setup phony rdata element for use within XLogCheckBuffer only. We reuse
7706          * and reset rdata for any actual WAL record insert.
7707          */
7708         rdata[0].buffer = buffer;
7709         rdata[0].buffer_std = true;
7710
7711         /*
7712          * Check buffer while not holding an exclusive lock.
7713          */
7714         if (XLogCheckBuffer(rdata, false, &lsn, &bkpb))
7715         {
7716                 char            copied_buffer[BLCKSZ];
7717                 char       *origdata = (char *) BufferGetBlock(buffer);
7718
7719                 /*
7720                  * Copy buffer so we don't have to worry about concurrent hint bit or
7721                  * lsn updates. We assume pd_lower/upper cannot be changed without an
7722                  * exclusive lock, so the contents bkp are not racy.
7723                  */
7724                 memcpy(copied_buffer, origdata, bkpb.hole_offset);
7725                 memcpy(copied_buffer + bkpb.hole_offset,
7726                            origdata + bkpb.hole_offset + bkpb.hole_length,
7727                            BLCKSZ - bkpb.hole_offset - bkpb.hole_length);
7728
7729                 /*
7730                  * Header for backup block.
7731                  */
7732                 rdata[0].data = (char *) &bkpb;
7733                 rdata[0].len = sizeof(BkpBlock);
7734                 rdata[0].buffer = InvalidBuffer;
7735                 rdata[0].next = &(rdata[1]);
7736
7737                 /*
7738                  * Save copy of the buffer.
7739                  */
7740                 rdata[1].data = copied_buffer;
7741                 rdata[1].len = BLCKSZ - bkpb.hole_length;
7742                 rdata[1].buffer = InvalidBuffer;
7743                 rdata[1].next = NULL;
7744
7745                 recptr = XLogInsert(RM_XLOG_ID, XLOG_HINT, rdata);
7746         }
7747
7748         return recptr;
7749 }
7750
7751 /*
7752  * Check if any of the GUC parameters that are critical for hot standby
7753  * have changed, and update the value in pg_control file if necessary.
7754  */
7755 static void
7756 XLogReportParameters(void)
7757 {
7758         if (wal_level != ControlFile->wal_level ||
7759                 MaxConnections != ControlFile->MaxConnections ||
7760                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
7761                 max_locks_per_xact != ControlFile->max_locks_per_xact)
7762         {
7763                 /*
7764                  * The change in number of backend slots doesn't need to be WAL-logged
7765                  * if archiving is not enabled, as you can't start archive recovery
7766                  * with wal_level=minimal anyway. We don't really care about the
7767                  * values in pg_control either if wal_level=minimal, but seems better
7768                  * to keep them up-to-date to avoid confusion.
7769                  */
7770                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
7771                 {
7772                         XLogRecData rdata;
7773                         xl_parameter_change xlrec;
7774
7775                         xlrec.MaxConnections = MaxConnections;
7776                         xlrec.max_prepared_xacts = max_prepared_xacts;
7777                         xlrec.max_locks_per_xact = max_locks_per_xact;
7778                         xlrec.wal_level = wal_level;
7779
7780                         rdata.buffer = InvalidBuffer;
7781                         rdata.data = (char *) &xlrec;
7782                         rdata.len = sizeof(xlrec);
7783                         rdata.next = NULL;
7784
7785                         XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
7786                 }
7787
7788                 ControlFile->MaxConnections = MaxConnections;
7789                 ControlFile->max_prepared_xacts = max_prepared_xacts;
7790                 ControlFile->max_locks_per_xact = max_locks_per_xact;
7791                 ControlFile->wal_level = wal_level;
7792                 UpdateControlFile();
7793         }
7794 }
7795
7796 /*
7797  * Update full_page_writes in shared memory, and write an
7798  * XLOG_FPW_CHANGE record if necessary.
7799  *
7800  * Note: this function assumes there is no other process running
7801  * concurrently that could update it.
7802  */
7803 void
7804 UpdateFullPageWrites(void)
7805 {
7806         XLogCtlInsert *Insert = &XLogCtl->Insert;
7807
7808         /*
7809          * Do nothing if full_page_writes has not been changed.
7810          *
7811          * It's safe to check the shared full_page_writes without the lock,
7812          * because we assume that there is no concurrently running process which
7813          * can update it.
7814          */
7815         if (fullPageWrites == Insert->fullPageWrites)
7816                 return;
7817
7818         START_CRIT_SECTION();
7819
7820         /*
7821          * It's always safe to take full page images, even when not strictly
7822          * required, but not the other round. So if we're setting full_page_writes
7823          * to true, first set it true and then write the WAL record. If we're
7824          * setting it to false, first write the WAL record and then set the global
7825          * flag.
7826          */
7827         if (fullPageWrites)
7828         {
7829                 LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7830                 Insert->fullPageWrites = true;
7831                 LWLockRelease(WALInsertLock);
7832         }
7833
7834         /*
7835          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
7836          * full_page_writes during archive recovery, if required.
7837          */
7838         if (XLogStandbyInfoActive() && !RecoveryInProgress())
7839         {
7840                 XLogRecData rdata;
7841
7842                 rdata.data = (char *) (&fullPageWrites);
7843                 rdata.len = sizeof(bool);
7844                 rdata.buffer = InvalidBuffer;
7845                 rdata.next = NULL;
7846
7847                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
7848         }
7849
7850         if (!fullPageWrites)
7851         {
7852                 LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7853                 Insert->fullPageWrites = false;
7854                 LWLockRelease(WALInsertLock);
7855         }
7856         END_CRIT_SECTION();
7857 }
7858
7859 /*
7860  * Check that it's OK to switch to new timeline during recovery.
7861  *
7862  * 'lsn' is the address of the shutdown checkpoint record we're about to
7863  * replay. (Currently, timeline can only change at a shutdown checkpoint).
7864  */
7865 static void
7866 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
7867 {
7868         /* Check that the record agrees on what the current (old) timeline is */
7869         if (prevTLI != ThisTimeLineID)
7870                 ereport(PANIC,
7871                                 (errmsg("unexpected prev timeline ID %u (current timeline ID %u) in checkpoint record",
7872                                                 prevTLI, ThisTimeLineID)));
7873
7874         /*
7875          * The new timeline better be in the list of timelines we expect to see,
7876          * according to the timeline history. It should also not decrease.
7877          */
7878         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
7879                 ereport(PANIC,
7880                  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
7881                                  newTLI, ThisTimeLineID)));
7882
7883         /*
7884          * If we have not yet reached min recovery point, and we're about to
7885          * switch to a timeline greater than the timeline of the min recovery
7886          * point: trouble. After switching to the new timeline, we could not
7887          * possibly visit the min recovery point on the correct timeline anymore.
7888          * This can happen if there is a newer timeline in the archive that
7889          * branched before the timeline the min recovery point is on, and you
7890          * attempt to do PITR to the new timeline.
7891          */
7892         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
7893                 lsn < minRecoveryPoint &&
7894                 newTLI > minRecoveryPointTLI)
7895                 ereport(PANIC,
7896                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
7897                                                 newTLI,
7898                                                 (uint32) (minRecoveryPoint >> 32),
7899                                                 (uint32) minRecoveryPoint,
7900                                                 minRecoveryPointTLI)));
7901
7902         /* Looks good */
7903 }
7904
7905 /*
7906  * XLOG resource manager's routines
7907  *
7908  * Definitions of info values are in include/catalog/pg_control.h, though
7909  * not all record types are related to control file updates.
7910  */
7911 void
7912 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
7913 {
7914         uint8           info = record->xl_info & ~XLR_INFO_MASK;
7915
7916         /* Backup blocks are not used by XLOG rmgr */
7917         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
7918
7919         if (info == XLOG_NEXTOID)
7920         {
7921                 Oid                     nextOid;
7922
7923                 /*
7924                  * We used to try to take the maximum of ShmemVariableCache->nextOid
7925                  * and the recorded nextOid, but that fails if the OID counter wraps
7926                  * around.      Since no OID allocation should be happening during replay
7927                  * anyway, better to just believe the record exactly.  We still take
7928                  * OidGenLock while setting the variable, just in case.
7929                  */
7930                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
7931                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
7932                 ShmemVariableCache->nextOid = nextOid;
7933                 ShmemVariableCache->oidCount = 0;
7934                 LWLockRelease(OidGenLock);
7935         }
7936         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
7937         {
7938                 CheckPoint      checkPoint;
7939
7940                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
7941                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
7942                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
7943                 ShmemVariableCache->nextXid = checkPoint.nextXid;
7944                 LWLockRelease(XidGenLock);
7945                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
7946                 ShmemVariableCache->nextOid = checkPoint.nextOid;
7947                 ShmemVariableCache->oidCount = 0;
7948                 LWLockRelease(OidGenLock);
7949                 MultiXactSetNextMXact(checkPoint.nextMulti,
7950                                                           checkPoint.nextMultiOffset);
7951                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
7952                 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
7953
7954                 /*
7955                  * If we see a shutdown checkpoint while waiting for an end-of-backup
7956                  * record, the backup was canceled and the end-of-backup record will
7957                  * never arrive.
7958                  */
7959                 if (ArchiveRecoveryRequested &&
7960                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
7961                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
7962                         ereport(PANIC,
7963                         (errmsg("online backup was canceled, recovery cannot continue")));
7964
7965                 /*
7966                  * If we see a shutdown checkpoint, we know that nothing was running
7967                  * on the master at this point. So fake-up an empty running-xacts
7968                  * record and use that here and now. Recover additional standby state
7969                  * for prepared transactions.
7970                  */
7971                 if (standbyState >= STANDBY_INITIALIZED)
7972                 {
7973                         TransactionId *xids;
7974                         int                     nxids;
7975                         TransactionId oldestActiveXID;
7976                         TransactionId latestCompletedXid;
7977                         RunningTransactionsData running;
7978
7979                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7980
7981                         /*
7982                          * Construct a RunningTransactions snapshot representing a shut
7983                          * down server, with only prepared transactions still alive. We're
7984                          * never overflowed at this point because all subxids are listed
7985                          * with their parent prepared transactions.
7986                          */
7987                         running.xcnt = nxids;
7988                         running.subxcnt = 0;
7989                         running.subxid_overflow = false;
7990                         running.nextXid = checkPoint.nextXid;
7991                         running.oldestRunningXid = oldestActiveXID;
7992                         latestCompletedXid = checkPoint.nextXid;
7993                         TransactionIdRetreat(latestCompletedXid);
7994                         Assert(TransactionIdIsNormal(latestCompletedXid));
7995                         running.latestCompletedXid = latestCompletedXid;
7996                         running.xids = xids;
7997
7998                         ProcArrayApplyRecoveryInfo(&running);
7999
8000                         StandbyRecoverPreparedTransactions(true);
8001                 }
8002
8003                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8004                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
8005                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8006
8007                 /* Update shared-memory copy of checkpoint XID/epoch */
8008                 {
8009                         /* use volatile pointer to prevent code rearrangement */
8010                         volatile XLogCtlData *xlogctl = XLogCtl;
8011
8012                         SpinLockAcquire(&xlogctl->info_lck);
8013                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
8014                         xlogctl->ckptXid = checkPoint.nextXid;
8015                         SpinLockRelease(&xlogctl->info_lck);
8016                 }
8017
8018                 /*
8019                  * We should've already switched to the new TLI before replaying this
8020                  * record.
8021                  */
8022                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
8023                         ereport(PANIC,
8024                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
8025                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
8026
8027                 RecoveryRestartPoint(&checkPoint);
8028         }
8029         else if (info == XLOG_CHECKPOINT_ONLINE)
8030         {
8031                 CheckPoint      checkPoint;
8032
8033                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8034                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
8035                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8036                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
8037                                                                   checkPoint.nextXid))
8038                         ShmemVariableCache->nextXid = checkPoint.nextXid;
8039                 LWLockRelease(XidGenLock);
8040                 /* ... but still treat OID counter as exact */
8041                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8042                 ShmemVariableCache->nextOid = checkPoint.nextOid;
8043                 ShmemVariableCache->oidCount = 0;
8044                 LWLockRelease(OidGenLock);
8045                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
8046                                                                   checkPoint.nextMultiOffset);
8047                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
8048                                                                   checkPoint.oldestXid))
8049                         SetTransactionIdLimit(checkPoint.oldestXid,
8050                                                                   checkPoint.oldestXidDB);
8051                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
8052                                                            checkPoint.oldestMultiDB);
8053
8054                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8055                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
8056                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8057
8058                 /* Update shared-memory copy of checkpoint XID/epoch */
8059                 {
8060                         /* use volatile pointer to prevent code rearrangement */
8061                         volatile XLogCtlData *xlogctl = XLogCtl;
8062
8063                         SpinLockAcquire(&xlogctl->info_lck);
8064                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
8065                         xlogctl->ckptXid = checkPoint.nextXid;
8066                         SpinLockRelease(&xlogctl->info_lck);
8067                 }
8068
8069                 /* TLI should not change in an on-line checkpoint */
8070                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
8071                         ereport(PANIC,
8072                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
8073                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
8074
8075                 RecoveryRestartPoint(&checkPoint);
8076         }
8077         else if (info == XLOG_END_OF_RECOVERY)
8078         {
8079                 xl_end_of_recovery xlrec;
8080
8081                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
8082
8083                 /*
8084                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
8085                  * but this case is rarer and harder to test, so the benefit doesn't
8086                  * outweigh the potential extra cost of maintenance.
8087                  */
8088
8089                 /*
8090                  * We should've already switched to the new TLI before replaying this
8091                  * record.
8092                  */
8093                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
8094                         ereport(PANIC,
8095                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
8096                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
8097         }
8098         else if (info == XLOG_NOOP)
8099         {
8100                 /* nothing to do here */
8101         }
8102         else if (info == XLOG_SWITCH)
8103         {
8104                 /* nothing to do here */
8105         }
8106         else if (info == XLOG_RESTORE_POINT)
8107         {
8108                 /* nothing to do here */
8109         }
8110         else if (info == XLOG_HINT)
8111         {
8112                 char       *data;
8113                 BkpBlock        bkpb;
8114
8115                 /*
8116                  * Hint bit records contain a backup block stored "inline" in the
8117                  * normal data since the locking when writing hint records isn't
8118                  * sufficient to use the normal backup block mechanism, which assumes
8119                  * exclusive lock on the buffer supplied.
8120                  *
8121                  * Since the only change in these backup block are hint bits, there
8122                  * are no recovery conflicts generated.
8123                  *
8124                  * This also means there is no corresponding API call for this, so an
8125                  * smgr implementation has no need to implement anything. Which means
8126                  * nothing is needed in md.c etc
8127                  */
8128                 data = XLogRecGetData(record);
8129                 memcpy(&bkpb, data, sizeof(BkpBlock));
8130                 data += sizeof(BkpBlock);
8131
8132                 RestoreBackupBlockContents(lsn, bkpb, data, false, false);
8133         }
8134         else if (info == XLOG_BACKUP_END)
8135         {
8136                 XLogRecPtr      startpoint;
8137
8138                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
8139
8140                 if (ControlFile->backupStartPoint == startpoint)
8141                 {
8142                         /*
8143                          * We have reached the end of base backup, the point where
8144                          * pg_stop_backup() was done. The data on disk is now consistent.
8145                          * Reset backupStartPoint, and update minRecoveryPoint to make
8146                          * sure we don't allow starting up at an earlier point even if
8147                          * recovery is stopped and restarted soon after this.
8148                          */
8149                         elog(DEBUG1, "end of backup reached");
8150
8151                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8152
8153                         if (ControlFile->minRecoveryPoint < lsn)
8154                         {
8155                                 ControlFile->minRecoveryPoint = lsn;
8156                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8157                         }
8158                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
8159                         ControlFile->backupEndRequired = false;
8160                         UpdateControlFile();
8161
8162                         LWLockRelease(ControlFileLock);
8163                 }
8164         }
8165         else if (info == XLOG_PARAMETER_CHANGE)
8166         {
8167                 xl_parameter_change xlrec;
8168
8169                 /* Update our copy of the parameters in pg_control */
8170                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
8171
8172                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8173                 ControlFile->MaxConnections = xlrec.MaxConnections;
8174                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
8175                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
8176                 ControlFile->wal_level = xlrec.wal_level;
8177
8178                 /*
8179                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
8180                  * recover back up to this point before allowing hot standby again.
8181                  * This is particularly important if wal_level was set to 'archive'
8182                  * before, and is now 'hot_standby', to ensure you don't run queries
8183                  * against the WAL preceding the wal_level change. Same applies to
8184                  * decreasing max_* settings.
8185                  */
8186                 minRecoveryPoint = ControlFile->minRecoveryPoint;
8187                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
8188                 if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
8189                 {
8190                         ControlFile->minRecoveryPoint = lsn;
8191                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8192                 }
8193
8194                 UpdateControlFile();
8195                 LWLockRelease(ControlFileLock);
8196
8197                 /* Check to see if any changes to max_connections give problems */
8198                 CheckRequiredParameterValues();
8199         }
8200         else if (info == XLOG_FPW_CHANGE)
8201         {
8202                 /* use volatile pointer to prevent code rearrangement */
8203                 volatile XLogCtlData *xlogctl = XLogCtl;
8204                 bool            fpw;
8205
8206                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
8207
8208                 /*
8209                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
8210                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
8211                  * full_page_writes has been disabled during online backup.
8212                  */
8213                 if (!fpw)
8214                 {
8215                         SpinLockAcquire(&xlogctl->info_lck);
8216                         if (xlogctl->lastFpwDisableRecPtr < ReadRecPtr)
8217                                 xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
8218                         SpinLockRelease(&xlogctl->info_lck);
8219                 }
8220
8221                 /* Keep track of full_page_writes */
8222                 lastFullPageWrites = fpw;
8223         }
8224 }
8225
8226 #ifdef WAL_DEBUG
8227
8228 static void
8229 xlog_outrec(StringInfo buf, XLogRecord *record)
8230 {
8231         int                     i;
8232
8233         appendStringInfo(buf, "prev %X/%X; xid %u",
8234                                          (uint32) (record->xl_prev >> 32),
8235                                          (uint32) record->xl_prev,
8236                                          record->xl_xid);
8237
8238         appendStringInfo(buf, "; len %u",
8239                                          record->xl_len);
8240
8241         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
8242         {
8243                 if (record->xl_info & XLR_BKP_BLOCK(i))
8244                         appendStringInfo(buf, "; bkpb%d", i);
8245         }
8246
8247         appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
8248 }
8249 #endif   /* WAL_DEBUG */
8250
8251
8252 /*
8253  * Return the (possible) sync flag used for opening a file, depending on the
8254  * value of the GUC wal_sync_method.
8255  */
8256 static int
8257 get_sync_bit(int method)
8258 {
8259         int                     o_direct_flag = 0;
8260
8261         /* If fsync is disabled, never open in sync mode */
8262         if (!enableFsync)
8263                 return 0;
8264
8265         /*
8266          * Optimize writes by bypassing kernel cache with O_DIRECT when using
8267          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
8268          * disabled, otherwise the archive command or walsender process will read
8269          * the WAL soon after writing it, which is guaranteed to cause a physical
8270          * read if we bypassed the kernel cache. We also skip the
8271          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
8272          * reason.
8273          *
8274          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
8275          * written by walreceiver is normally read by the startup process soon
8276          * after its written. Also, walreceiver performs unaligned writes, which
8277          * don't work with O_DIRECT, so it is required for correctness too.
8278          */
8279         if (!XLogIsNeeded() && !AmWalReceiverProcess())
8280                 o_direct_flag = PG_O_DIRECT;
8281
8282         switch (method)
8283         {
8284                         /*
8285                          * enum values for all sync options are defined even if they are
8286                          * not supported on the current platform.  But if not, they are
8287                          * not included in the enum option array, and therefore will never
8288                          * be seen here.
8289                          */
8290                 case SYNC_METHOD_FSYNC:
8291                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
8292                 case SYNC_METHOD_FDATASYNC:
8293                         return 0;
8294 #ifdef OPEN_SYNC_FLAG
8295                 case SYNC_METHOD_OPEN:
8296                         return OPEN_SYNC_FLAG | o_direct_flag;
8297 #endif
8298 #ifdef OPEN_DATASYNC_FLAG
8299                 case SYNC_METHOD_OPEN_DSYNC:
8300                         return OPEN_DATASYNC_FLAG | o_direct_flag;
8301 #endif
8302                 default:
8303                         /* can't happen (unless we are out of sync with option array) */
8304                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
8305                         return 0;                       /* silence warning */
8306         }
8307 }
8308
8309 /*
8310  * GUC support
8311  */
8312 void
8313 assign_xlog_sync_method(int new_sync_method, void *extra)
8314 {
8315         if (sync_method != new_sync_method)
8316         {
8317                 /*
8318                  * To ensure that no blocks escape unsynced, force an fsync on the
8319                  * currently open log segment (if any).  Also, if the open flag is
8320                  * changing, close the log file so it will be reopened (with new flag
8321                  * bit) at next use.
8322                  */
8323                 if (openLogFile >= 0)
8324                 {
8325                         if (pg_fsync(openLogFile) != 0)
8326                                 ereport(PANIC,
8327                                                 (errcode_for_file_access(),
8328                                                  errmsg("could not fsync log segment %s: %m",
8329                                                           XLogFileNameP(ThisTimeLineID, openLogSegNo))));
8330                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
8331                                 XLogFileClose();
8332                 }
8333         }
8334 }
8335
8336
8337 /*
8338  * Issue appropriate kind of fsync (if any) for an XLOG output file.
8339  *
8340  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
8341  * 'log' and 'seg' are for error reporting purposes.
8342  */
8343 void
8344 issue_xlog_fsync(int fd, XLogSegNo segno)
8345 {
8346         switch (sync_method)
8347         {
8348                 case SYNC_METHOD_FSYNC:
8349                         if (pg_fsync_no_writethrough(fd) != 0)
8350                                 ereport(PANIC,
8351                                                 (errcode_for_file_access(),
8352                                                  errmsg("could not fsync log file %s: %m",
8353                                                                 XLogFileNameP(ThisTimeLineID, segno))));
8354                         break;
8355 #ifdef HAVE_FSYNC_WRITETHROUGH
8356                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
8357                         if (pg_fsync_writethrough(fd) != 0)
8358                                 ereport(PANIC,
8359                                                 (errcode_for_file_access(),
8360                                           errmsg("could not fsync write-through log file %s: %m",
8361                                                          XLogFileNameP(ThisTimeLineID, segno))));
8362                         break;
8363 #endif
8364 #ifdef HAVE_FDATASYNC
8365                 case SYNC_METHOD_FDATASYNC:
8366                         if (pg_fdatasync(fd) != 0)
8367                                 ereport(PANIC,
8368                                                 (errcode_for_file_access(),
8369                                                  errmsg("could not fdatasync log file %s: %m",
8370                                                                 XLogFileNameP(ThisTimeLineID, segno))));
8371                         break;
8372 #endif
8373                 case SYNC_METHOD_OPEN:
8374                 case SYNC_METHOD_OPEN_DSYNC:
8375                         /* write synced it already */
8376                         break;
8377                 default:
8378                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
8379                         break;
8380         }
8381 }
8382
8383 /*
8384  * Return the filename of given log segment, as a palloc'd string.
8385  */
8386 char *
8387 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
8388 {
8389         char       *result = palloc(MAXFNAMELEN);
8390
8391         XLogFileName(result, tli, segno);
8392         return result;
8393 }
8394
8395 /*
8396  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
8397  * function. It creates the necessary starting checkpoint and constructs the
8398  * backup label file.
8399  *
8400  * There are two kind of backups: exclusive and non-exclusive. An exclusive
8401  * backup is started with pg_start_backup(), and there can be only one active
8402  * at a time. The backup label file of an exclusive backup is written to
8403  * $PGDATA/backup_label, and it is removed by pg_stop_backup().
8404  *
8405  * A non-exclusive backup is used for the streaming base backups (see
8406  * src/backend/replication/basebackup.c). The difference to exclusive backups
8407  * is that the backup label file is not written to disk. Instead, its would-be
8408  * contents are returned in *labelfile, and the caller is responsible for
8409  * including it in the backup archive as 'backup_label'. There can be many
8410  * non-exclusive backups active at the same time, and they don't conflict
8411  * with an exclusive backup either.
8412  *
8413  * Returns the minimum WAL position that must be present to restore from this
8414  * backup, and the corresponding timeline ID in *starttli_p.
8415  *
8416  * Every successfully started non-exclusive backup must be stopped by calling
8417  * do_pg_stop_backup() or do_pg_abort_backup().
8418  */
8419 XLogRecPtr
8420 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
8421                                    char **labelfile)
8422 {
8423         bool            exclusive = (labelfile == NULL);
8424         bool            backup_started_in_recovery = false;
8425         XLogRecPtr      checkpointloc;
8426         XLogRecPtr      startpoint;
8427         TimeLineID      starttli;
8428         pg_time_t       stamp_time;
8429         char            strfbuf[128];
8430         char            xlogfilename[MAXFNAMELEN];
8431         XLogSegNo       _logSegNo;
8432         struct stat stat_buf;
8433         FILE       *fp;
8434         StringInfoData labelfbuf;
8435
8436         backup_started_in_recovery = RecoveryInProgress();
8437
8438         if (!superuser() && !has_rolreplication(GetUserId()))
8439                 ereport(ERROR,
8440                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
8441                    errmsg("must be superuser or replication role to run a backup")));
8442
8443         /*
8444          * Currently only non-exclusive backup can be taken during recovery.
8445          */
8446         if (backup_started_in_recovery && exclusive)
8447                 ereport(ERROR,
8448                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8449                                  errmsg("recovery is in progress"),
8450                                  errhint("WAL control functions cannot be executed during recovery.")));
8451
8452         /*
8453          * During recovery, we don't need to check WAL level. Because, if WAL
8454          * level is not sufficient, it's impossible to get here during recovery.
8455          */
8456         if (!backup_started_in_recovery && !XLogIsNeeded())
8457                 ereport(ERROR,
8458                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8459                           errmsg("WAL level not sufficient for making an online backup"),
8460                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
8461
8462         if (strlen(backupidstr) > MAXPGPATH)
8463                 ereport(ERROR,
8464                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
8465                                  errmsg("backup label too long (max %d bytes)",
8466                                                 MAXPGPATH)));
8467
8468         /*
8469          * Mark backup active in shared memory.  We must do full-page WAL writes
8470          * during an on-line backup even if not doing so at other times, because
8471          * it's quite possible for the backup dump to obtain a "torn" (partially
8472          * written) copy of a database page if it reads the page concurrently with
8473          * our write to the same page.  This can be fixed as long as the first
8474          * write to the page in the WAL sequence is a full-page write. Hence, we
8475          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
8476          * are no dirty pages in shared memory that might get dumped while the
8477          * backup is in progress without having a corresponding WAL record.  (Once
8478          * the backup is complete, we need not force full-page writes anymore,
8479          * since we expect that any pages not modified during the backup interval
8480          * must have been correctly captured by the backup.)
8481          *
8482          * Note that forcePageWrites has no effect during an online backup from
8483          * the standby.
8484          *
8485          * We must hold WALInsertLock to change the value of forcePageWrites, to
8486          * ensure adequate interlocking against XLogInsert().
8487          */
8488         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8489         if (exclusive)
8490         {
8491                 if (XLogCtl->Insert.exclusiveBackup)
8492                 {
8493                         LWLockRelease(WALInsertLock);
8494                         ereport(ERROR,
8495                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8496                                          errmsg("a backup is already in progress"),
8497                                          errhint("Run pg_stop_backup() and try again.")));
8498                 }
8499                 XLogCtl->Insert.exclusiveBackup = true;
8500         }
8501         else
8502                 XLogCtl->Insert.nonExclusiveBackups++;
8503         XLogCtl->Insert.forcePageWrites = true;
8504         LWLockRelease(WALInsertLock);
8505
8506         /* Ensure we release forcePageWrites if fail below */
8507         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
8508         {
8509                 bool            gotUniqueStartpoint = false;
8510
8511                 /*
8512                  * Force an XLOG file switch before the checkpoint, to ensure that the
8513                  * WAL segment the checkpoint is written to doesn't contain pages with
8514                  * old timeline IDs.  That would otherwise happen if you called
8515                  * pg_start_backup() right after restoring from a PITR archive: the
8516                  * first WAL segment containing the startup checkpoint has pages in
8517                  * the beginning with the old timeline ID.      That can cause trouble at
8518                  * recovery: we won't have a history file covering the old timeline if
8519                  * pg_xlog directory was not included in the base backup and the WAL
8520                  * archive was cleared too before starting the backup.
8521                  *
8522                  * This also ensures that we have emitted a WAL page header that has
8523                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
8524                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
8525                  * compress out removable backup blocks, it won't remove any that
8526                  * occur after this point.
8527                  *
8528                  * During recovery, we skip forcing XLOG file switch, which means that
8529                  * the backup taken during recovery is not available for the special
8530                  * recovery case described above.
8531                  */
8532                 if (!backup_started_in_recovery)
8533                         RequestXLogSwitch();
8534
8535                 do
8536                 {
8537                         bool            checkpointfpw;
8538
8539                         /*
8540                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
8541                          * page problems, this guarantees that two successive backup runs
8542                          * will have different checkpoint positions and hence different
8543                          * history file names, even if nothing happened in between.
8544                          *
8545                          * During recovery, establish a restartpoint if possible. We use
8546                          * the last restartpoint as the backup starting checkpoint. This
8547                          * means that two successive backup runs can have same checkpoint
8548                          * positions.
8549                          *
8550                          * Since the fact that we are executing do_pg_start_backup()
8551                          * during recovery means that checkpointer is running, we can use
8552                          * RequestCheckpoint() to establish a restartpoint.
8553                          *
8554                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
8555                          * passing fast = true).  Otherwise this can take awhile.
8556                          */
8557                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
8558                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
8559
8560                         /*
8561                          * Now we need to fetch the checkpoint record location, and also
8562                          * its REDO pointer.  The oldest point in WAL that would be needed
8563                          * to restore starting from the checkpoint is precisely the REDO
8564                          * pointer.
8565                          */
8566                         LWLockAcquire(ControlFileLock, LW_SHARED);
8567                         checkpointloc = ControlFile->checkPoint;
8568                         startpoint = ControlFile->checkPointCopy.redo;
8569                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
8570                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
8571                         LWLockRelease(ControlFileLock);
8572
8573                         if (backup_started_in_recovery)
8574                         {
8575                                 /* use volatile pointer to prevent code rearrangement */
8576                                 volatile XLogCtlData *xlogctl = XLogCtl;
8577                                 XLogRecPtr      recptr;
8578
8579                                 /*
8580                                  * Check to see if all WAL replayed during online backup
8581                                  * (i.e., since last restartpoint used as backup starting
8582                                  * checkpoint) contain full-page writes.
8583                                  */
8584                                 SpinLockAcquire(&xlogctl->info_lck);
8585                                 recptr = xlogctl->lastFpwDisableRecPtr;
8586                                 SpinLockRelease(&xlogctl->info_lck);
8587
8588                                 if (!checkpointfpw || startpoint <= recptr)
8589                                         ereport(ERROR,
8590                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8591                                                    errmsg("WAL generated with full_page_writes=off was replayed "
8592                                                                   "since last restartpoint"),
8593                                                    errhint("This means that the backup being taken on the standby "
8594                                                                    "is corrupt and should not be used. "
8595                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
8596                                                                    "and then try an online backup again.")));
8597
8598                                 /*
8599                                  * During recovery, since we don't use the end-of-backup WAL
8600                                  * record and don't write the backup history file, the
8601                                  * starting WAL location doesn't need to be unique. This means
8602                                  * that two base backups started at the same time might use
8603                                  * the same checkpoint as starting locations.
8604                                  */
8605                                 gotUniqueStartpoint = true;
8606                         }
8607
8608                         /*
8609                          * If two base backups are started at the same time (in WAL sender
8610                          * processes), we need to make sure that they use different
8611                          * checkpoints as starting locations, because we use the starting
8612                          * WAL location as a unique identifier for the base backup in the
8613                          * end-of-backup WAL record and when we write the backup history
8614                          * file. Perhaps it would be better generate a separate unique ID
8615                          * for each backup instead of forcing another checkpoint, but
8616                          * taking a checkpoint right after another is not that expensive
8617                          * either because only few buffers have been dirtied yet.
8618                          */
8619                         LWLockAcquire(WALInsertLock, LW_SHARED);
8620                         if (XLogCtl->Insert.lastBackupStart < startpoint)
8621                         {
8622                                 XLogCtl->Insert.lastBackupStart = startpoint;
8623                                 gotUniqueStartpoint = true;
8624                         }
8625                         LWLockRelease(WALInsertLock);
8626                 } while (!gotUniqueStartpoint);
8627
8628                 XLByteToSeg(startpoint, _logSegNo);
8629                 XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);
8630
8631                 /*
8632                  * Construct backup label file
8633                  */
8634                 initStringInfo(&labelfbuf);
8635
8636                 /* Use the log timezone here, not the session timezone */
8637                 stamp_time = (pg_time_t) time(NULL);
8638                 pg_strftime(strfbuf, sizeof(strfbuf),
8639                                         "%Y-%m-%d %H:%M:%S %Z",
8640                                         pg_localtime(&stamp_time, log_timezone));
8641                 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
8642                          (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
8643                 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
8644                                          (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
8645                 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
8646                                                  exclusive ? "pg_start_backup" : "streamed");
8647                 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
8648                                                  backup_started_in_recovery ? "standby" : "master");
8649                 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
8650                 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
8651
8652                 /*
8653                  * Okay, write the file, or return its contents to caller.
8654                  */
8655                 if (exclusive)
8656                 {
8657                         /*
8658                          * Check for existing backup label --- implies a backup is already
8659                          * running.  (XXX given that we checked exclusiveBackup above,
8660                          * maybe it would be OK to just unlink any such label file?)
8661                          */
8662                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
8663                         {
8664                                 if (errno != ENOENT)
8665                                         ereport(ERROR,
8666                                                         (errcode_for_file_access(),
8667                                                          errmsg("could not stat file \"%s\": %m",
8668                                                                         BACKUP_LABEL_FILE)));
8669                         }
8670                         else
8671                                 ereport(ERROR,
8672                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8673                                                  errmsg("a backup is already in progress"),
8674                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
8675                                                                  BACKUP_LABEL_FILE)));
8676
8677                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
8678
8679                         if (!fp)
8680                                 ereport(ERROR,
8681                                                 (errcode_for_file_access(),
8682                                                  errmsg("could not create file \"%s\": %m",
8683                                                                 BACKUP_LABEL_FILE)));
8684                         if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
8685                                 fflush(fp) != 0 ||
8686                                 pg_fsync(fileno(fp)) != 0 ||
8687                                 ferror(fp) ||
8688                                 FreeFile(fp))
8689                                 ereport(ERROR,
8690                                                 (errcode_for_file_access(),
8691                                                  errmsg("could not write file \"%s\": %m",
8692                                                                 BACKUP_LABEL_FILE)));
8693                         pfree(labelfbuf.data);
8694                 }
8695                 else
8696                         *labelfile = labelfbuf.data;
8697         }
8698         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
8699
8700         /*
8701          * We're done.  As a convenience, return the starting WAL location.
8702          */
8703         if (starttli_p)
8704                 *starttli_p = starttli;
8705         return startpoint;
8706 }
8707
8708 /* Error cleanup callback for pg_start_backup */
8709 static void
8710 pg_start_backup_callback(int code, Datum arg)
8711 {
8712         bool            exclusive = DatumGetBool(arg);
8713
8714         /* Update backup counters and forcePageWrites on failure */
8715         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8716         if (exclusive)
8717         {
8718                 Assert(XLogCtl->Insert.exclusiveBackup);
8719                 XLogCtl->Insert.exclusiveBackup = false;
8720         }
8721         else
8722         {
8723                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
8724                 XLogCtl->Insert.nonExclusiveBackups--;
8725         }
8726
8727         if (!XLogCtl->Insert.exclusiveBackup &&
8728                 XLogCtl->Insert.nonExclusiveBackups == 0)
8729         {
8730                 XLogCtl->Insert.forcePageWrites = false;
8731         }
8732         LWLockRelease(WALInsertLock);
8733 }
8734
8735 /*
8736  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
8737  * function.
8738
8739  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
8740  * the non-exclusive backup specified by 'labelfile'.
8741  *
8742  * Returns the last WAL position that must be present to restore from this
8743  * backup, and the corresponding timeline ID in *stoptli_p.
8744  */
8745 XLogRecPtr
8746 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
8747 {
8748         bool            exclusive = (labelfile == NULL);
8749         bool            backup_started_in_recovery = false;
8750         XLogRecPtr      startpoint;
8751         XLogRecPtr      stoppoint;
8752         TimeLineID      stoptli;
8753         XLogRecData rdata;
8754         pg_time_t       stamp_time;
8755         char            strfbuf[128];
8756         char            histfilepath[MAXPGPATH];
8757         char            startxlogfilename[MAXFNAMELEN];
8758         char            stopxlogfilename[MAXFNAMELEN];
8759         char            lastxlogfilename[MAXFNAMELEN];
8760         char            histfilename[MAXFNAMELEN];
8761         char            backupfrom[20];
8762         XLogSegNo       _logSegNo;
8763         FILE       *lfp;
8764         FILE       *fp;
8765         char            ch;
8766         int                     seconds_before_warning;
8767         int                     waits = 0;
8768         bool            reported_waiting = false;
8769         char       *remaining;
8770         char       *ptr;
8771         uint32          hi,
8772                                 lo;
8773
8774         backup_started_in_recovery = RecoveryInProgress();
8775
8776         if (!superuser() && !has_rolreplication(GetUserId()))
8777                 ereport(ERROR,
8778                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
8779                  (errmsg("must be superuser or replication role to run a backup"))));
8780
8781         /*
8782          * Currently only non-exclusive backup can be taken during recovery.
8783          */
8784         if (backup_started_in_recovery && exclusive)
8785                 ereport(ERROR,
8786                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8787                                  errmsg("recovery is in progress"),
8788                                  errhint("WAL control functions cannot be executed during recovery.")));
8789
8790         /*
8791          * During recovery, we don't need to check WAL level. Because, if WAL
8792          * level is not sufficient, it's impossible to get here during recovery.
8793          */
8794         if (!backup_started_in_recovery && !XLogIsNeeded())
8795                 ereport(ERROR,
8796                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8797                           errmsg("WAL level not sufficient for making an online backup"),
8798                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
8799
8800         /*
8801          * OK to update backup counters and forcePageWrites
8802          */
8803         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8804         if (exclusive)
8805                 XLogCtl->Insert.exclusiveBackup = false;
8806         else
8807         {
8808                 /*
8809                  * The user-visible pg_start/stop_backup() functions that operate on
8810                  * exclusive backups can be called at any time, but for non-exclusive
8811                  * backups, it is expected that each do_pg_start_backup() call is
8812                  * matched by exactly one do_pg_stop_backup() call.
8813                  */
8814                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
8815                 XLogCtl->Insert.nonExclusiveBackups--;
8816         }
8817
8818         if (!XLogCtl->Insert.exclusiveBackup &&
8819                 XLogCtl->Insert.nonExclusiveBackups == 0)
8820         {
8821                 XLogCtl->Insert.forcePageWrites = false;
8822         }
8823         LWLockRelease(WALInsertLock);
8824
8825         if (exclusive)
8826         {
8827                 /*
8828                  * Read the existing label file into memory.
8829                  */
8830                 struct stat statbuf;
8831                 int                     r;
8832
8833                 if (stat(BACKUP_LABEL_FILE, &statbuf))
8834                 {
8835                         if (errno != ENOENT)
8836                                 ereport(ERROR,
8837                                                 (errcode_for_file_access(),
8838                                                  errmsg("could not stat file \"%s\": %m",
8839                                                                 BACKUP_LABEL_FILE)));
8840                         ereport(ERROR,
8841                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8842                                          errmsg("a backup is not in progress")));
8843                 }
8844
8845                 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
8846                 if (!lfp)
8847                 {
8848                         ereport(ERROR,
8849                                         (errcode_for_file_access(),
8850                                          errmsg("could not read file \"%s\": %m",
8851                                                         BACKUP_LABEL_FILE)));
8852                 }
8853                 labelfile = palloc(statbuf.st_size + 1);
8854                 r = fread(labelfile, statbuf.st_size, 1, lfp);
8855                 labelfile[statbuf.st_size] = '\0';
8856
8857                 /*
8858                  * Close and remove the backup label file
8859                  */
8860                 if (r != 1 || ferror(lfp) || FreeFile(lfp))
8861                         ereport(ERROR,
8862                                         (errcode_for_file_access(),
8863                                          errmsg("could not read file \"%s\": %m",
8864                                                         BACKUP_LABEL_FILE)));
8865                 if (unlink(BACKUP_LABEL_FILE) != 0)
8866                         ereport(ERROR,
8867                                         (errcode_for_file_access(),
8868                                          errmsg("could not remove file \"%s\": %m",
8869                                                         BACKUP_LABEL_FILE)));
8870         }
8871
8872         /*
8873          * Read and parse the START WAL LOCATION line (this code is pretty crude,
8874          * but we are not expecting any variability in the file format).
8875          */
8876         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
8877                            &hi, &lo, startxlogfilename,
8878                            &ch) != 4 || ch != '\n')
8879                 ereport(ERROR,
8880                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8881                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
8882         startpoint = ((uint64) hi) << 32 | lo;
8883         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
8884
8885         /*
8886          * Parse the BACKUP FROM line. If we are taking an online backup from the
8887          * standby, we confirm that the standby has not been promoted during the
8888          * backup.
8889          */
8890         ptr = strstr(remaining, "BACKUP FROM:");
8891         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
8892                 ereport(ERROR,
8893                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8894                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
8895         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
8896                 ereport(ERROR,
8897                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8898                                  errmsg("the standby was promoted during online backup"),
8899                                  errhint("This means that the backup being taken is corrupt "
8900                                                  "and should not be used. "
8901                                                  "Try taking another online backup.")));
8902
8903         /*
8904          * During recovery, we don't write an end-of-backup record. We assume that
8905          * pg_control was backed up last and its minimum recovery point can be
8906          * available as the backup end location. Since we don't have an
8907          * end-of-backup record, we use the pg_control value to check whether
8908          * we've reached the end of backup when starting recovery from this
8909          * backup. We have no way of checking if pg_control wasn't backed up last
8910          * however.
8911          *
8912          * We don't force a switch to new WAL file and wait for all the required
8913          * files to be archived. This is okay if we use the backup to start the
8914          * standby. But, if it's for an archive recovery, to ensure all the
8915          * required files are available, a user should wait for them to be
8916          * archived, or include them into the backup.
8917          *
8918          * We return the current minimum recovery point as the backup end
8919          * location. Note that it can be greater than the exact backup end
8920          * location if the minimum recovery point is updated after the backup of
8921          * pg_control. This is harmless for current uses.
8922          *
8923          * XXX currently a backup history file is for informational and debug
8924          * purposes only. It's not essential for an online backup. Furthermore,
8925          * even if it's created, it will not be archived during recovery because
8926          * an archiver is not invoked. So it doesn't seem worthwhile to write a
8927          * backup history file during recovery.
8928          */
8929         if (backup_started_in_recovery)
8930         {
8931                 /* use volatile pointer to prevent code rearrangement */
8932                 volatile XLogCtlData *xlogctl = XLogCtl;
8933                 XLogRecPtr      recptr;
8934
8935                 /*
8936                  * Check to see if all WAL replayed during online backup contain
8937                  * full-page writes.
8938                  */
8939                 SpinLockAcquire(&xlogctl->info_lck);
8940                 recptr = xlogctl->lastFpwDisableRecPtr;
8941                 SpinLockRelease(&xlogctl->info_lck);
8942
8943                 if (startpoint <= recptr)
8944                         ereport(ERROR,
8945                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8946                            errmsg("WAL generated with full_page_writes=off was replayed "
8947                                           "during online backup"),
8948                          errhint("This means that the backup being taken on the standby "
8949                                          "is corrupt and should not be used. "
8950                                  "Enable full_page_writes and run CHECKPOINT on the master, "
8951                                          "and then try an online backup again.")));
8952
8953
8954                 LWLockAcquire(ControlFileLock, LW_SHARED);
8955                 stoppoint = ControlFile->minRecoveryPoint;
8956                 stoptli = ControlFile->minRecoveryPointTLI;
8957                 LWLockRelease(ControlFileLock);
8958
8959                 if (stoptli_p)
8960                         *stoptli_p = stoptli;
8961                 return stoppoint;
8962         }
8963
8964         /*
8965          * Write the backup-end xlog record
8966          */
8967         rdata.data = (char *) (&startpoint);
8968         rdata.len = sizeof(startpoint);
8969         rdata.buffer = InvalidBuffer;
8970         rdata.next = NULL;
8971         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
8972         stoptli = ThisTimeLineID;
8973
8974         /*
8975          * Force a switch to a new xlog segment file, so that the backup is valid
8976          * as soon as archiver moves out the current segment file.
8977          */
8978         RequestXLogSwitch();
8979
8980         XLByteToPrevSeg(stoppoint, _logSegNo);
8981         XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
8982
8983         /* Use the log timezone here, not the session timezone */
8984         stamp_time = (pg_time_t) time(NULL);
8985         pg_strftime(strfbuf, sizeof(strfbuf),
8986                                 "%Y-%m-%d %H:%M:%S %Z",
8987                                 pg_localtime(&stamp_time, log_timezone));
8988
8989         /*
8990          * Write the backup history file
8991          */
8992         XLByteToSeg(startpoint, _logSegNo);
8993         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
8994                                                   (uint32) (startpoint % XLogSegSize));
8995         fp = AllocateFile(histfilepath, "w");
8996         if (!fp)
8997                 ereport(ERROR,
8998                                 (errcode_for_file_access(),
8999                                  errmsg("could not create file \"%s\": %m",
9000                                                 histfilepath)));
9001         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
9002                 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
9003         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
9004                         (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
9005         /* transfer remaining lines from label to history file */
9006         fprintf(fp, "%s", remaining);
9007         fprintf(fp, "STOP TIME: %s\n", strfbuf);
9008         if (fflush(fp) || ferror(fp) || FreeFile(fp))
9009                 ereport(ERROR,
9010                                 (errcode_for_file_access(),
9011                                  errmsg("could not write file \"%s\": %m",
9012                                                 histfilepath)));
9013
9014         /*
9015          * Clean out any no-longer-needed history files.  As a side effect, this
9016          * will post a .ready file for the newly created history file, notifying
9017          * the archiver that history file may be archived immediately.
9018          */
9019         CleanupBackupHistory();
9020
9021         /*
9022          * If archiving is enabled, wait for all the required WAL files to be
9023          * archived before returning. If archiving isn't enabled, the required WAL
9024          * needs to be transported via streaming replication (hopefully with
9025          * wal_keep_segments set high enough), or some more exotic mechanism like
9026          * polling and copying files from pg_xlog with script. We have no
9027          * knowledge of those mechanisms, so it's up to the user to ensure that he
9028          * gets all the required WAL.
9029          *
9030          * We wait until both the last WAL file filled during backup and the
9031          * history file have been archived, and assume that the alphabetic sorting
9032          * property of the WAL files ensures any earlier WAL files are safely
9033          * archived as well.
9034          *
9035          * We wait forever, since archive_command is supposed to work and we
9036          * assume the admin wanted his backup to work completely. If you don't
9037          * wish to wait, you can set statement_timeout.  Also, some notices are
9038          * issued to clue in anyone who might be doing this interactively.
9039          */
9040         if (waitforarchive && XLogArchivingActive())
9041         {
9042                 XLByteToPrevSeg(stoppoint, _logSegNo);
9043                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
9044
9045                 XLByteToSeg(startpoint, _logSegNo);
9046                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
9047                                                           (uint32) (startpoint % XLogSegSize));
9048
9049                 seconds_before_warning = 60;
9050                 waits = 0;
9051
9052                 while (XLogArchiveIsBusy(lastxlogfilename) ||
9053                            XLogArchiveIsBusy(histfilename))
9054                 {
9055                         CHECK_FOR_INTERRUPTS();
9056
9057                         if (!reported_waiting && waits > 5)
9058                         {
9059                                 ereport(NOTICE,
9060                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
9061                                 reported_waiting = true;
9062                         }
9063
9064                         pg_usleep(1000000L);
9065
9066                         if (++waits >= seconds_before_warning)
9067                         {
9068                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
9069                                 ereport(WARNING,
9070                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
9071                                                                 waits),
9072                                                  errhint("Check that your archive_command is executing properly.  "
9073                                                                  "pg_stop_backup can be canceled safely, "
9074                                                                  "but the database backup will not be usable without all the WAL segments.")));
9075                         }
9076                 }
9077
9078                 ereport(NOTICE,
9079                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
9080         }
9081         else if (waitforarchive)
9082                 ereport(NOTICE,
9083                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
9084
9085         /*
9086          * We're done.  As a convenience, return the ending WAL location.
9087          */
9088         if (stoptli_p)
9089                 *stoptli_p = stoptli;
9090         return stoppoint;
9091 }
9092
9093
9094 /*
9095  * do_pg_abort_backup: abort a running backup
9096  *
9097  * This does just the most basic steps of do_pg_stop_backup(), by taking the
9098  * system out of backup mode, thus making it a lot more safe to call from
9099  * an error handler.
9100  *
9101  * NB: This is only for aborting a non-exclusive backup that doesn't write
9102  * backup_label. A backup started with pg_stop_backup() needs to be finished
9103  * with pg_stop_backup().
9104  */
9105 void
9106 do_pg_abort_backup(void)
9107 {
9108         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9109         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9110         XLogCtl->Insert.nonExclusiveBackups--;
9111
9112         if (!XLogCtl->Insert.exclusiveBackup &&
9113                 XLogCtl->Insert.nonExclusiveBackups == 0)
9114         {
9115                 XLogCtl->Insert.forcePageWrites = false;
9116         }
9117         LWLockRelease(WALInsertLock);
9118 }
9119
9120 /*
9121  * Get latest redo apply position.
9122  *
9123  * Exported to allow WALReceiver to read the pointer directly.
9124  */
9125 XLogRecPtr
9126 GetXLogReplayRecPtr(TimeLineID *replayTLI)
9127 {
9128         /* use volatile pointer to prevent code rearrangement */
9129         volatile XLogCtlData *xlogctl = XLogCtl;
9130         XLogRecPtr      recptr;
9131         TimeLineID      tli;
9132
9133         SpinLockAcquire(&xlogctl->info_lck);
9134         recptr = xlogctl->lastReplayedEndRecPtr;
9135         tli = xlogctl->lastReplayedTLI;
9136         SpinLockRelease(&xlogctl->info_lck);
9137
9138         if (replayTLI)
9139                 *replayTLI = tli;
9140         return recptr;
9141 }
9142
9143 /*
9144  * Get latest WAL insert pointer
9145  */
9146 XLogRecPtr
9147 GetXLogInsertRecPtr(void)
9148 {
9149         XLogCtlInsert *Insert = &XLogCtl->Insert;
9150         XLogRecPtr      current_recptr;
9151
9152         LWLockAcquire(WALInsertLock, LW_SHARED);
9153         INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
9154         LWLockRelease(WALInsertLock);
9155
9156         return current_recptr;
9157 }
9158
9159 /*
9160  * Get latest WAL write pointer
9161  */
9162 XLogRecPtr
9163 GetXLogWriteRecPtr(void)
9164 {
9165         {
9166                 /* use volatile pointer to prevent code rearrangement */
9167                 volatile XLogCtlData *xlogctl = XLogCtl;
9168
9169                 SpinLockAcquire(&xlogctl->info_lck);
9170                 LogwrtResult = xlogctl->LogwrtResult;
9171                 SpinLockRelease(&xlogctl->info_lck);
9172         }
9173
9174         return LogwrtResult.Write;
9175 }
9176
9177 /*
9178  * Returns the redo pointer of the last checkpoint or restartpoint. This is
9179  * the oldest point in WAL that we still need, if we have to restart recovery.
9180  */
9181 void
9182 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
9183 {
9184         LWLockAcquire(ControlFileLock, LW_SHARED);
9185         *oldrecptr = ControlFile->checkPointCopy.redo;
9186         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
9187         LWLockRelease(ControlFileLock);
9188 }
9189
9190 /*
9191  * read_backup_label: check to see if a backup_label file is present
9192  *
9193  * If we see a backup_label during recovery, we assume that we are recovering
9194  * from a backup dump file, and we therefore roll forward from the checkpoint
9195  * identified by the label file, NOT what pg_control says.      This avoids the
9196  * problem that pg_control might have been archived one or more checkpoints
9197  * later than the start of the dump, and so if we rely on it as the start
9198  * point, we will fail to restore a consistent database state.
9199  *
9200  * Returns TRUE if a backup_label was found (and fills the checkpoint
9201  * location and its REDO location into *checkPointLoc and RedoStartLSN,
9202  * respectively); returns FALSE if not. If this backup_label came from a
9203  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
9204  * was created during recovery, *backupFromStandby is set to TRUE.
9205  */
9206 static bool
9207 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
9208                                   bool *backupFromStandby)
9209 {
9210         char            startxlogfilename[MAXFNAMELEN];
9211         TimeLineID      tli;
9212         FILE       *lfp;
9213         char            ch;
9214         char            backuptype[20];
9215         char            backupfrom[20];
9216         uint32          hi,
9217                                 lo;
9218
9219         *backupEndRequired = false;
9220         *backupFromStandby = false;
9221
9222         /*
9223          * See if label file is present
9224          */
9225         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9226         if (!lfp)
9227         {
9228                 if (errno != ENOENT)
9229                         ereport(FATAL,
9230                                         (errcode_for_file_access(),
9231                                          errmsg("could not read file \"%s\": %m",
9232                                                         BACKUP_LABEL_FILE)));
9233                 return false;                   /* it's not there, all is fine */
9234         }
9235
9236         /*
9237          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
9238          * is pretty crude, but we are not expecting any variability in the file
9239          * format).
9240          */
9241         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
9242                            &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
9243                 ereport(FATAL,
9244                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9245                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9246         RedoStartLSN = ((uint64) hi) << 32 | lo;
9247         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
9248                            &hi, &lo, &ch) != 3 || ch != '\n')
9249                 ereport(FATAL,
9250                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9251                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9252         *checkPointLoc = ((uint64) hi) << 32 | lo;
9253
9254         /*
9255          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
9256          * from an older backup anyway, but since the information on it is not
9257          * strictly required, don't error out if it's missing for some reason.
9258          */
9259         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
9260         {
9261                 if (strcmp(backuptype, "streamed") == 0)
9262                         *backupEndRequired = true;
9263         }
9264
9265         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
9266         {
9267                 if (strcmp(backupfrom, "standby") == 0)
9268                         *backupFromStandby = true;
9269         }
9270
9271         if (ferror(lfp) || FreeFile(lfp))
9272                 ereport(FATAL,
9273                                 (errcode_for_file_access(),
9274                                  errmsg("could not read file \"%s\": %m",
9275                                                 BACKUP_LABEL_FILE)));
9276
9277         return true;
9278 }
9279
9280 /*
9281  * Error context callback for errors occurring during rm_redo().
9282  */
9283 static void
9284 rm_redo_error_callback(void *arg)
9285 {
9286         XLogRecord *record = (XLogRecord *) arg;
9287         StringInfoData buf;
9288
9289         initStringInfo(&buf);
9290         RmgrTable[record->xl_rmid].rm_desc(&buf,
9291                                                                            record->xl_info,
9292                                                                            XLogRecGetData(record));
9293
9294         /* don't bother emitting empty description */
9295         if (buf.len > 0)
9296                 errcontext("xlog redo %s", buf.data);
9297
9298         pfree(buf.data);
9299 }
9300
9301 /*
9302  * BackupInProgress: check if online backup mode is active
9303  *
9304  * This is done by checking for existence of the "backup_label" file.
9305  */
9306 bool
9307 BackupInProgress(void)
9308 {
9309         struct stat stat_buf;
9310
9311         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
9312 }
9313
9314 /*
9315  * CancelBackup: rename the "backup_label" file to cancel backup mode
9316  *
9317  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
9318  * Note that this will render an online backup in progress useless.
9319  * To correctly finish an online backup, pg_stop_backup must be called.
9320  */
9321 void
9322 CancelBackup(void)
9323 {
9324         struct stat stat_buf;
9325
9326         /* if the file is not there, return */
9327         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
9328                 return;
9329
9330         /* remove leftover file from previously canceled backup if it exists */
9331         unlink(BACKUP_LABEL_OLD);
9332
9333         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
9334         {
9335                 ereport(LOG,
9336                                 (errmsg("online backup mode canceled"),
9337                                  errdetail("\"%s\" was renamed to \"%s\".",
9338                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
9339         }
9340         else
9341         {
9342                 ereport(WARNING,
9343                                 (errcode_for_file_access(),
9344                                  errmsg("online backup mode was not canceled"),
9345                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
9346                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
9347         }
9348 }
9349
9350 /*
9351  * Read the XLOG page containing RecPtr into readBuf (if not read already).
9352  * Returns number of bytes read, if the page is read successfully, or -1
9353  * in case of errors.  When errors occur, they are ereport'ed, but only
9354  * if they have not been previously reported.
9355  *
9356  * This is responsible for restoring files from archive as needed, as well
9357  * as for waiting for the requested WAL record to arrive in standby mode.
9358  *
9359  * 'emode' specifies the log level used for reporting "file not found" or
9360  * "end of WAL" situations in archive recovery, or in standby mode when a
9361  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
9362  * false in those situations, on higher log levels the ereport() won't
9363  * return.
9364  *
9365  * In standby mode, if after a successful return of XLogPageRead() the
9366  * caller finds the record it's interested in to be broken, it should
9367  * ereport the error with the level determined by
9368  * emode_for_corrupt_record(), and then set lastSourceFailed
9369  * and call XLogPageRead() again with the same arguments. This lets
9370  * XLogPageRead() to try fetching the record from another source, or to
9371  * sleep and retry.
9372  */
9373 static int
9374 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
9375                          XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
9376 {
9377         XLogPageReadPrivate *private =
9378         (XLogPageReadPrivate *) xlogreader->private_data;
9379         int                     emode = private->emode;
9380         uint32          targetPageOff;
9381         XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
9382
9383         XLByteToSeg(targetPagePtr, targetSegNo);
9384         targetPageOff = targetPagePtr % XLogSegSize;
9385
9386         /*
9387          * See if we need to switch to a new segment because the requested record
9388          * is not in the currently open one.
9389          */
9390         if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
9391         {
9392                 /*
9393                  * Request a restartpoint if we've replayed too much xlog since the
9394                  * last one.
9395                  */
9396                 if (StandbyModeRequested && bgwriterLaunched)
9397                 {
9398                         if (XLogCheckpointNeeded(readSegNo))
9399                         {
9400                                 (void) GetRedoRecPtr();
9401                                 if (XLogCheckpointNeeded(readSegNo))
9402                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
9403                         }
9404                 }
9405
9406                 close(readFile);
9407                 readFile = -1;
9408                 readSource = 0;
9409         }
9410
9411         XLByteToSeg(targetPagePtr, readSegNo);
9412
9413 retry:
9414         /* See if we need to retrieve more data */
9415         if (readFile < 0 ||
9416                 (readSource == XLOG_FROM_STREAM &&
9417                  receivedUpto < targetPagePtr + reqLen))
9418         {
9419                 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
9420                                                                                  private->randAccess,
9421                                                                                  private->fetching_ckpt,
9422                                                                                  targetRecPtr))
9423                 {
9424                         if (readFile >= 0)
9425                                 close(readFile);
9426                         readFile = -1;
9427                         readLen = 0;
9428                         readSource = 0;
9429
9430                         return -1;
9431                 }
9432         }
9433
9434         /*
9435          * At this point, we have the right segment open and if we're streaming we
9436          * know the requested record is in it.
9437          */
9438         Assert(readFile != -1);
9439
9440         /*
9441          * If the current segment is being streamed from master, calculate how
9442          * much of the current page we have received already. We know the
9443          * requested record has been received, but this is for the benefit of
9444          * future calls, to allow quick exit at the top of this function.
9445          */
9446         if (readSource == XLOG_FROM_STREAM)
9447         {
9448                 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
9449                         readLen = XLOG_BLCKSZ;
9450                 else
9451                         readLen = receivedUpto % XLogSegSize - targetPageOff;
9452         }
9453         else
9454                 readLen = XLOG_BLCKSZ;
9455
9456         /* Read the requested page */
9457         readOff = targetPageOff;
9458         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
9459         {
9460                 char            fname[MAXFNAMELEN];
9461
9462                 XLogFileName(fname, curFileTLI, readSegNo);
9463                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
9464                                 (errcode_for_file_access(),
9465                                  errmsg("could not seek in log segment %s to offset %u: %m",
9466                                                 fname, readOff)));
9467                 goto next_record_is_invalid;
9468         }
9469
9470         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
9471         {
9472                 char            fname[MAXFNAMELEN];
9473
9474                 XLogFileName(fname, curFileTLI, readSegNo);
9475                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
9476                                 (errcode_for_file_access(),
9477                                  errmsg("could not read from log segment %s, offset %u: %m",
9478                                                 fname, readOff)));
9479                 goto next_record_is_invalid;
9480         }
9481
9482         Assert(targetSegNo == readSegNo);
9483         Assert(targetPageOff == readOff);
9484         Assert(reqLen <= readLen);
9485
9486         *readTLI = curFileTLI;
9487         return readLen;
9488
9489 next_record_is_invalid:
9490         lastSourceFailed = true;
9491
9492         if (readFile >= 0)
9493                 close(readFile);
9494         readFile = -1;
9495         readLen = 0;
9496         readSource = 0;
9497
9498         /* In standby-mode, keep trying */
9499         if (StandbyMode)
9500                 goto retry;
9501         else
9502                 return -1;
9503 }
9504
9505 /*
9506  * Open the WAL segment containing WAL position 'RecPtr'.
9507  *
9508  * The segment can be fetched via restore_command, or via walreceiver having
9509  * streamed the record, or it can already be present in pg_xlog. Checking
9510  * pg_xlog is mainly for crash recovery, but it will be polled in standby mode
9511  * too, in case someone copies a new segment directly to pg_xlog. That is not
9512  * documented or recommended, though.
9513  *
9514  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
9515  * prepare to read WAL starting from RedoStartLSN after this.
9516  *
9517  * 'RecPtr' might not point to the beginning of the record we're interested
9518  * in, it might also point to the page or segment header. In that case,
9519  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
9520  * used to decide which timeline to stream the requested WAL from.
9521  *
9522  * If the the record is not immediately available, the function returns false
9523  * if we're not in standby mode. In standby mode, waits for it to become
9524  * available.
9525  *
9526  * When the requested record becomes available, the function opens the file
9527  * containing it (if not open already), and returns true. When end of standby
9528  * mode is triggered by the user, and there is no more WAL available, returns
9529  * false.
9530  */
9531 static bool
9532 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
9533                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
9534 {
9535         static pg_time_t last_fail_time = 0;
9536         pg_time_t       now;
9537
9538         /*-------
9539          * Standby mode is implemented by a state machine:
9540          *
9541          * 1. Read from archive (XLOG_FROM_ARCHIVE)
9542          * 2. Read from pg_xlog (XLOG_FROM_PG_XLOG)
9543          * 3. Check trigger file
9544          * 4. Read from primary server via walreceiver (XLOG_FROM_STREAM)
9545          * 5. Rescan timelines
9546          * 6. Sleep 5 seconds, and loop back to 1.
9547          *
9548          * Failure to read from the current source advances the state machine to
9549          * the next state. In addition, successfully reading a file from pg_xlog
9550          * moves the state machine from state 2 back to state 1 (we always prefer
9551          * files in the archive over files in pg_xlog).
9552          *
9553          * 'currentSource' indicates the current state. There are no currentSource
9554          * values for "check trigger", "rescan timelines", and "sleep" states,
9555          * those actions are taken when reading from the previous source fails, as
9556          * part of advancing to the next state.
9557          *-------
9558          */
9559         if (!InArchiveRecovery)
9560                 currentSource = XLOG_FROM_PG_XLOG;
9561         else if (currentSource == 0)
9562                 currentSource = XLOG_FROM_ARCHIVE;
9563
9564         for (;;)
9565         {
9566                 int                     oldSource = currentSource;
9567
9568                 /*
9569                  * First check if we failed to read from the current source, and
9570                  * advance the state machine if so. The failure to read might've
9571                  * happened outside this function, e.g when a CRC check fails on a
9572                  * record, or within this loop.
9573                  */
9574                 if (lastSourceFailed)
9575                 {
9576                         switch (currentSource)
9577                         {
9578                                 case XLOG_FROM_ARCHIVE:
9579                                         currentSource = XLOG_FROM_PG_XLOG;
9580                                         break;
9581
9582                                 case XLOG_FROM_PG_XLOG:
9583
9584                                         /*
9585                                          * Check to see if the trigger file exists. Note that we
9586                                          * do this only after failure, so when you create the
9587                                          * trigger file, we still finish replaying as much as we
9588                                          * can from archive and pg_xlog before failover.
9589                                          */
9590                                         if (StandbyMode && CheckForStandbyTrigger())
9591                                         {
9592                                                 ShutdownWalRcv();
9593                                                 return false;
9594                                         }
9595
9596                                         /*
9597                                          * Not in standby mode, and we've now tried the archive
9598                                          * and pg_xlog.
9599                                          */
9600                                         if (!StandbyMode)
9601                                                 return false;
9602
9603                                         /*
9604                                          * If primary_conninfo is set, launch walreceiver to try
9605                                          * to stream the missing WAL.
9606                                          *
9607                                          * If fetching_ckpt is TRUE, RecPtr points to the initial
9608                                          * checkpoint location. In that case, we use RedoStartLSN
9609                                          * as the streaming start position instead of RecPtr, so
9610                                          * that when we later jump backwards to start redo at
9611                                          * RedoStartLSN, we will have the logs streamed already.
9612                                          */
9613                                         if (PrimaryConnInfo)
9614                                         {
9615                                                 XLogRecPtr      ptr;
9616                                                 TimeLineID      tli;
9617
9618                                                 if (fetching_ckpt)
9619                                                 {
9620                                                         ptr = RedoStartLSN;
9621                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
9622                                                 }
9623                                                 else
9624                                                 {
9625                                                         ptr = tliRecPtr;
9626                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
9627
9628                                                         if (curFileTLI > 0 && tli < curFileTLI)
9629                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
9630                                                                          (uint32) (ptr >> 32), (uint32) ptr,
9631                                                                          tli, curFileTLI);
9632                                                 }
9633                                                 curFileTLI = tli;
9634                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo);
9635                                                 receivedUpto = 0;
9636                                         }
9637
9638                                         /*
9639                                          * Move to XLOG_FROM_STREAM state in either case. We'll
9640                                          * get immediate failure if we didn't launch walreceiver,
9641                                          * and move on to the next state.
9642                                          */
9643                                         currentSource = XLOG_FROM_STREAM;
9644                                         break;
9645
9646                                 case XLOG_FROM_STREAM:
9647
9648                                         /*
9649                                          * Failure while streaming. Most likely, we got here
9650                                          * because streaming replication was terminated, or
9651                                          * promotion was triggered. But we also get here if we
9652                                          * find an invalid record in the WAL streamed from master,
9653                                          * in which case something is seriously wrong. There's
9654                                          * little chance that the problem will just go away, but
9655                                          * PANIC is not good for availability either, especially
9656                                          * in hot standby mode. So, we treat that the same as
9657                                          * disconnection, and retry from archive/pg_xlog again.
9658                                          * The WAL in the archive should be identical to what was
9659                                          * streamed, so it's unlikely that it helps, but one can
9660                                          * hope...
9661                                          */
9662
9663                                         /*
9664                                          * Before we leave XLOG_FROM_STREAM state, make sure that
9665                                          * walreceiver is not active, so that it won't overwrite
9666                                          * WAL that we restore from archive.
9667                                          */
9668                                         if (WalRcvStreaming())
9669                                                 ShutdownWalRcv();
9670
9671                                         /*
9672                                          * Before we sleep, re-scan for possible new timelines if
9673                                          * we were requested to recover to the latest timeline.
9674                                          */
9675                                         if (recoveryTargetIsLatest)
9676                                         {
9677                                                 if (rescanLatestTimeLine())
9678                                                 {
9679                                                         currentSource = XLOG_FROM_ARCHIVE;
9680                                                         break;
9681                                                 }
9682                                         }
9683
9684                                         /*
9685                                          * XLOG_FROM_STREAM is the last state in our state
9686                                          * machine, so we've exhausted all the options for
9687                                          * obtaining the requested WAL. We're going to loop back
9688                                          * and retry from the archive, but if it hasn't been long
9689                                          * since last attempt, sleep 5 seconds to avoid
9690                                          * busy-waiting.
9691                                          */
9692                                         now = (pg_time_t) time(NULL);
9693                                         if ((now - last_fail_time) < 5)
9694                                         {
9695                                                 pg_usleep(1000000L * (5 - (now - last_fail_time)));
9696                                                 now = (pg_time_t) time(NULL);
9697                                         }
9698                                         last_fail_time = now;
9699                                         currentSource = XLOG_FROM_ARCHIVE;
9700                                         break;
9701
9702                                 default:
9703                                         elog(ERROR, "unexpected WAL source %d", currentSource);
9704                         }
9705                 }
9706                 else if (currentSource == XLOG_FROM_PG_XLOG)
9707                 {
9708                         /*
9709                          * We just successfully read a file in pg_xlog. We prefer files in
9710                          * the archive over ones in pg_xlog, so try the next file again
9711                          * from the archive first.
9712                          */
9713                         if (InArchiveRecovery)
9714                                 currentSource = XLOG_FROM_ARCHIVE;
9715                 }
9716
9717                 if (currentSource != oldSource)
9718                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
9719                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
9720                                  lastSourceFailed ? "failure" : "success");
9721
9722                 /*
9723                  * We've now handled possible failure. Try to read from the chosen
9724                  * source.
9725                  */
9726                 lastSourceFailed = false;
9727
9728                 switch (currentSource)
9729                 {
9730                         case XLOG_FROM_ARCHIVE:
9731                         case XLOG_FROM_PG_XLOG:
9732                                 /* Close any old file we might have open. */
9733                                 if (readFile >= 0)
9734                                 {
9735                                         close(readFile);
9736                                         readFile = -1;
9737                                 }
9738                                 /* Reset curFileTLI if random fetch. */
9739                                 if (randAccess)
9740                                         curFileTLI = 0;
9741
9742                                 /*
9743                                  * Try to restore the file from archive, or read an existing
9744                                  * file from pg_xlog.
9745                                  */
9746                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, currentSource);
9747                                 if (readFile >= 0)
9748                                         return true;    /* success! */
9749
9750                                 /*
9751                                  * Nope, not found in archive or pg_xlog.
9752                                  */
9753                                 lastSourceFailed = true;
9754                                 break;
9755
9756                         case XLOG_FROM_STREAM:
9757                                 {
9758                                         bool            havedata;
9759
9760                                         /*
9761                                          * Check if WAL receiver is still active.
9762                                          */
9763                                         if (!WalRcvStreaming())
9764                                         {
9765                                                 lastSourceFailed = true;
9766                                                 break;
9767                                         }
9768
9769                                         /*
9770                                          * Walreceiver is active, so see if new data has arrived.
9771                                          *
9772                                          * We only advance XLogReceiptTime when we obtain fresh
9773                                          * WAL from walreceiver and observe that we had already
9774                                          * processed everything before the most recent "chunk"
9775                                          * that it flushed to disk.  In steady state where we are
9776                                          * keeping up with the incoming data, XLogReceiptTime will
9777                                          * be updated on each cycle. When we are behind,
9778                                          * XLogReceiptTime will not advance, so the grace time
9779                                          * allotted to conflicting queries will decrease.
9780                                          */
9781                                         if (RecPtr < receivedUpto)
9782                                                 havedata = true;
9783                                         else
9784                                         {
9785                                                 XLogRecPtr      latestChunkStart;
9786
9787                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
9788                                                 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
9789                                                 {
9790                                                         havedata = true;
9791                                                         if (latestChunkStart <= RecPtr)
9792                                                         {
9793                                                                 XLogReceiptTime = GetCurrentTimestamp();
9794                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
9795                                                         }
9796                                                 }
9797                                                 else
9798                                                         havedata = false;
9799                                         }
9800                                         if (havedata)
9801                                         {
9802                                                 /*
9803                                                  * Great, streamed far enough.  Open the file if it's
9804                                                  * not open already.  Also read the timeline history
9805                                                  * file if we haven't initialized timeline history
9806                                                  * yet; it should be streamed over and present in
9807                                                  * pg_xlog by now.      Use XLOG_FROM_STREAM so that
9808                                                  * source info is set correctly and XLogReceiptTime
9809                                                  * isn't changed.
9810                                                  */
9811                                                 if (readFile < 0)
9812                                                 {
9813                                                         if (!expectedTLEs)
9814                                                                 expectedTLEs = readTimeLineHistory(receiveTLI);
9815                                                         readFile = XLogFileRead(readSegNo, PANIC,
9816                                                                                                         receiveTLI,
9817                                                                                                         XLOG_FROM_STREAM, false);
9818                                                         Assert(readFile >= 0);
9819                                                 }
9820                                                 else
9821                                                 {
9822                                                         /* just make sure source info is correct... */
9823                                                         readSource = XLOG_FROM_STREAM;
9824                                                         XLogReceiptSource = XLOG_FROM_STREAM;
9825                                                         return true;
9826                                                 }
9827                                                 break;
9828                                         }
9829
9830                                         /*
9831                                          * Data not here yet. Check for trigger, then wait for
9832                                          * walreceiver to wake us up when new WAL arrives.
9833                                          */
9834                                         if (CheckForStandbyTrigger())
9835                                         {
9836                                                 /*
9837                                                  * Note that we don't "return false" immediately here.
9838                                                  * After being triggered, we still want to replay all
9839                                                  * the WAL that was already streamed. It's in pg_xlog
9840                                                  * now, so we just treat this as a failure, and the
9841                                                  * state machine will move on to replay the streamed
9842                                                  * WAL from pg_xlog, and then recheck the trigger and
9843                                                  * exit replay.
9844                                                  */
9845                                                 lastSourceFailed = true;
9846                                                 break;
9847                                         }
9848
9849                                         /*
9850                                          * Wait for more WAL to arrive. Time out after 5 seconds,
9851                                          * like when polling the archive, to react to a trigger
9852                                          * file promptly.
9853                                          */
9854                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
9855                                                           WL_LATCH_SET | WL_TIMEOUT,
9856                                                           5000L);
9857                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
9858                                         break;
9859                                 }
9860
9861                         default:
9862                                 elog(ERROR, "unexpected WAL source %d", currentSource);
9863                 }
9864
9865                 /*
9866                  * This possibly-long loop needs to handle interrupts of startup
9867                  * process.
9868                  */
9869                 HandleStartupProcInterrupts();
9870         } while (StandbyMode);
9871
9872         return false;
9873 }
9874
9875 /*
9876  * Determine what log level should be used to report a corrupt WAL record
9877  * in the current WAL page, previously read by XLogPageRead().
9878  *
9879  * 'emode' is the error mode that would be used to report a file-not-found
9880  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
9881  * we're retrying the exact same record that we've tried previously, only
9882  * complain the first time to keep the noise down.      However, we only do when
9883  * reading from pg_xlog, because we don't expect any invalid records in archive
9884  * or in records streamed from master. Files in the archive should be complete,
9885  * and we should never hit the end of WAL because we stop and wait for more WAL
9886  * to arrive before replaying it.
9887  *
9888  * NOTE: This function remembers the RecPtr value it was last called with,
9889  * to suppress repeated messages about the same record. Only call this when
9890  * you are about to ereport(), or you might cause a later message to be
9891  * erroneously suppressed.
9892  */
9893 static int
9894 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
9895 {
9896         static XLogRecPtr lastComplaint = 0;
9897
9898         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
9899         {
9900                 if (RecPtr == lastComplaint)
9901                         emode = DEBUG1;
9902                 else
9903                         lastComplaint = RecPtr;
9904         }
9905         return emode;
9906 }
9907
9908 /*
9909  * Check to see whether the user-specified trigger file exists and whether a
9910  * promote request has arrived.  If either condition holds, return true.
9911  */
9912 static bool
9913 CheckForStandbyTrigger(void)
9914 {
9915         struct stat stat_buf;
9916         static bool triggered = false;
9917
9918         if (triggered)
9919                 return true;
9920
9921         if (IsPromoteTriggered())
9922         {
9923                 /*
9924                  * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
9925                  * signal handler. We now leave the file in place and let the Startup
9926                  * process do the unlink. This allows Startup to know whether we're
9927                  * doing fast or normal promotion. Fast promotion takes precedence.
9928                  */
9929                 if (stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
9930                 {
9931                         unlink(FAST_PROMOTE_SIGNAL_FILE);
9932                         unlink(PROMOTE_SIGNAL_FILE);
9933                         fast_promote = true;
9934                 }
9935                 else if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
9936                 {
9937                         unlink(PROMOTE_SIGNAL_FILE);
9938                         fast_promote = false;
9939                 }
9940
9941                 ereport(LOG, (errmsg("received promote request")));
9942
9943                 ResetPromoteTriggered();
9944                 triggered = true;
9945                 return true;
9946         }
9947
9948         if (TriggerFile == NULL)
9949                 return false;
9950
9951         if (stat(TriggerFile, &stat_buf) == 0)
9952         {
9953                 ereport(LOG,
9954                                 (errmsg("trigger file found: %s", TriggerFile)));
9955                 unlink(TriggerFile);
9956                 triggered = true;
9957                 fast_promote = true;
9958                 return true;
9959         }
9960         return false;
9961 }
9962
9963 /*
9964  * Check to see if a promote request has arrived. Should be
9965  * called by postmaster after receiving SIGUSR1.
9966  */
9967 bool
9968 CheckPromoteSignal(void)
9969 {
9970         struct stat stat_buf;
9971
9972         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
9973                 stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
9974                 return true;
9975
9976         return false;
9977 }
9978
9979 /*
9980  * Wake up startup process to replay newly arrived WAL, or to notice that
9981  * failover has been requested.
9982  */
9983 void
9984 WakeupRecovery(void)
9985 {
9986         SetLatch(&XLogCtl->recoveryWakeupLatch);
9987 }
9988
9989 /*
9990  * Update the WalWriterSleeping flag.
9991  */
9992 void
9993 SetWalWriterSleeping(bool sleeping)
9994 {
9995         /* use volatile pointer to prevent code rearrangement */
9996         volatile XLogCtlData *xlogctl = XLogCtl;
9997
9998         SpinLockAcquire(&xlogctl->info_lck);
9999         xlogctl->WalWriterSleeping = sleeping;
10000         SpinLockRelease(&xlogctl->info_lck);
10001 }