granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlog.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <time.h>
  19 #include <fcntl.h>
  20 #include <sys/stat.h>
  21 #include <sys/time.h>
  22 #include <unistd.h>
  23
  24 #include "access/clog.h"
  25 #include "access/multixact.h"
  26 #include "access/rewriteheap.h"
  27 #include "access/subtrans.h"
  28 #include "access/timeline.h"
  29 #include "access/transam.h"
  30 #include "access/tuptoaster.h"
  31 #include "access/twophase.h"
  32 #include "access/xact.h"
  33 #include "access/xlog_internal.h"
  34 #include "access/xlogreader.h"
  35 #include "access/xlogutils.h"
  36 #include "catalog/catversion.h"
  37 #include "catalog/pg_control.h"
  38 #include "catalog/pg_database.h"
  39 #include "miscadmin.h"
  40 #include "pgstat.h"
  41 #include "postmaster/bgwriter.h"
  42 #include "postmaster/startup.h"
  43 #include "replication/logical.h"
  44 #include "replication/slot.h"
  45 #include "replication/snapbuild.h"
  46 #include "replication/walreceiver.h"
  47 #include "replication/walsender.h"
  48 #include "storage/barrier.h"
  49 #include "storage/bufmgr.h"
  50 #include "storage/fd.h"
  51 #include "storage/ipc.h"
  52 #include "storage/latch.h"
  53 #include "storage/pmsignal.h"
  54 #include "storage/predicate.h"
  55 #include "storage/proc.h"
  56 #include "storage/procarray.h"
  57 #include "storage/reinit.h"
  58 #include "storage/smgr.h"
  59 #include "storage/spin.h"
  60 #include "utils/builtins.h"
  61 #include "utils/guc.h"
  62 #include "utils/ps_status.h"
  63 #include "utils/relmapper.h"
  64 #include "utils/snapmgr.h"
  65 #include "utils/timestamp.h"
  66 #include "pg_trace.h"
  67
  68 extern uint32 bootstrap_data_checksum_version;
  69
  70 /* File path names (all relative to $PGDATA) */
  71 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  72 #define RECOVERY_COMMAND_DONE   "recovery.done"
  73 #define PROMOTE_SIGNAL_FILE             "promote"
  74 #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
  75
  76
  77 /* User-settable parameters */
  78 int                     CheckPointSegments = 3;        /* also determines XLOGfileslop */
  79 int                     wal_keep_segments = 0;
  80 int                     XLOGbuffers = -1;
  81 int                     XLogArchiveTimeout = 0;
  82 bool            XLogArchiveMode = false;
  83 char       *XLogArchiveCommand = NULL;
  84 bool            EnableHotStandby = false;
  85 bool            fullPageWrites = true;  /* process-local; shared master copy is in XLogCtlInsert */
  86 bool            wal_log_hints = false;
  87 bool            log_checkpoints = false;
  88 int                     sync_method = DEFAULT_SYNC_METHOD;     /* a SYNC_METHOD_* code; see sync_method_options */
  89 int                     wal_level = WAL_LEVEL_MINIMAL;
  90 int                     CommitDelay = 0;        /* precommit delay in microseconds */
  91 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
  92 int                     num_xloginsert_locks = 8;      /* NOTE(review): presumably the number of WAL insertion locks -- confirm */
  93
  94 #ifdef WAL_DEBUG
  95 bool            XLOG_DEBUG = false;
  96 #endif
  97
  98 /*
  99  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
 100  * When we are done with an old XLOG segment file, we will recycle it as a
 101  * future XLOG segment as long as there aren't already XLOGfileslop future
 102  * segments; else we'll delete it.  This could be made a separate GUC
 103  * variable, but at present it seems sufficient to hardwire it as
 104  * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
 105  * no more than 2*CheckPointSegments log segments, and we want to recycle all
 106  * of them; the +1 allows boundary cases to happen without wasting a
 107  * delete/create-segment cycle.  (The macro is re-evaluated at each use.)
 108  */
 109 #define XLOGfileslop    (2*CheckPointSegments + 1)
 110
 111
 112 /*
 113  * GUC support: the sync_method names this build accepts, with their codes
 114  */
 115 const struct config_enum_entry sync_method_options[] = {
 116         {"fsync", SYNC_METHOD_FSYNC, false},
 117 #ifdef HAVE_FSYNC_WRITETHROUGH
 118         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
 119 #endif
 120 #ifdef HAVE_FDATASYNC
 121         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
 122 #endif
 123 #ifdef OPEN_SYNC_FLAG
 124         {"open_sync", SYNC_METHOD_OPEN, false},
 125 #endif
 126 #ifdef OPEN_DATASYNC_FLAG
 127         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
 128 #endif
 129         {NULL, 0, false}        /* terminating entry */
 130 };
 131
 132 /*
 133  * Statistics for current checkpoint are collected in this global struct.
 134  * Because only the checkpointer or a stand-alone backend can perform
 135  * checkpoints, this will be unused in normal backends.
 136  */
 137 CheckpointStatsData CheckpointStats;
 138
 139 /*
 140  * ThisTimeLineID will be same in all backends --- it identifies current
 141  * WAL timeline for the database system.
 142  */
 143 TimeLineID      ThisTimeLineID = 0;
 144
 145 /*
 146  * Are we doing recovery from XLOG?
 147  *
 148  * This is only ever true in the startup process; it should be read as meaning
 149  * "this process is replaying WAL records", rather than "the system is in
 150  * recovery mode".  It should be examined primarily by functions that need
 151  * to act differently when called from a WAL redo function (e.g., to skip WAL
 152  * logging).  To check whether the system is in recovery regardless of which
 153  * process you're running in, use RecoveryInProgress() but only after shared
 154  * memory startup and lock initialization.
 155  */
 156 bool            InRecovery = false;
 157
 158 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
 159 HotStandbyState standbyState = STANDBY_DISABLED;
 160
 161 static XLogRecPtr LastRec;
 162
 163 /* Local copy of WalRcv->receivedUpto */
 164 static XLogRecPtr receivedUpto = 0;
 165 static TimeLineID receiveTLI = 0;
 166
 167 /*
 168  * During recovery, lastFullPageWrites keeps track of full_page_writes that
 169  * the replayed WAL records indicate. It's initialized with full_page_writes
 170  * that the recovery starting checkpoint record indicates, and then updated
 171  * each time XLOG_FPW_CHANGE record is replayed.
 172  */
 173 static bool lastFullPageWrites;
 174
 175 /*
 176  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 177  * known, need to check the shared state".
 178  */
 179 static bool LocalRecoveryInProgress = true;
 180
 181 /*
 182  * Local copy of SharedHotStandbyActive variable. False actually means "not
 183  * known, need to check the shared state".
 184  */
 185 static bool LocalHotStandbyActive = false;
 186
 187 /*
 188  * Local state for XLogInsertAllowed():
 189  *              1: unconditionally allowed to insert XLOG
 190  *              0: unconditionally not allowed to insert XLOG
 191  *              -1: must check RecoveryInProgress(); disallow until it is false
 192  * Most processes start with -1 and transition to 1 after seeing that recovery
 193  * is not in progress.  But we can also force the value for special cases.
 194  * The coding in XLogInsertAllowed() depends on the first two of these states
 195  * being numerically the same as bool true and false.
 196  */
 197 static int      LocalXLogInsertAllowed = -1;
 198
 199 /*
 200  * When ArchiveRecoveryRequested is set, archive recovery was requested,
 201  * i.e. a recovery.conf file was present. When InArchiveRecovery is set, we are
 202  * currently recovering using offline XLOG archives. These variables are only
 203  * valid in the startup process.
 204  *
 205  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 206  * currently performing crash recovery using only XLOG files in pg_xlog, but
 207  * will switch to using offline XLOG archives as soon as we reach the end of
 208  * WAL in pg_xlog.
 209  */
 210 bool            ArchiveRecoveryRequested = false;
 211 bool            InArchiveRecovery = false;
 212
 213 /* Was the last xlog file restored from archive, or local? */
 214 static bool restoredFromArchive = false;
 215
 216 /* options taken from recovery.conf for archive recovery */
 217 char       *recoveryRestoreCommand = NULL;
 218 static char *recoveryEndCommand = NULL;
 219 static char *archiveCleanupCommand = NULL;
 220 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
 221 static bool recoveryTargetInclusive = true;
 222 static bool recoveryPauseAtTarget = true;
 223 static TransactionId recoveryTargetXid;
 224 static TimestampTz recoveryTargetTime;
 225 static char *recoveryTargetName;
 226 static int min_recovery_apply_delay = 0;
 227 static TimestampTz recoveryDelayUntilTime;
 228
 229 /* options taken from recovery.conf for XLOG streaming */
 230 static bool StandbyModeRequested = false;
 231 static char *PrimaryConnInfo = NULL;
 232 static char *PrimarySlotName = NULL;
 233 static char *TriggerFile = NULL;
 234
 235 /* are we currently in standby mode? */
 236 bool            StandbyMode = false;
 237
 238 /* whether request for fast promotion has been made yet */
 239 static bool fast_promote = false;
 240
 241 /*
 242  * If recoveryStopsBefore/After returns true, it saves information about the
 243  * stop point here.
 244  */
 245 static TransactionId recoveryStopXid;
 246 static TimestampTz recoveryStopTime;
 247 static char recoveryStopName[MAXFNAMELEN];
 248 static bool recoveryStopAfter;
 249
 250 /*
 251  * During normal operation, the only timeline we care about is ThisTimeLineID.
 252  * During recovery, however, things are more complicated.  To simplify life
 253  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 254  * scan through the WAL history (that is, it is the line that was active when
 255  * the currently-scanned WAL record was generated).  We also need these
 256  * timeline values:
 257  *
 258  * recoveryTargetTLI: the desired timeline that we want to end in.
 259  *
 260  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 261  *
 262  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
 263  * its known parents, newest first (so recoveryTargetTLI is always the
 264  * first list member).  Only these TLIs are expected to be seen in the WAL
 265  * segments we read, and indeed only these TLIs will be considered as
 266  * candidate WAL files to open at all.
 267  *
 268  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 269  * (This is not necessarily the same as ThisTimeLineID, because we could
 270  * be scanning data that was copied from an ancestor timeline when the current
 271  * file was created.)  During a sequential scan we do not allow this value
 272  * to decrease.
 273  */
 274 static TimeLineID recoveryTargetTLI;
 275 static bool recoveryTargetIsLatest = false;
 276 static List *expectedTLEs;
 277 static TimeLineID curFileTLI;
 278
 279 /*
 280  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 281  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 282  * end+1 of the last record, and is reset when we end a top-level transaction,
 283  * or start a new one; so it can be used to tell if the current transaction has
 284  * created any XLOG records.
 285  */
 286 static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
 287
 288 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
 289
 290 /*
 291  * RedoRecPtr is this backend's local copy of the REDO record pointer
 292  * (which is almost but not quite the same as a pointer to the most recent
 293  * CHECKPOINT record).  We update this from the shared-memory copy,
 294  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 295  * hold an insertion lock).  See XLogInsert for details.  We are also allowed
 296  * to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 297  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 298  * InitXLOGAccess.
 299  */
 300 static XLogRecPtr RedoRecPtr;
 301
 302 /*
 303  * RedoStartLSN points to the checkpoint's REDO location which is specified
 304  * in a backup label file, backup history file or control file. In standby
 305  * mode, XLOG streaming usually starts from the position where an invalid
 306  * record was found. But if we fail to read even the initial checkpoint
 307  * record, we use the REDO location instead of the checkpoint location as
 308  * the start position of XLOG streaming. Otherwise we would have to jump
 309  * backwards to the REDO location after reading the checkpoint record,
 310  * because the REDO record can precede the checkpoint record.
 311  */
 312 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
 313
 314 /*----------
 315  * Shared-memory data structures for XLOG control
 316  *
 317  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 318  * the log up to (all records before that point must be written or fsynced).
 319  * LogwrtResult indicates the byte positions we have already written/fsynced.
 320  * These structs are identical but are declared separately to indicate their
 321  * slightly different functions.
 322  *
 323  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 324  * WALWriteLock.  To update it, you need to hold both locks.  The point of
 325  * this arrangement is that the value can be examined by code that already
 326  * holds WALWriteLock without needing to grab info_lck as well.  In addition
 327  * to the shared variable, each backend has a private copy of LogwrtResult,
 328  * which is updated when convenient.
 329  *
 330  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 331  * (protected by info_lck), but we don't need to cache any copies of it.
 332  *
 333  * info_lck is only held long enough to read/update the protected variables,
 334  * so it's a plain spinlock.  The other locks are held longer (potentially
 335  * over I/O operations), so we use LWLocks for them.  These locks are:
 336  *
 337  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
 338  * It is only held while initializing and changing the mapping.  If the
 339  * contents of the buffer being replaced haven't been written yet, the mapping
 340  * lock is released while the write is done, and reacquired afterwards.
 341  *
 342  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 343  * XLogFlush).
 344  *
 345  * ControlFileLock: must be held to read/update control file or create
 346  * new log file.
 347  *
 348  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 349  * only one checkpointer at a time; currently, with all checkpoints done by
 350  * the checkpointer, this is just pro forma).
 351  *
 352  *----------
 353  */
 354
 355 typedef struct XLogwrtRqst              /* positions we need to write/flush up to */
 356 {
 357         XLogRecPtr      Write;                  /* last byte + 1 to write out */
 358         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
 359 } XLogwrtRqst;
 360
 361 typedef struct XLogwrtResult            /* positions already written/flushed */
 362 {
 363         XLogRecPtr      Write;                  /* last byte + 1 written out */
 364         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
 365 } XLogwrtResult;
 366
 367 /*
 368  * Inserting to WAL is protected by a small fixed number of WAL insertion
 369  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
 370  * matter which one. To lock out other concurrent insertions, you must hold
 371  * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
 372  * indicator of how far the insertion has progressed (insertingAt).
 373  *
 374  * The insertingAt values are read when a process wants to flush WAL from
 375  * the in-memory buffers to disk, to check that all the insertions to the
 376  * region the process is about to write out have finished. You could simply
 377  * wait for all currently in-progress insertions to finish, but the
 378  * insertingAt indicator allows you to ignore insertions to later in the WAL,
 379  * so that you only wait for the insertions that are modifying the buffers
 380  * you're about to write out.
 381  *
 382  * This isn't just an optimization. If all the WAL buffers are dirty, an
 383  * inserter that's holding a WAL insert lock might need to evict an old WAL
 384  * buffer, which requires flushing the WAL. If it's possible for an inserter
 385  * to block on another inserter unnecessarily, deadlock can arise when two
 386  * inserters holding a WAL insert lock wait for each other to finish their
 387  * insertion.
 388  *
 389  * Small WAL records that don't cross a page boundary never update the value,
 390  * the WAL record is just copied to the page and the lock is released. But
 391  * to avoid the deadlock-scenario explained above, the indicator is always
 392  * updated before sleeping while holding an insertion lock.
 393  */
 394 typedef struct
 395 {
 396         LWLock          lock;                   /* the lightweight lock itself */
 397         XLogRecPtr      insertingAt;    /* how far the insertion has progressed */
 398 } WALInsertLock;
 399
 400 /*
 401  * All the WAL insertion locks are allocated as an array in shared memory. We
 402  * force the array stride to be a power of 2, which saves a few cycles in
 403  * indexing, but more importantly also ensures that individual slots don't
 404  * cross cache line boundaries. (Of course, we have to also ensure that the
 405  * array start address is suitably aligned.)
 406  */
 407 typedef union WALInsertLockPadded
 408 {
 409         WALInsertLock l;                        /* the actual lock */
 410         char            pad[CACHE_LINE_SIZE];   /* makes the union at least CACHE_LINE_SIZE bytes */
 411 } WALInsertLockPadded;
 412
 413 /*
 414  * Shared state data for XLogInsert.
 415  */
 416 typedef struct XLogCtlInsert
 417 {
 418         slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
 419
 420         /*
 421          * CurrBytePos is the end of reserved WAL. The next record will be inserted
 422          * at that position. PrevBytePos is the start position of the previously
 423          * inserted (or rather, reserved) record - it is copied to the prev-link
 424          * of the next record. These are stored as "usable byte positions" rather
 425          * than XLogRecPtrs (see XLogBytePosToRecPtr()).
 426          */
 427         uint64          CurrBytePos;
 428         uint64          PrevBytePos;
 429
 430         /*
 431          * Make sure the above heavily-contended spinlock and byte positions are
 432          * on their own cache line. In particular, the RedoRecPtr and full page
 433          * write variables below should be on a different cache line. They are
 434          * read on every WAL insertion, but updated rarely, and we don't want
 435          * those reads to steal the cache line containing Curr/PrevBytePos.
 436          */
 437         char            pad[CACHE_LINE_SIZE];
 438
 439         /*
 440          * fullPageWrites is the master copy used by all backends to determine
 441          * whether to write full pages to WAL, instead of using the process-local one.
 442          * This is required because, when full_page_writes is changed by SIGHUP,
 443          * we must WAL-log it before it actually affects WAL-logging by backends.
 444          * The checkpointer sets it at startup or after SIGHUP.
 445          *
 446          * To read these fields, you must hold an insertion lock. To modify them,
 447          * you must hold ALL the locks.
 448          */
 449         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
 450         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
 451         bool            fullPageWrites;         /* shared master copy; see above */
 452
 453         /*
 454          * exclusiveBackup is true if a backup started with pg_start_backup() is
 455          * in progress, and nonExclusiveBackups is a counter indicating the number
 456          * of streaming base backups currently in progress. forcePageWrites is set
 457          * to true when either of these is non-zero. lastBackupStart is the latest
 458          * checkpoint redo location used as a starting point for an online backup.
 459          */
 460         bool            exclusiveBackup;
 461         int                     nonExclusiveBackups;
 462         XLogRecPtr      lastBackupStart;
 463
 464         /*
 465          * WAL insertion locks: an array of padded locks allocated in shared memory.
 466          */
 467         WALInsertLockPadded     *WALInsertLocks;
 468         LWLockTranche WALInsertLockTranche;
 469         int                     WALInsertLockTrancheId;
 470 } XLogCtlInsert;
 471
 472 /*
 473  * Total shared-memory state for XLOG.
 474  */
 475 typedef struct XLogCtlData
 476 {
 477         XLogCtlInsert Insert;
 478
 479         /* Protected by info_lck: */
 480         XLogwrtRqst LogwrtRqst;
 481         XLogRecPtr      RedoRecPtr;             /* a recent copy of Insert->RedoRecPtr */
 482         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
 483         TransactionId ckptXid;
 484         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
 485         XLogRecPtr      replicationSlotMinLSN;  /* oldest LSN needed by any slot */
 486
 487         XLogSegNo       lastRemovedSegNo;               /* latest removed/recycled XLOG
 488                                                                                  * segment */
 489
 490         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
 491         XLogRecPtr      unloggedLSN;
 492         slock_t         ulsn_lck;
 493
 494         /* Time of last xlog segment switch. Protected by WALWriteLock. */
 495         pg_time_t       lastSegSwitchTime;
 496
 497         /*
 498          * Protected by info_lck and WALWriteLock (you must hold either lock to
 499          * read it, but both to update)
 500          */
 501         XLogwrtResult LogwrtResult;
 502
 503         /*
 504          * Latest initialized page in the cache (last byte position + 1).
 505          *
 506          * To change the identity of a buffer (and InitializedUpTo), you need to
 507          * hold WALBufMappingLock.  To change the identity of a buffer that's still
 508          * dirty, the old page needs to be written out first, and for that you
 509          * need WALWriteLock, and you need to ensure that there are no in-progress
 510          * insertions to the page by calling WaitXLogInsertionsToFinish().
 511          */
 512         XLogRecPtr      InitializedUpTo;
 513
 514         /*
 515          * These values do not change after startup, although the pointed-to pages
 516          * and xlblocks values certainly do.  xlblock values are protected by
 517          * WALBufMappingLock.
 518          */
 519         char       *pages;                      /* buffers for unwritten XLOG pages */
 520         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
 521         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
 522
 523         /*
 524          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
 525          * If we created a new timeline when the system was started up,
 526          * PrevTimeLineID is the old timeline's ID that we forked off from.
 527          * Otherwise it's equal to ThisTimeLineID.
 528          */
 529         TimeLineID      ThisTimeLineID;
 530         TimeLineID      PrevTimeLineID;
 531
 532         /*
 533          * archiveCleanupCommand is read from recovery.conf but needs to be in
 534          * shared memory so that the checkpointer process can access it.
 535          */
 536         char            archiveCleanupCommand[MAXPGPATH];
 537
 538         /*
 539          * SharedRecoveryInProgress indicates if we're still in crash or archive
 540          * recovery.  Protected by info_lck.
 541          */
 542         bool            SharedRecoveryInProgress;
 543
 544         /*
 545          * SharedHotStandbyActive indicates if hot standby is currently active.
 546          * Protected by info_lck.
 547          */
 548         bool            SharedHotStandbyActive;
 549
 550         /*
 551          * WalWriterSleeping indicates whether the WAL writer is currently in
 552          * low-power mode (and hence should be nudged if an async commit occurs).
 553          * Protected by info_lck.
 554          */
 555         bool            WalWriterSleeping;
 556
 557         /*
 558          * recoveryWakeupLatch is used to wake up the startup process to continue
 559          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
 560          * to appear.
 561          */
 562         Latch           recoveryWakeupLatch;
 563
 564         /*
 565          * During recovery, we keep a copy of the latest checkpoint record here.
 566          * Used by the checkpointer when it wants to create a restartpoint.
 567          *
 568          * Protected by info_lck.
 569          */
 570         XLogRecPtr      lastCheckPointRecPtr;
 571         CheckPoint      lastCheckPoint;
 572
 573         /*
 574          * lastReplayedEndRecPtr points to end+1 of the last record successfully
 575          * replayed. When we're currently replaying a record, ie. in a redo
 576          * function, replayEndRecPtr points to the end+1 of the record being
 577          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
 578          */
 579         XLogRecPtr      lastReplayedEndRecPtr;
 580         TimeLineID      lastReplayedTLI;
 581         XLogRecPtr      replayEndRecPtr;
 582         TimeLineID      replayEndTLI;
 583         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
 584         TimestampTz recoveryLastXTime;
 585         /* current effective recovery target timeline */
 586         TimeLineID      RecoveryTargetTLI;
 587
 588         /*
 589          * timestamp of when we started replaying the current chunk of WAL data,
 590          * only relevant for replication or archive recovery
 591          */
 592         TimestampTz currentChunkStartTime;
 593         /* Are we requested to pause recovery? */
 594         bool            recoveryPause;
 595
 596         /*
 597          * lastFpwDisableRecPtr points to the start of the last replayed
 598          * XLOG_FPW_CHANGE record that indicates full_page_writes was disabled.
 599          */
 600         XLogRecPtr      lastFpwDisableRecPtr;
 601
 602         slock_t         info_lck;               /* locks shared variables shown above */
 603 } XLogCtlData;
 604
 605 static XLogCtlData *XLogCtl = NULL;
 606
 607 /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
 608 static WALInsertLockPadded *WALInsertLocks = NULL;
 609
 610 /*
 611  * We maintain an image of pg_control in shared memory.
 612  */
 613 static ControlFileData *ControlFile = NULL;
 614
 615 /*
 616  * Calculate the amount of space left on the page after 'endptr'; this is 0
 617  * when 'endptr' is exactly at a page boundary. Beware multiple evaluation!
 618  */
 619 #define INSERT_FREESPACE(endptr)        \
 620         (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
 621
 622 /* Macro to advance to next buffer index, wrapping to 0 after XLogCacheBlck. */
 623 #define NextBufIdx(idx)         \
 624                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 625
 626 /*
 627  * XLogRecPtrToBufIdx returns the index, in 0 .. XLogCacheBlck, of the WAL
 628  * buffer that holds, or would hold if cached, the page containing 'recptr'.
 629  */
 630 #define XLogRecPtrToBufIdx(recptr)      \
 631         (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
 632
 633 /*
 634  * Usable WAL data bytes per page and per segment, net of page headers.
 635  */
 636 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
 637 #define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
 638
 639 /*
 640  * Private, possibly out-of-date copy of shared LogwrtResult.
 641  * See discussion above.
 642  */
 643 static XLogwrtResult LogwrtResult = {0, 0};
 644
 645 /*
 646  * Codes indicating where we got a WAL file from during recovery, or where
 647  * to attempt to get one.
 648  */
 649 typedef enum
 650 {
 651         XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
 652         XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
 653         XLOG_FROM_PG_XLOG,                      /* existing file in pg_xlog */
 654         XLOG_FROM_STREAM,                       /* streamed from master */
 655 } XLogSource;
 656
 657 /* human-readable names for XLogSources, in enum order, for debugging output */
 658 static const char *xlogSourceNames[] = {"any", "archive", "pg_xlog", "stream"};
 659
 660 /*
 661  * openLogFile is -1 or a kernel FD for an open log file segment.
 662  * When it's open, openLogOff is the current seek offset in the file.
 663  * openLogSegNo identifies the segment.  These variables are only
 664  * used to write the XLOG, and so will normally refer to the active segment.
 665  */
 666 static int      openLogFile = -1;
 667 static XLogSegNo openLogSegNo = 0;
 668 static uint32 openLogOff = 0;
 669
 670 /*
 671  * These variables are used similarly to the ones above, but for reading
 672  * the XLOG.  Note, however, that readOff generally represents the offset
 673  * of the page just read, not the seek position of the FD itself, which
 674  * will be just past that page. readLen indicates how much of the current
 675  * page has been read into readBuf, and readSource indicates where we got
 676  * the currently open file from.
 677  */
 678 static int      readFile = -1;
 679 static XLogSegNo readSegNo = 0;
 680 static uint32 readOff = 0;
 681 static uint32 readLen = 0;
 682 static XLogSource readSource = 0;               /* XLOG_FROM_* code; 0 is XLOG_FROM_ANY */
 683
 684 /*
 685  * Keeps track of which source we're currently reading from. This is
 686  * different from readSource in that this is always set, even when we don't
 687  * currently have a WAL file open. If lastSourceFailed is set, our last
 688  * attempt to read from currentSource failed, and we should try another source
 689  * next.
 690  */
 691 static XLogSource currentSource = 0;    /* XLOG_FROM_* code; 0 is XLOG_FROM_ANY */
 692 static bool lastSourceFailed = false;
 693
 694 typedef struct XLogPageReadPrivate
 695 {
 696         int                     emode;                  /* NOTE(review): presumably the error level for read failures -- confirm */
 697         bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
 698         bool            randAccess;             /* NOTE(review): presumably set for non-sequential reads -- confirm */
 699 } XLogPageReadPrivate;
 700
 701 /*
 702  * These variables track when we last obtained some WAL data to process,
 703  * and where we got it from.  (XLogReceiptSource is initially the same as
 704  * readSource, but readSource gets reset to zero when we don't have data
 705  * to process right now.  It is also different from currentSource, which
 706  * also changes when we try to read from a source and fail, while
 707  * XLogReceiptSource tracks where we last successfully read some WAL.)
 708  */
 709 static TimestampTz XLogReceiptTime = 0;
 710 static XLogSource XLogReceiptSource = 0;                /* XLOG_FROM_* code; 0 is XLOG_FROM_ANY */
 711
 712 /* State information for XLOG reading */
 713 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 714 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 715
 716 static XLogRecPtr minRecoveryPoint;             /* local copy of
 717                                                                                  * ControlFile->minRecoveryPoint */
 718 static TimeLineID minRecoveryPointTLI;
 719 static bool updateMinRecoveryPoint = true;
 720
 721 /*
 722  * Have we reached a consistent database state? In crash recovery, we have
 723  * to replay all the WAL, so reachedConsistency is never set. During archive
 724  * recovery, the database is consistent once minRecoveryPoint is reached.
 725  */
 726 bool            reachedConsistency = false;
 727
 728 static bool InRedo = false;
 729
 730 /* Have we launched bgwriter during recovery? */
 731 static bool bgwriterLaunched = false;
 732
 733 /* For WALInsertLockAcquire/Release functions */
 734 static int      MyLockNo = 0;
 735 static bool holdingAllLocks = false;
 736
/*
 * Prototypes for functions private to this file (every one of them is
 * declared static).
 */
static void readRecoveryCommandFile(void);
static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
static bool recoveryStopsBefore(XLogRecord *record);
static bool recoveryStopsAfter(XLogRecord *record);
static void recoveryPausesHere(void);
static bool recoveryApplyDelay(XLogRecord *record);
static void SetLatestXTime(TimestampTz xtime);
static void SetCurrentChunkStartTime(TimestampTz xtime);
static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
                                        TimeLineID prevTLI);
static void LocalSetXLogInsertAllowed(void);
static void CreateEndOfRecoveryRecord(void);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);

static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
                                XLogRecPtr *lsn, BkpBlock *bkpb);
static Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb,
                                                 char *blk, bool get_cleanup_lock, bool keep_buffer);
static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
static bool XLogCheckpointNeeded(XLogSegNo new_segno);
static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
                                           bool find_free, int *max_advance,
                                           bool use_lock);
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
                         int source, bool notexistOk);
static int      XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
                         int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
                         TimeLineID *readTLI);
static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
                                                        bool fetching_ckpt, XLogRecPtr tliRecPtr);
static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
static void XLogFileClose(void);
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
static void UpdateLastRemovedPtr(char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
                   int emode, bool fetching_ckpt);
static void CheckRecoveryConsistency(void);
static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
                                         XLogRecPtr RecPtr, int whichChkpti, bool report);
static bool rescanLatestTimeLine(void);
static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(pg_time_t tnow);
static bool CheckForStandbyTrigger(void);

/* xlog_outrec() is only declared in builds that define WAL_DEBUG */
#ifdef WAL_DEBUG
static void xlog_outrec(StringInfo buf, XLogRecord *record);
#endif
static void pg_start_backup_callback(int code, Datum arg);
static bool read_backup_label(XLogRecPtr *checkPointLoc,
                                  bool *backupEndRequired, bool *backupFromStandby);
static void rm_redo_error_callback(void *arg);
static int      get_sync_bit(int method);

/* Helpers used by XLogInsert() to reserve WAL space and copy a record in */
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
                                  XLogRecData *rdata,
                                  XLogRecPtr StartPos, XLogRecPtr EndPos);
static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
                                                  XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
                                  XLogRecPtr *PrevPtr);
static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
static char *GetXLogBuffer(XLogRecPtr ptr);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);

/* WAL insertion lock primitives */
static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
 818
/*
 * Insert an XLOG record having the specified RMID and info bytes,
 * with the body of the record being the data chunk(s) described by
 * the rdata chain (see xlog.h for notes about rdata).
 *
 * Parameters:
 *      rmid  - resource manager ID, stored in the record header (xl_rmid).
 *      info  - info byte for the record header.  None of the XLR_INFO_MASK
 *              bits may be set by the caller (that is a PANIC); the
 *              XLR_BKP_BLOCK(n) flags are OR'ed in here.
 *      rdata - head of the chain of data chunks.  Must be non-NULL.  Chain
 *              entries that reference a buffer may have their data/len
 *              cleared when the whole page is logged as a backup block
 *              instead.
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 *
 * Special cases:
 *      - In bootstrap mode, for any rmid other than RM_XLOG_ID, nothing is
 *        logged and SizeOfXLogLongPHD is returned as a phony pointer.
 *      - For an XLOG_SWITCH record the rest of the segment is reserved and
 *        flushed, but the returned pointer is just past the switch record
 *        itself.  If ReserveXLogSwitch() reports that nothing had to be
 *        inserted, the position it handed back is returned unchanged.
 *
 * Side effects: except for the bootstrap shortcut, ProcLastRecPtr and
 * XactLastRecEnd are set to the start and end of the record, and the
 * backend-local RedoRecPtr copy is refreshed from shared memory when stale.
 *
 * NB: this routine feels free to scribble on the XLogRecData structs,
 * though not on the data they reference.  This is OK since the XLogRecData
 * structs are always just temporaries in the calling code.
 */
XLogRecPtr
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
{
        XLogCtlInsert *Insert = &XLogCtl->Insert;
        XLogRecData *rdt;
        XLogRecData *rdt_lastnormal;
        /*
         * Per-slot bookkeeping for the distinct buffers referenced by the rdata
         * chain: the buffer itself, whether it gets a full-page backup, the
         * backup block header, and the LSN handed back by XLogCheckBuffer()
         * (used for the RedoRecPtr recheck further down).
         */
        Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
        bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
        BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
        XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
        /*
         * Extra chain entries appended for each backed-up block: rdt1 carries
         * the BkpBlock header, rdt2 the page (or the part before the hole), rdt3
         * the part after the hole.
         */
        XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
        XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
        XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
        XLogRecData hdr_rdt;
        pg_crc32        rdata_crc;
        uint32          len,
                                write_len;
        unsigned        i;
        bool            doPageWrites;
        bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
        bool            inserted;
        uint8           info_orig = info;       /* restored if we must start over */
        static XLogRecord *rechdr;
        XLogRecPtr      StartPos;
        XLogRecPtr      EndPos;

        /*
         * The record header buffer is static: it is malloc'ed and zeroed on the
         * first call only and then reused by every later call.
         */
        if (rechdr == NULL)
        {
                rechdr = malloc(SizeOfXLogRecord);
                if (rechdr == NULL)
                        elog(ERROR, "out of memory");
                MemSet(rechdr, 0, SizeOfXLogRecord);
        }

        /* cross-check on whether we should be here or not */
        if (!XLogInsertAllowed())
                elog(ERROR, "cannot make new WAL entries during recovery");

        /* info's high bits are reserved for use by me */
        if (info & XLR_INFO_MASK)
                elog(PANIC, "invalid xlog info mask %02X", info);

        TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);

        /*
         * In bootstrap mode, we don't actually log anything but XLOG resources;
         * return a phony record pointer.
         */
        if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
        {
                EndPos = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
                return EndPos;
        }

        /*
         * Here we scan the rdata chain, to determine which buffers must be backed
         * up.
         *
         * We may have to loop back to here if a race condition is detected below.
         * We could prevent the race by doing all this work while holding an
         * insertion lock, but it seems better to avoid doing CRC calculations
         * while holding one.
         *
         * We add entries for backup blocks to the chain, so that they don't need
         * any special treatment in the critical section where the chunks are
         * copied into the WAL buffers. Those entries have to be unlinked from the
         * chain if we have to loop back here.
         */
begin:;
        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
        {
                dtbuf[i] = InvalidBuffer;
                dtbuf_bkp[i] = false;
        }

        /*
         * Decide if we need to do full-page writes in this XLOG record: true if
         * full_page_writes is on or we have a PITR request for it.  Since we
         * don't yet have an insertion lock, fullPageWrites and forcePageWrites
         * could change under us, but we'll recheck them once we have a lock.
         */
        doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;

        /*
         * Walk the caller's chain, summing the length of the data that will be
         * logged as-is into len.  The loop exits with rdt pointing at the last
         * chain entry (not NULL); the code below relies on that.
         */
        len = 0;
        for (rdt = rdata;;)
        {
                if (rdt->buffer == InvalidBuffer)
                {
                        /* Simple data, just include it */
                        len += rdt->len;
                }
                else
                {
                        /* Find info for buffer */
                        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
                        {
                                if (rdt->buffer == dtbuf[i])
                                {
                                        /* Buffer already referenced by earlier chain item */
                                        if (dtbuf_bkp[i])
                                        {
                                                /* whole page is logged, so drop this chunk */
                                                rdt->data = NULL;
                                                rdt->len = 0;
                                        }
                                        else if (rdt->data)
                                                len += rdt->len;
                                        break;
                                }
                                if (dtbuf[i] == InvalidBuffer)
                                {
                                        /* OK, put it in this slot */
                                        dtbuf[i] = rdt->buffer;
                                        /*
                                         * XLogCheckBuffer() is only consulted when page
                                         * writes are wanted; it receives dtbuf_lsn[i] and
                                         * dtbuf_xlg[i] as output locations.
                                         */
                                        if (doPageWrites && XLogCheckBuffer(rdt, true,
                                                                                   &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
                                        {
                                                dtbuf_bkp[i] = true;
                                                rdt->data = NULL;
                                                rdt->len = 0;
                                        }
                                        else if (rdt->data)
                                                len += rdt->len;
                                        break;
                                }
                        }
                        /* neither matched nor found a free slot: too many buffers */
                        if (i >= XLR_MAX_BKP_BLOCKS)
                                elog(PANIC, "can backup at most %d blocks per xlog record",
                                         XLR_MAX_BKP_BLOCKS);
                }
                /* Break out of loop when rdt points to last chain item */
                if (rdt->next == NULL)
                        break;
                rdt = rdt->next;
        }

        /*
         * NOTE: We disallow len == 0 because it provides a useful bit of extra
         * error checking in ReadRecord.  This means that all callers of
         * XLogInsert must supply at least some not-in-a-buffer data.  However, we
         * make an exception for XLOG SWITCH records because we don't want them to
         * ever cross a segment boundary.
         */
        if (len == 0 && !isLogSwitch)
                elog(PANIC, "invalid xlog record length %u", len);

        /*
         * Make additional rdata chain entries for the backup blocks, so that we
         * don't need to special-case them in the write loop.  This modifies the
         * original rdata chain, but we keep a pointer to the last regular entry,
         * rdt_lastnormal, so that we can undo this if we have to loop back to the
         * beginning.
         *
         * At the exit of this loop, write_len includes the backup block data.
         *
         * Also set the appropriate info bits to show which buffers were backed
         * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer
         * value (ignoring InvalidBuffer) appearing in the rdata chain.
         */
        rdt_lastnormal = rdt;
        write_len = len;
        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
        {
                BkpBlock   *bkpb;
                char       *page;

                if (!dtbuf_bkp[i])
                        continue;

                info |= XLR_BKP_BLOCK(i);

                bkpb = &(dtbuf_xlg[i]);
                page = (char *) BufferGetBlock(dtbuf[i]);

                /* first extra entry: the backup block header */
                rdt->next = &(dtbuf_rdt1[i]);
                rdt = rdt->next;

                rdt->data = (char *) bkpb;
                rdt->len = sizeof(BkpBlock);
                write_len += sizeof(BkpBlock);

                /* second extra entry: the page, or the part before its hole */
                rdt->next = &(dtbuf_rdt2[i]);
                rdt = rdt->next;

                if (bkpb->hole_length == 0)
                {
                        rdt->data = page;
                        rdt->len = BLCKSZ;
                        write_len += BLCKSZ;
                        rdt->next = NULL;
                }
                else
                {
                        /* must skip the hole */
                        rdt->data = page;
                        rdt->len = bkpb->hole_offset;
                        write_len += bkpb->hole_offset;

                        /* third extra entry: the part of the page after the hole */
                        rdt->next = &(dtbuf_rdt3[i]);
                        rdt = rdt->next;

                        rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
                        rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
                        write_len += rdt->len;
                        rdt->next = NULL;
                }
        }

        /*
         * Calculate CRC of the data, including all the backup blocks
         *
         * Note that the record header isn't added into the CRC initially since we
         * don't know the prev-link yet.  Thus, the CRC will represent the CRC of
         * the whole record in the order: rdata, then backup blocks, then record
         * header.
         */
        INIT_CRC32(rdata_crc);
        for (rdt = rdata; rdt != NULL; rdt = rdt->next)
                COMP_CRC32(rdata_crc, rdt->data, rdt->len);

        /*
         * Construct record header (prev-link is filled in later, after reserving
         * the space for the record), and make that the first chunk in the chain.
         *
         * The CRC calculated for the header here doesn't include prev-link,
         * because we don't know it yet. It will be added later.
         */
        rechdr->xl_xid = GetCurrentTransactionIdIfAny();
        rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
        rechdr->xl_len = len;           /* doesn't include backup blocks */
        rechdr->xl_info = info;
        rechdr->xl_rmid = rmid;
        rechdr->xl_prev = InvalidXLogRecPtr;
        /* CRC covers only the header bytes located before xl_prev at this point */
        COMP_CRC32(rdata_crc, ((char *) rechdr), offsetof(XLogRecord, xl_prev));

        /* hdr_rdt becomes the new head of the chain; write_len now counts it too */
        hdr_rdt.next = rdata;
        hdr_rdt.data = (char *) rechdr;
        hdr_rdt.len = SizeOfXLogRecord;
        write_len += SizeOfXLogRecord;

        /*----------
         *
         * We have now done all the preparatory work we can without holding a
         * lock or modifying shared state. From here on, inserting the new WAL
         * record to the shared WAL buffer cache is a two-step process:
         *
         * 1. Reserve the right amount of space from the WAL. The current head of
         *    reserved space is kept in Insert->CurrBytePos, and is protected by
         *    insertpos_lck.
         *
         * 2. Copy the record to the reserved WAL space. This involves finding the
         *    correct WAL buffer containing the reserved space, and copying the
         *    record in place. This can be done concurrently in multiple processes.
         *
         * To keep track of which insertions are still in-progress, each concurrent
         * inserter acquires an insertion lock. In addition to just indicating that
         * an insertion is in progress, the lock tells others how far the inserter
         * has progressed. There is a small fixed number of insertion locks,
         * determined by the num_xloginsert_locks GUC. When an inserter crosses a
         * page boundary, it updates the value stored in the lock to how far it
         * has inserted, to allow the previous buffer to be flushed.
         *
         * Holding onto an insertion lock also protects RedoRecPtr and
         * fullPageWrites from changing until the insertion is finished.
         *
         * Step 2 can usually be done completely in parallel. If the required WAL
         * page is not initialized yet, you have to grab WALBufMappingLock to
         * initialize it, but the WAL writer tries to do that ahead of insertions
         * to avoid that from happening in the critical path.
         *
         *----------
         */
        START_CRIT_SECTION();
        /* a log switch takes the exclusive variant of the insertion lock */
        if (isLogSwitch)
                WALInsertLockAcquireExclusive();
        else
                WALInsertLockAcquire();

        /*
         * Check to see if my RedoRecPtr is out of date.  If so, may have to go
         * back and recompute everything.  This can only happen just after a
         * checkpoint, so it's better to be slow in this case and fast otherwise.
         *
         * If we aren't doing full-page writes then RedoRecPtr doesn't actually
         * affect the contents of the XLOG record, so we'll update our local copy
         * but not force a recomputation.
         */
        if (RedoRecPtr != Insert->RedoRecPtr)
        {
                Assert(RedoRecPtr < Insert->RedoRecPtr);
                RedoRecPtr = Insert->RedoRecPtr;

                if (doPageWrites)
                {
                        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
                        {
                                if (dtbuf[i] == InvalidBuffer)
                                        continue;
                                if (dtbuf_bkp[i] == false &&
                                        dtbuf_lsn[i] <= RedoRecPtr)
                                {
                                        /*
                                         * Oops, this buffer now needs to be backed up, but we
                                         * didn't think so above.  Start over.
                                         *
                                         * Before retrying, drop the lock, leave the critical
                                         * section, cut the backup-block entries off the
                                         * caller's chain and restore the original info bits.
                                         */
                                        WALInsertLockRelease();
                                        END_CRIT_SECTION();
                                        rdt_lastnormal->next = NULL;
                                        info = info_orig;
                                        goto begin;
                                }
                        }
                }
        }

        /*
         * Also check to see if fullPageWrites or forcePageWrites was just turned
         * on; if we weren't already doing full-page writes then go back and
         * recompute. (If it was just turned off, we could recompute the record
         * without full pages, but we choose not to bother.)
         */
        if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
        {
                /* Oops, must redo it with full-page data. */
                WALInsertLockRelease();
                END_CRIT_SECTION();
                rdt_lastnormal->next = NULL;
                info = info_orig;
                goto begin;
        }

        /*
         * Reserve space for the record in the WAL. This also sets the xl_prev
         * pointer.
         */
        if (isLogSwitch)
                inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
        else
        {
                ReserveXLogInsertLocation(write_len, &StartPos, &EndPos,
                                                                  &rechdr->xl_prev);
                inserted = true;
        }

        if (inserted)
        {
                /*
                 * Now that xl_prev has been filled in, finish CRC calculation of the
                 * record header.
                 */
                COMP_CRC32(rdata_crc, ((char *) &rechdr->xl_prev), sizeof(XLogRecPtr));
                FIN_CRC32(rdata_crc);
                rechdr->xl_crc = rdata_crc;

                /*
                 * All the record data, including the header, is now ready to be
                 * inserted. Copy the record in the space reserved.
                 */
                CopyXLogRecordToWAL(write_len, isLogSwitch, &hdr_rdt, StartPos, EndPos);
        }
        else
        {
                /*
                 * This was an xlog-switch record, but the current insert location was
                 * already exactly at the beginning of a segment, so there was no need
                 * to do anything.
                 */
        }

        /*
         * Done! Let others know that we're finished.
         */
        WALInsertLockRelease();

        MarkCurrentTransactionIdLoggedIfAny();

        END_CRIT_SECTION();

        /*
         * Update shared LogwrtRqst.Write, if we crossed page boundary.
         */
        if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
        {
                /* use volatile pointer to prevent code rearrangement */
                volatile XLogCtlData *xlogctl = XLogCtl;

                SpinLockAcquire(&xlogctl->info_lck);
                /* advance global request to include new block(s) */
                if (xlogctl->LogwrtRqst.Write < EndPos)
                        xlogctl->LogwrtRqst.Write = EndPos;
                /* update local result copy while I have the chance */
                LogwrtResult = xlogctl->LogwrtResult;
                SpinLockRelease(&xlogctl->info_lck);
        }

        /*
         * If this was an XLOG_SWITCH record, flush the record and the empty
         * padding space that fills the rest of the segment, and perform
         * end-of-segment actions (eg, notifying archiver).
         */
        if (isLogSwitch)
        {
                TRACE_POSTGRESQL_XLOG_SWITCH();
                XLogFlush(EndPos);
                /*
                 * Even though we reserved the rest of the segment for us, which is
                 * reflected in EndPos, we return a pointer to just the end of the
                 * xlog-switch record.
                 */
                if (inserted)
                {
                        EndPos = StartPos + SizeOfXLogRecord;
                        /*
                         * If the record header spilled onto the next page, step over
                         * that page's header too: the long form when the page is the
                         * first one of a segment, the short form otherwise.
                         */
                        if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
                        {
                                if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
                                        EndPos += SizeOfXLogLongPHD;
                                else
                                        EndPos += SizeOfXLogShortPHD;
                        }
                }
        }

#ifdef WAL_DEBUG
        if (XLOG_DEBUG)
        {
                StringInfoData buf;

                initStringInfo(&buf);
                appendStringInfo(&buf, "INSERT @ %X/%X: ",
                                                 (uint32) (EndPos >> 32), (uint32) EndPos);
                xlog_outrec(&buf, rechdr);
                if (rdata->data != NULL)
                {
                        StringInfoData recordbuf;

                        /*
                         * We have to piece together the WAL record data from the
                         * XLogRecData entries, so that we can pass it to the rm_desc
                         * function as one contiguous chunk. (but we can leave out any
                         * extra entries we created for backup blocks)
                         */
                        rdt_lastnormal->next = NULL;

                        /* note: this walk advances the rdata parameter itself */
                        initStringInfo(&recordbuf);
                        for (;rdata != NULL; rdata = rdata->next)
                                appendBinaryStringInfo(&recordbuf, rdata->data, rdata->len);

                        appendStringInfoString(&buf, " - ");
                        RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, recordbuf.data);
                        pfree(recordbuf.data);
                }
                elog(LOG, "%s", buf.data);
                pfree(buf.data);
        }
#endif

        /*
         * Update our global variables
         */
        ProcLastRecPtr = StartPos;
        XactLastRecEnd = EndPos;

        return EndPos;
}
1296
1297 /*
1298  * Reserves the right amount of space for a record of given size from the WAL.
1299  * *StartPos is set to the beginning of the reserved section, *EndPos to
1300  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1301  * used to set the xl_prev of this record.
1302  *
1303  * This is the performance critical part of XLogInsert that must be serialized
1304  * across backends. The rest can happen mostly in parallel. Try to keep this
1305  * section as short as possible, insertpos_lck can be heavily contended on a
1306  * busy system.
1307  *
1308  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1309  * where we actually copy the record to the reserved space.
1310  */
1311 static void
1312 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1313                                                   XLogRecPtr *PrevPtr)
1314 {
1315         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1316         uint64          startbytepos;
1317         uint64          endbytepos;
1318         uint64          prevbytepos;
1319
1320         size = MAXALIGN(size);
1321
1322         /* All (non xlog-switch) records should contain data. */
1323         Assert(size > SizeOfXLogRecord);
1324
1325         /*
1326          * The duration the spinlock needs to be held is minimized by minimizing
1327          * the calculations that have to be done while holding the lock. The
1328          * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1329          * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1330          * page headers. The mapping between "usable" byte positions and physical
1331          * positions (XLogRecPtrs) can be done outside the locked region, and
1332          * because the usable byte position doesn't include any headers, reserving
1333          * X bytes from WAL is almost as simple as "CurrBytePos += X".
1334          */
1335         SpinLockAcquire(&Insert->insertpos_lck);
1336
1337         startbytepos = Insert->CurrBytePos;
1338         endbytepos = startbytepos + size;
1339         prevbytepos = Insert->PrevBytePos;
1340         Insert->CurrBytePos = endbytepos;
1341         Insert->PrevBytePos = startbytepos;
1342
1343         SpinLockRelease(&Insert->insertpos_lck);
1344
1345         *StartPos = XLogBytePosToRecPtr(startbytepos);
1346         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1347         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1348
1349         /*
1350          * Check that the conversions between "usable byte positions" and
1351          * XLogRecPtrs work consistently in both directions.
1352          */
1353         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1354         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1355         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1356 }
1357
1358 /*
1359  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1360  *
1361  * A log-switch record is handled slightly differently. The rest of the
1362  * segment will be reserved for this insertion, as indicated by the returned
1363  * *EndPos value. However, if we are already at the beginning of the current
1364  * segment, *StartPos and *EndPos are set to the current location without
1365  * reserving any space, and the function returns false.
1366 */
1367 static bool
1368 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1369 {
1370         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1371         uint64          startbytepos;
1372         uint64          endbytepos;
1373         uint64          prevbytepos;
1374         uint32          size = SizeOfXLogRecord;
1375         XLogRecPtr      ptr;
1376         uint32          segleft;
1377
1378         /*
1379          * These calculations are a bit heavy-weight to be done while holding a
1380          * spinlock, but since we're holding all the WAL insertion locks, there
1381          * are no other inserters competing for it. GetXLogInsertRecPtr() does
1382          * compete for it, but that's not called very frequently.
1383          */
1384         SpinLockAcquire(&Insert->insertpos_lck);
1385
1386         startbytepos = Insert->CurrBytePos;
1387
1388         ptr = XLogBytePosToEndRecPtr(startbytepos);
1389         if (ptr % XLOG_SEG_SIZE == 0)
1390         {
1391                 SpinLockRelease(&Insert->insertpos_lck);
1392                 *EndPos = *StartPos = ptr;
1393                 return false;
1394         }
1395
1396         endbytepos = startbytepos + size;
1397         prevbytepos = Insert->PrevBytePos;
1398
1399         *StartPos = XLogBytePosToRecPtr(startbytepos);
1400         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1401
1402         segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
1403         if (segleft != XLOG_SEG_SIZE)
1404         {
1405                 /* consume the rest of the segment */
1406                 *EndPos += segleft;
1407                 endbytepos = XLogRecPtrToBytePos(*EndPos);
1408         }
1409         Insert->CurrBytePos = endbytepos;
1410         Insert->PrevBytePos = startbytepos;
1411
1412         SpinLockRelease(&Insert->insertpos_lck);
1413
1414         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1415
1416         Assert((*EndPos) % XLOG_SEG_SIZE == 0);
1417         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1418         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1419         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1420
1421         return true;
1422 }
1423
1424 /*
1425  * Subroutine of XLogInsert.  Copies a WAL record to an already-reserved
1426  * area in the WAL.
1427  */
1428 static void
1429 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1430                                         XLogRecPtr StartPos, XLogRecPtr EndPos)
1431 {
1432         char       *currpos;
1433         int                     freespace;
1434         int                     written;
1435         XLogRecPtr      CurrPos;
1436         XLogPageHeader pagehdr;
1437
1438         /* The first chunk is the record header */
1439         Assert(rdata->len == SizeOfXLogRecord);
1440
1441         /*
1442          * Get a pointer to the right place in the right WAL buffer to start
1443          * inserting to.
1444          */
1445         CurrPos = StartPos;
1446         currpos = GetXLogBuffer(CurrPos);
1447         freespace = INSERT_FREESPACE(CurrPos);
1448
1449         /*
1450          * there should be enough space for at least the first field (xl_tot_len)
1451          * on this page.
1452          */
1453         Assert(freespace >= sizeof(uint32));
1454
1455         /* Copy record data */
1456         written = 0;
1457         while (rdata != NULL)
1458         {
1459                 char       *rdata_data = rdata->data;
1460                 int                     rdata_len = rdata->len;
1461
1462                 while (rdata_len > freespace)
1463                 {
1464                         /*
1465                          * Write what fits on this page, and continue on the next page.
1466                          */
1467                         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1468                         memcpy(currpos, rdata_data, freespace);
1469                         rdata_data += freespace;
1470                         rdata_len -= freespace;
1471                         written += freespace;
1472                         CurrPos += freespace;
1473
1474                         /*
1475                          * Get pointer to beginning of next page, and set the xlp_rem_len
1476                          * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1477                          *
1478                          * It's safe to set the contrecord flag and xlp_rem_len without a
1479                          * lock on the page. All the other flags were already set when the
1480                          * page was initialized, in AdvanceXLInsertBuffer, and we're the
1481                          * only backend that needs to set the contrecord flag.
1482                          */
1483                         currpos = GetXLogBuffer(CurrPos);
1484                         pagehdr = (XLogPageHeader) currpos;
1485                         pagehdr->xlp_rem_len = write_len - written;
1486                         pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1487
1488                         /* skip over the page header */
1489                         if (CurrPos % XLogSegSize == 0)
1490                         {
1491                                 CurrPos += SizeOfXLogLongPHD;
1492                                 currpos += SizeOfXLogLongPHD;
1493                         }
1494                         else
1495                         {
1496                                 CurrPos += SizeOfXLogShortPHD;
1497                                 currpos += SizeOfXLogShortPHD;
1498                         }
1499                         freespace = INSERT_FREESPACE(CurrPos);
1500                 }
1501
1502                 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1503                 memcpy(currpos, rdata_data, rdata_len);
1504                 currpos += rdata_len;
1505                 CurrPos += rdata_len;
1506                 freespace -= rdata_len;
1507                 written += rdata_len;
1508
1509                 rdata = rdata->next;
1510         }
1511         Assert(written == write_len);
1512
1513         /* Align the end position, so that the next record starts aligned */
1514         CurrPos = MAXALIGN64(CurrPos);
1515
1516         /*
1517          * If this was an xlog-switch, it's not enough to write the switch record,
1518          * we also have to consume all the remaining space in the WAL segment.
1519          * We have already reserved it for us, but we still need to make sure it's
1520          * allocated and zeroed in the WAL buffers so that when the caller (or
1521          * someone else) does XLogWrite(), it can really write out all the zeros.
1522          */
1523         if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
1524         {
1525                 /* An xlog-switch record doesn't contain any data besides the header */
1526                 Assert(write_len == SizeOfXLogRecord);
1527
1528                 /*
1529                  * We do this one page at a time, to make sure we don't deadlock
1530                  * against ourselves if wal_buffers < XLOG_SEG_SIZE.
1531                  */
1532                 Assert(EndPos % XLogSegSize == 0);
1533
1534                 /* Use up all the remaining space on the first page */
1535                 CurrPos += freespace;
1536
1537                 while (CurrPos < EndPos)
1538                 {
1539                         /* initialize the next page (if not initialized already) */
1540                         WALInsertLockUpdateInsertingAt(CurrPos);
1541                         AdvanceXLInsertBuffer(CurrPos, false);
1542                         CurrPos += XLOG_BLCKSZ;
1543                 }
1544         }
1545
1546         if (CurrPos != EndPos)
1547                 elog(PANIC, "space reserved for WAL record does not match what was written");
1548 }
1549
1550 /*
1551  * Acquire a WAL insertion lock, for inserting to WAL.
1552  */
1553 static void
1554 WALInsertLockAcquire(void)
1555 {
1556         bool            immed;
1557
1558         /*
1559          * It doesn't matter which of the WAL insertion locks we acquire, so try
1560          * the one we used last time.  If the system isn't particularly busy,
1561          * it's a good bet that it's still available, and it's good to have some
1562          * affinity to a particular lock so that you don't unnecessarily bounce
1563          * cache lines between processes when there's no contention.
1564          *
1565          * If this is the first time through in this backend, pick a lock
1566          * (semi-)randomly.  This allows the locks to be used evenly if you have
1567          * a lot of very short connections.
1568          */
1569         static int      lockToTry = -1;
1570
1571         if (lockToTry == -1)
1572                 lockToTry = MyProc->pgprocno % num_xloginsert_locks;
1573         MyLockNo = lockToTry;
1574
1575         /*
1576          * The insertingAt value is initially set to 0, as we don't know our
1577          * insert location yet.
1578          */
1579         immed = LWLockAcquireWithVar(&WALInsertLocks[MyLockNo].l.lock,
1580                                                                  &WALInsertLocks[MyLockNo].l.insertingAt,
1581                                                                  0);
1582         if (!immed)
1583         {
1584                 /*
1585                  * If we couldn't get the lock immediately, try another lock next
1586                  * time.  On a system with more insertion locks than concurrent
1587                  * inserters, this causes all the inserters to eventually migrate
1588                  * to a lock that no-one else is using.  On a system with more
1589                  * inserters than locks, it still helps to distribute the inserters
1590                  * evenly across the locks.
1591                  */
1592                 lockToTry = (lockToTry + 1) % num_xloginsert_locks;
1593         }
1594 }
1595
1596 /*
1597  * Acquire all WAL insertion locks, to prevent other backends from inserting
1598  * to WAL.
1599  */
1600 static void
1601 WALInsertLockAcquireExclusive(void)
1602 {
1603         int                     i;
1604
1605         /*
1606          * When holding all the locks, we only update the last lock's insertingAt
1607          * indicator.  The others are set to 0xFFFFFFFFFFFFFFFF, which is higher
1608          * than any real XLogRecPtr value, to make sure that no-one blocks
1609          * waiting on those.
1610          */
1611         for (i = 0; i < num_xloginsert_locks - 1; i++)
1612         {
1613                 LWLockAcquireWithVar(&WALInsertLocks[i].l.lock,
1614                                                          &WALInsertLocks[i].l.insertingAt,
1615                                                          UINT64CONST(0xFFFFFFFFFFFFFFFF));
1616         }
1617         LWLockAcquireWithVar(&WALInsertLocks[i].l.lock,
1618                                                  &WALInsertLocks[i].l.insertingAt,
1619                                                  0);
1620
1621         holdingAllLocks = true;
1622 }
1623
1624 /*
1625  * Release our insertion lock (or locks, if we're holding them all).
1626  */
1627 static void
1628 WALInsertLockRelease(void)
1629 {
1630         if (holdingAllLocks)
1631         {
1632                 int                     i;
1633
1634                 for (i = 0; i < num_xloginsert_locks; i++)
1635                         LWLockRelease(&WALInsertLocks[i].l.lock);
1636
1637                 holdingAllLocks = false;
1638         }
1639         else
1640         {
1641                 LWLockRelease(&WALInsertLocks[MyLockNo].l.lock);
1642         }
1643 }
1644
1645 /*
1646  * Update our insertingAt value, to let others know that we've finished
1647  * inserting up to that point.
1648  */
1649 static void
1650 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1651 {
1652         if (holdingAllLocks)
1653         {
1654                 /*
1655                  * We use the last lock to mark our actual position, see comments in
1656                  * WALInsertLockAcquireExclusive.
1657                  */
1658                 LWLockUpdateVar(&WALInsertLocks[num_xloginsert_locks - 1].l.lock,
1659                                                 &WALInsertLocks[num_xloginsert_locks - 1].l.insertingAt,
1660                                                 insertingAt);
1661         }
1662         else
1663                 LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1664                                                 &WALInsertLocks[MyLockNo].l.insertingAt,
1665                                                 insertingAt);
1666 }
1667
1668 /*
1669  * Wait for any WAL insertions < upto to finish.
1670  *
1671  * Returns the location of the oldest insertion that is still in-progress.
1672  * Any WAL prior to that point has been fully copied into WAL buffers, and
1673  * can be flushed out to disk. Because this waits for any insertions older
1674  * than 'upto' to finish, the return value is always >= 'upto'.
1675  *
1676  * Note: When you are about to write out WAL, you must call this function
1677  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1678  * need to wait for an insertion to finish (or at least advance to next
1679  * uninitialized page), and the inserter might need to evict an old WAL buffer
1680  * to make room for a new one, which in turn requires WALWriteLock.
1681  */
1682 static XLogRecPtr
1683 WaitXLogInsertionsToFinish(XLogRecPtr upto)
1684 {
1685         uint64          bytepos;
1686         XLogRecPtr      reservedUpto;
1687         XLogRecPtr      finishedUpto;
1688         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1689         int                     i;
1690
1691         if (MyProc == NULL)
1692                 elog(PANIC, "cannot wait without a PGPROC structure");
1693
1694         /* Read the current insert position */
1695         SpinLockAcquire(&Insert->insertpos_lck);
1696         bytepos = Insert->CurrBytePos;
1697         SpinLockRelease(&Insert->insertpos_lck);
1698         reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1699
1700         /*
1701          * No-one should request to flush a piece of WAL that hasn't even been
1702          * reserved yet. However, it can happen if there is a block with a bogus
1703          * LSN on disk, for example. XLogFlush checks for that situation and
1704          * complains, but only after the flush. Here we just assume that to mean
1705          * that all WAL that has been reserved needs to be finished. In this
1706          * corner-case, the return value can be smaller than 'upto' argument.
1707          */
1708         if (upto > reservedUpto)
1709         {
1710                 elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
1711                          (uint32) (upto >> 32), (uint32) upto,
1712                          (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
1713                 upto = reservedUpto;
1714         }
1715
1716         /*
1717          * Loop through all the locks, sleeping on any in-progress insert older
1718          * than 'upto'.
1719          *
1720          * finishedUpto is our return value, indicating the point upto which
1721          * all the WAL insertions have been finished. Initialize it to the head
1722          * of reserved WAL, and as we iterate through the insertion locks, back it
1723          * out for any insertion that's still in progress.
1724          */
1725         finishedUpto = reservedUpto;
1726         for (i = 0; i < num_xloginsert_locks; i++)
1727         {
1728                 XLogRecPtr insertingat = InvalidXLogRecPtr;
1729                 do
1730                 {
1731                         /*
1732                          * See if this insertion is in progress. LWLockWait will wait for
1733                          * the lock to be released, or for the 'value' to be set by a
1734                          * LWLockUpdateVar call.  When a lock is initially acquired, its
1735                          * value is 0 (InvalidXLogRecPtr), which means that we don't know
1736                          * where it's inserting yet.  We will have to wait for it.  If
1737                          * it's a small insertion, the record will most likely fit on the
1738                          * same page and the inserter will release the lock without ever
1739                          * calling LWLockUpdateVar.  But if it has to sleep, it will
1740                          * advertise the insertion point with LWLockUpdateVar before
1741                          * sleeping.
1742                          */
1743                         if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
1744                                                                  &WALInsertLocks[i].l.insertingAt,
1745                                                                  insertingat, &insertingat))
1746                         {
1747                                 /* the lock was free, so no insertion in progress */
1748                                 insertingat = InvalidXLogRecPtr;
1749                                 break;
1750                         }
1751
1752                         /*
1753                          * This insertion is still in progress. Have to wait, unless the
1754                          * inserter has proceeded past 'upto'.
1755                          */
1756                 } while (insertingat < upto);
1757
1758                 if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
1759                         finishedUpto = insertingat;
1760         }
1761         return finishedUpto;
1762 }
1763
1764 /*
1765  * Get a pointer to the right location in the WAL buffer containing the
1766  * given XLogRecPtr.
1767  *
1768  * If the page is not initialized yet, it is initialized. That might require
1769  * evicting an old dirty buffer from the buffer cache, which means I/O.
1770  *
1771  * The caller must ensure that the page containing the requested location
1772  * isn't evicted yet, and won't be evicted. The way to ensure that is to
1773  * hold onto a WAL insertion lock with the insertingAt position set to
1774  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1775  * to evict an old page from the buffer. (This means that once you call
1776  * GetXLogBuffer() with a given 'ptr', you must not access anything before
1777  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1778  * later, because older buffers might be recycled already)
1779  */
1780 static char *
1781 GetXLogBuffer(XLogRecPtr ptr)
1782 {
1783         int                     idx;
1784         XLogRecPtr      endptr;
1785         static uint64 cachedPage = 0;
1786         static char *cachedPos = NULL;
1787         XLogRecPtr      expectedEndPtr;
1788
1789         /*
1790          * Fast path for the common case that we need to access again the same
1791          * page as last time.
1792          */
1793         if (ptr / XLOG_BLCKSZ == cachedPage)
1794         {
1795                 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1796                 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1797                 return cachedPos + ptr % XLOG_BLCKSZ;
1798         }
1799
1800         /*
1801          * The XLog buffer cache is organized so that a page is always loaded
1802          * to a particular buffer.  That way we can easily calculate the buffer
1803          * a given page must be loaded into, from the XLogRecPtr alone.
1804          */
1805         idx = XLogRecPtrToBufIdx(ptr);
1806
1807         /*
1808          * See what page is loaded in the buffer at the moment. It could be the
1809          * page we're looking for, or something older. It can't be anything newer
1810          * - that would imply the page we're looking for has already been written
1811          * out to disk and evicted, and the caller is responsible for making sure
1812          * that doesn't happen.
1813          *
1814          * However, we don't hold a lock while we read the value. If someone has
1815          * just initialized the page, it's possible that we get a "torn read" of
1816          * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
1817          * that case we will see a bogus value. That's ok, we'll grab the mapping
1818          * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
1819          * the page we're looking for. But it means that when we do this unlocked
1820          * read, we might see a value that appears to be ahead of the page we're
1821          * looking for. Don't PANIC on that, until we've verified the value while
1822          * holding the lock.
1823          */
1824         expectedEndPtr = ptr;
1825         expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
1826
1827         endptr = XLogCtl->xlblocks[idx];
1828         if (expectedEndPtr != endptr)
1829         {
1830                 /*
1831                  * Let others know that we're finished inserting the record up
1832                  * to the page boundary.
1833                  */
1834                 WALInsertLockUpdateInsertingAt(expectedEndPtr - XLOG_BLCKSZ);
1835
1836                 AdvanceXLInsertBuffer(ptr, false);
1837                 endptr = XLogCtl->xlblocks[idx];
1838
1839                 if (expectedEndPtr != endptr)
1840                         elog(PANIC, "could not find WAL buffer for %X/%X",
1841                                  (uint32) (ptr >> 32) , (uint32) ptr);
1842         }
1843         else
1844         {
1845                 /*
1846                  * Make sure the initialization of the page is visible to us, and
1847                  * won't arrive later to overwrite the WAL data we write on the page.
1848                  */
1849                 pg_memory_barrier();
1850         }
1851
1852         /*
1853          * Found the buffer holding this page. Return a pointer to the right
1854          * offset within the page.
1855          */
1856         cachedPage = ptr / XLOG_BLCKSZ;
1857         cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1858
1859         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1860         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1861
1862         return cachedPos + ptr % XLOG_BLCKSZ;
1863 }
1864
1865 /*
1866  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1867  * is the position starting from the beginning of WAL, excluding all WAL
1868  * page headers.
1869  */
1870 static XLogRecPtr
1871 XLogBytePosToRecPtr(uint64 bytepos)
1872 {
1873         uint64          fullsegs;
1874         uint64          fullpages;
1875         uint64          bytesleft;
1876         uint32          seg_offset;
1877         XLogRecPtr      result;
1878
1879         fullsegs = bytepos / UsableBytesInSegment;
1880         bytesleft = bytepos % UsableBytesInSegment;
1881
1882         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1883         {
1884                 /* fits on first page of segment */
1885                 seg_offset = bytesleft + SizeOfXLogLongPHD;
1886         }
1887         else
1888         {
1889                 /* account for the first page on segment with long header */
1890                 seg_offset = XLOG_BLCKSZ;
1891                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1892
1893                 fullpages = bytesleft / UsableBytesInPage;
1894                 bytesleft = bytesleft % UsableBytesInPage;
1895
1896                 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1897         }
1898
1899         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
1900
1901         return result;
1902 }
1903
1904 /*
1905  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1906  * returns a pointer to the beginning of the page (ie. before page header),
1907  * not to where the first xlog record on that page would go to. This is used
1908  * when converting a pointer to the end of a record.
1909  */
1910 static XLogRecPtr
1911 XLogBytePosToEndRecPtr(uint64 bytepos)
1912 {
1913         uint64          fullsegs;
1914         uint64          fullpages;
1915         uint64          bytesleft;
1916         uint32          seg_offset;
1917         XLogRecPtr      result;
1918
1919         fullsegs = bytepos / UsableBytesInSegment;
1920         bytesleft = bytepos % UsableBytesInSegment;
1921
1922         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1923         {
1924                 /* fits on first page of segment */
1925                 if (bytesleft == 0)
1926                         seg_offset = 0;
1927                 else
1928                         seg_offset = bytesleft + SizeOfXLogLongPHD;
1929         }
1930         else
1931         {
1932                 /* account for the first page on segment with long header */
1933                 seg_offset = XLOG_BLCKSZ;
1934                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1935
1936                 fullpages = bytesleft / UsableBytesInPage;
1937                 bytesleft = bytesleft % UsableBytesInPage;
1938
1939                 if (bytesleft == 0)
1940                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
1941                 else
1942                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1943         }
1944
1945         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
1946
1947         return result;
1948 }
1949
1950 /*
1951  * Convert an XLogRecPtr to a "usable byte position".
1952  */
1953 static uint64
1954 XLogRecPtrToBytePos(XLogRecPtr ptr)
1955 {
1956         uint64          fullsegs;
1957         uint32          fullpages;
1958         uint32          offset;
1959         uint64          result;
1960
1961         XLByteToSeg(ptr, fullsegs);
1962
1963         fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
1964         offset = ptr % XLOG_BLCKSZ;
1965
1966         if (fullpages == 0)
1967         {
1968                 result = fullsegs * UsableBytesInSegment;
1969                 if (offset > 0)
1970                 {
1971                         Assert(offset >= SizeOfXLogLongPHD);
1972                         result += offset - SizeOfXLogLongPHD;
1973                 }
1974         }
1975         else
1976         {
1977                 result = fullsegs * UsableBytesInSegment +
1978                         (XLOG_BLCKSZ - SizeOfXLogLongPHD) +  /* account for first page */
1979                         (fullpages - 1) * UsableBytesInPage; /* full pages */
1980                 if (offset > 0)
1981                 {
1982                         Assert(offset >= SizeOfXLogShortPHD);
1983                         result += offset - SizeOfXLogShortPHD;
1984                 }
1985         }
1986
1987         return result;
1988 }
1989
1990 /*
1991  * Determine whether the buffer referenced has to be backed up.
1992  *
1993  * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites
1994  * could change later, so the result should be used for optimization purposes
1995  * only.
1996  */
1997 bool
1998 XLogCheckBufferNeedsBackup(Buffer buffer)
1999 {
2000         bool            doPageWrites;
2001         Page            page;
2002
2003         page = BufferGetPage(buffer);
2004
2005         doPageWrites = XLogCtl->Insert.fullPageWrites || XLogCtl->Insert.forcePageWrites;
2006
2007         if (doPageWrites && PageGetLSN(page) <= RedoRecPtr)
2008                 return true;                    /* buffer requires backup */
2009
2010         return false;                           /* buffer does not need to be backed up */
2011 }
2012
2013 /*
2014  * Determine whether the buffer referenced by an XLogRecData item has to
2015  * be backed up, and if so fill a BkpBlock struct for it.  In any case
2016  * save the buffer's LSN at *lsn.
2017  */
2018 static bool
2019 XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
2020                                 XLogRecPtr *lsn, BkpBlock *bkpb)
2021 {
2022         Page            page;
2023
2024         page = BufferGetPage(rdata->buffer);
2025
2026         /*
2027          * We assume page LSN is first data on *every* page that can be passed to
2028          * XLogInsert, whether it has the standard page layout or not. We don't
2029          * need to take the buffer header lock for PageGetLSN if we hold an
2030          * exclusive lock on the page and/or the relation.
2031          */
2032         if (holdsExclusiveLock)
2033                 *lsn = PageGetLSN(page);
2034         else
2035                 *lsn = BufferGetLSNAtomic(rdata->buffer);
2036
2037         if (*lsn <= RedoRecPtr)
2038         {
2039                 /*
2040                  * The page needs to be backed up, so set up *bkpb
2041                  */
2042                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
2043
2044                 if (rdata->buffer_std)
2045                 {
2046                         /* Assume we can omit data between pd_lower and pd_upper */
2047                         uint16          lower = ((PageHeader) page)->pd_lower;
2048                         uint16          upper = ((PageHeader) page)->pd_upper;
2049
2050                         if (lower >= SizeOfPageHeaderData &&
2051                                 upper > lower &&
2052                                 upper <= BLCKSZ)
2053                         {
2054                                 bkpb->hole_offset = lower;
2055                                 bkpb->hole_length = upper - lower;
2056                         }
2057                         else
2058                         {
2059                                 /* No "hole" to compress out */
2060                                 bkpb->hole_offset = 0;
2061                                 bkpb->hole_length = 0;
2062                         }
2063                 }
2064                 else
2065                 {
2066                         /* Not a standard page header, don't try to eliminate "hole" */
2067                         bkpb->hole_offset = 0;
2068                         bkpb->hole_length = 0;
2069                 }
2070
2071                 return true;                    /* buffer requires backup */
2072         }
2073
2074         return false;                           /* buffer does not need to be backed up */
2075 }
2076
2077 /*
2078  * Initialize XLOG buffer pages up to and including the page containing 'upto',
2079  * first writing out old buffer contents that are still unwritten.  If
2080  * 'opportunistic' is true, 'upto' is not used as a stopping point: initialize
2081  * as many pages as we can without having to write out unwritten data.  Any new
2082  * pages are initialized to zeros, with page headers initialized properly.
2083  */
2084 static void
2085 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2086 {
2087         XLogCtlInsert *Insert = &XLogCtl->Insert;
2088         int                     nextidx;
2089         XLogRecPtr      OldPageRqstPtr;
2090         XLogwrtRqst WriteRqst;
2091         XLogRecPtr      NewPageEndPtr = InvalidXLogRecPtr;
2092         XLogRecPtr      NewPageBeginPtr;
2093         XLogPageHeader NewPage;
2094         int                     npages = 0;
2095
2096         LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2097
2098         /*
2099          * Now that we have the lock, check if someone initialized the page
2100          * already.  Each pass through the loop sets up at most one more page.
2101          */
2102         while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2103         {
2104                 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2105
2106                 /*
2107                  * Get ending-offset of the buffer page we need to replace (this may
2108                  * be zero if the buffer hasn't been used yet).  Fall through if it's
2109                  * already written out.
2110                  */
2111                 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2112                 if (LogwrtResult.Write < OldPageRqstPtr)
2113                 {
2114                         /*
2115                          * Nope, got work to do. If we just want to pre-initialize as much
2116                          * as we can without flushing, give up now.
2117                          */
2118                         if (opportunistic)
2119                                 break;
2120
2121                         /* Before waiting, get info_lck, raise the shared write request to cover this page, and update LogwrtResult */
2122                         {
2123                                 /* use volatile pointer to prevent code rearrangement */
2124                                 volatile XLogCtlData *xlogctl = XLogCtl;
2125
2126                                 SpinLockAcquire(&xlogctl->info_lck);
2127                                 if (xlogctl->LogwrtRqst.Write < OldPageRqstPtr)
2128                                         xlogctl->LogwrtRqst.Write = OldPageRqstPtr;
2129                                 LogwrtResult = xlogctl->LogwrtResult;
2130                                 SpinLockRelease(&xlogctl->info_lck);
2131                         }
2132
2133                         /*
2134                          * Now that we have an up-to-date LogwrtResult value, see if we
2135                          * still need to write it or if someone else already did.
2136                          */
2137                         if (LogwrtResult.Write < OldPageRqstPtr)
2138                         {
2139                                 /*
2140                                  * Must acquire write lock. Release WALBufMappingLock first,
2141                                  * to make sure that all insertions that we need to wait for
2142                                  * can finish (up to this same position). Otherwise we risk
2143                                  * deadlock.
2144                                  */
2145                                 LWLockRelease(WALBufMappingLock);
2146
2147                                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2148
2149                                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2150
2151                                 LogwrtResult = XLogCtl->LogwrtResult;
2152                                 if (LogwrtResult.Write >= OldPageRqstPtr)
2153                                 {
2154                                         /* OK, someone wrote it already */
2155                                         LWLockRelease(WALWriteLock);
2156                                 }
2157                                 else
2158                                 {
2159                                         /* Have to write it ourselves */
2160                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2161                                         WriteRqst.Write = OldPageRqstPtr;
2162                                         WriteRqst.Flush = 0;    /* write only; no flush position is requested */
2163                                         XLogWrite(WriteRqst, false);
2164                                         LWLockRelease(WALWriteLock);
2165                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2166                                 }
2167                                 /* Re-acquire WALBufMappingLock and recheck from the top of the loop */
2168                                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2169                                 continue;
2170                         }
2171                 }
2172
2173                 /*
2174                  * Now the next buffer slot is free and we can set it up to be the next
2175                  * output page.
2176                  */
2177                 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2178                 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2179
2180                 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2181
2182                 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2183
2184                 /*
2185                  * Be sure to re-zero the buffer so that bytes beyond what we've
2186                  * written will look like zeroes and not valid XLOG records...
2187                  */
2188                 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2189
2190                 /*
2191                  * Fill the new page's header
2192                  */
2193                 NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
2194
2195                 /* NewPage->xlp_info = 0; */    /* done by memset */
2196                 NewPage   ->xlp_tli = ThisTimeLineID;
2197                 NewPage   ->xlp_pageaddr = NewPageBeginPtr;
2198                 /* NewPage->xlp_rem_len = 0; */         /* done by memset */
2199
2200                 /*
2201                  * If online backup is not in progress, mark the header to indicate
2202                  * that WAL records beginning in this page have removable backup
2203                  * blocks.  This allows the WAL archiver to know whether it is safe to
2204                  * compress archived WAL data by transforming full-block records into
2205                  * the non-full-block format.  It is sufficient to record this at the
2206                  * page level because we force a page switch (in fact a segment switch)
2207                  * when starting a backup, so the flag will be off before any records
2208                  * can be written during the backup.  At the end of a backup, the last
2209                  * page will be marked as all unsafe when perhaps only part is unsafe,
2210                  * but at worst the archiver would miss the opportunity to compress a
2211                  * few records.
2212                  */
2213                 if (!Insert->forcePageWrites)
2214                         NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;
2215
2216                 /*
2217                  * If first page of an XLOG segment file, make it a long header.
2218                  */
2219                 if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
2220                 {
2221                         XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2222
2223                         NewLongPage->xlp_sysid = ControlFile->system_identifier;
2224                         NewLongPage->xlp_seg_size = XLogSegSize;
2225                         NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2226                         NewPage   ->xlp_info |= XLP_LONG_HEADER;
2227                 }
2228
2229                 /*
2230                  * Make sure the initialization of the page becomes visible to others
2231                  * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2232                  * holding a lock.
2233                  */
2234                 pg_write_barrier();
2235
2236                 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2237
2238                 XLogCtl->InitializedUpTo = NewPageEndPtr;
2239
2240                 npages++;       /* only reported by the WAL_DEBUG message below */
2241         }
2242         LWLockRelease(WALBufMappingLock);
2243
2244 #ifdef WAL_DEBUG
2245         if (npages > 0)
2246         {
2247                 elog(DEBUG1, "initialized %d pages, upto %X/%X",
2248                          npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
2249         }
2250 #endif
2251 }
2252
2253 /*
2254  * Decide whether enough xlog has been consumed to warrant a checkpoint.
2255  *
2256  * new_segno is the log file that was just filled up (or just read during
2257  * recovery).  Returns true when new_segno is at least
2258  * CheckPointSegments - 1 segments beyond the segment of RedoRecPtr.
2259  *
2260  * Note: the caller must make sure that RedoRecPtr is up-to-date.
2261  */
2262 static bool
2263 XLogCheckpointNeeded(XLogSegNo new_segno)
2264 {
2265         XLogSegNo       redo_segno;
2266         uint64          trigger_segno;
2267
2268         XLByteToSeg(RedoRecPtr, redo_segno);
2269         trigger_segno = redo_segno + (uint64) (CheckPointSegments - 1);
2270
2271         return (new_segno >= trigger_segno) ? true : false;
2272 }
2273
2274 /*
2275  * Write and/or fsync the log at least as far as WriteRqst indicates:
2276  * WriteRqst.Write is the position to write to, WriteRqst.Flush the position to fsync to.
2277  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
2278  * may stop at any convenient boundary (such as a cache or logfile boundary).
2279  * This option allows us to avoid uselessly issuing multiple writes when a
2280  * single one would do.
2281  *
2282  * Must be called with WALWriteLock held and inside a critical section.
2283  * WaitXLogInsertionsToFinish(WriteRqst) must be called before grabbing the lock,
2284  * to make sure the data is ready to write.  Updates XLogCtl->LogwrtResult on exit.
2285  */
2286 static void
2287 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2288 {
2289         bool            ispartialpage;
2290         bool            last_iteration;
2291         bool            finishing_seg;
2292         bool            use_existent;
2293         int                     curridx;
2294         int                     npages;
2295         int                     startidx;
2296         uint32          startoffset;
2297
2298         /* We should always be inside a critical section here */
2299         Assert(CritSectionCount > 0);
2300
2301         /*
2302          * Update local LogwrtResult (caller probably did this already, but...)
2303          */
2304         LogwrtResult = XLogCtl->LogwrtResult;
2305
2306         /*
2307          * Since successive pages in the xlog cache are consecutively allocated,
2308          * we can usually gather multiple pages together and issue just one
2309          * write() call.  npages is the number of pages we have determined can be
2310          * written together; startidx is the cache block index of the first one,
2311          * and startoffset is the file offset at which it should go. The latter
2312          * two variables are only valid when npages > 0, but we must initialize
2313          * all of them to keep the compiler quiet.
2314          */
2315         npages = 0;
2316         startidx = 0;
2317         startoffset = 0;
2318
2319         /*
2320          * Within the loop, curridx is the cache block index of the page to
2321          * consider writing.  Begin at the buffer containing the next unwritten
2322          * page, or last partially written page.
2323          */
2324         curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2325
2326         while (LogwrtResult.Write < WriteRqst.Write)
2327         {
2328                 /*
2329                  * Make sure we're not ahead of the insert process.  This could happen
2330                  * if we're passed a bogus WriteRqst.Write that is past the end of the
2331                  * last page that's been initialized by AdvanceXLInsertBuffer.
2332                  */
2333                 XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
2334                 if (LogwrtResult.Write >= EndPtr)
2335                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2336                                  (uint32) (LogwrtResult.Write >> 32),
2337                                  (uint32) LogwrtResult.Write,
2338                                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
2339
2340                 /* Advance LogwrtResult.Write to end of current buffer page */
2341                 LogwrtResult.Write = EndPtr;
2342                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2343
2344                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2345                 {
2346                         /*
2347                          * Switch to new logfile segment.  We cannot have any pending
2348                          * pages here (since we dump what we have at segment end).
2349                          */
2350                         Assert(npages == 0);
2351                         if (openLogFile >= 0)
2352                                 XLogFileClose();
2353                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2354
2355                         /* create/use new log file */
2356                         use_existent = true;
2357                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2358                         openLogOff = 0;
2359                 }
2360
2361                 /* Make sure we have the current logfile open */
2362                 if (openLogFile < 0)
2363                 {
2364                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2365                         openLogFile = XLogFileOpen(openLogSegNo);
2366                         openLogOff = 0;
2367                 }
2368
2369                 /* Add current page to the set of pending pages-to-dump */
2370                 if (npages == 0)
2371                 {
2372                         /* first of group: LogwrtResult.Write is this page's end, so its start is one page back */
2373                         startidx = curridx;
2374                         startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
2375                 }
2376                 npages++;
2377
2378                 /*
2379                  * Dump the set if this will be the last loop iteration, or if we are
2380                  * at the last page of the cache area (since the next page won't be
2381                  * contiguous in memory), or if we are at the end of the logfile
2382                  * segment.
2383                  */
2384                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2385
2386                 finishing_seg = !ispartialpage &&
2387                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
2388
2389                 if (last_iteration ||
2390                         curridx == XLogCtl->XLogCacheBlck ||
2391                         finishing_seg)
2392                 {
2393                         char       *from;
2394                         Size            nbytes;
2395                         Size            nleft;
2396                         int                     written;
2397
2398                         /* Need to seek in the file? */
2399                         if (openLogOff != startoffset)
2400                         {
2401                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
2402                                         ereport(PANIC,
2403                                                         (errcode_for_file_access(),
2404                                          errmsg("could not seek in log file %s to offset %u: %m",
2405                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2406                                                         startoffset)));
2407                                 openLogOff = startoffset;
2408                         }
2409
2410                         /* OK to write the page(s); loop because write() may write fewer bytes than asked */
2411                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2412                         nbytes = npages * (Size) XLOG_BLCKSZ;
2413                         nleft = nbytes;
2414                         do
2415                         {
2416                                 errno = 0;
2417                                 written  = write(openLogFile, from, nleft);
2418                                 if (written <= 0)
2419                                 {
2420                                         if (errno == EINTR)
2421                                                 continue;       /* interrupted; nleft is unchanged, so the write is retried */
2422                                         ereport(PANIC,
2423                                                         (errcode_for_file_access(),
2424                                                          errmsg("could not write to log file %s "
2425                                                                         "at offset %u, length %zu: %m",
2426                                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2427                                                                         openLogOff, nbytes)));  /* NOTE(review): start offset and total length of the request, even after a partial write */
2428                                 }
2429                                 nleft -= written;       /* short write: skip what was written and loop for the rest */
2430                                 from += written;
2431                         } while (nleft > 0);
2432
2433                         /* Update state for write */
2434                         openLogOff += nbytes;
2435                         npages = 0;
2436
2437                         /*
2438                          * If we just wrote the whole last page of a logfile segment,
2439                          * fsync the segment immediately.  This avoids having to go back
2440                          * and re-open prior segments when an fsync request comes along
2441                          * later. Doing it here ensures that one and only one backend will
2442                          * perform this fsync.
2443                          *
2444                          * This is also the right place to notify the Archiver that the
2445                          * segment is ready to copy to archival storage, and to update the
2446                          * timer for archive_timeout, and to signal for a checkpoint if
2447                          * too many logfile segments have been used since the last
2448                          * checkpoint.
2449                          */
2450                         if (finishing_seg)
2451                         {
2452                                 issue_xlog_fsync(openLogFile, openLogSegNo);
2453
2454                                 /* signal that we need to wakeup walsenders later */
2455                                 WalSndWakeupRequest();
2456
2457                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
2458
2459                                 if (XLogArchivingActive())
2460                                         XLogArchiveNotifySeg(openLogSegNo);
2461
2462                                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2463
2464                                 /*
2465                                  * Request a checkpoint if we've consumed too much xlog since
2466                                  * the last one.  For speed, we first check using the local
2467                                  * copy of RedoRecPtr, which might be out of date; if it looks
2468                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
2469                                  * recheck.
2470                                  */
2471                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2472                                 {
2473                                         (void) GetRedoRecPtr();
2474                                         if (XLogCheckpointNeeded(openLogSegNo))
2475                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2476                                 }
2477                         }
2478                 }
2479
2480                 if (ispartialpage)
2481                 {
2482                         /* Only asked to write a partial page */
2483                         LogwrtResult.Write = WriteRqst.Write;
2484                         break;
2485                 }
2486                 curridx = NextBufIdx(curridx);
2487
2488                 /* If flexible, break out of loop as soon as we wrote something (npages == 0 means the pending set was just dumped) */
2489                 if (flexible && npages == 0)
2490                         break;
2491         }
2492
2493         Assert(npages == 0);
2494
2495         /*
2496          * If asked to flush, and something written is not yet flushed, do so
2497          */
2498         if (LogwrtResult.Flush < WriteRqst.Flush &&
2499                 LogwrtResult.Flush < LogwrtResult.Write)
2500
2501         {
2502                 /*
2503                  * Could get here without iterating above loop, in which case we might
2504                  * have no open file or the wrong one.  However, we do not need to
2505                  * fsync more than one file.
2506                  */
2507                 if (sync_method != SYNC_METHOD_OPEN &&
2508                         sync_method != SYNC_METHOD_OPEN_DSYNC)
2509                 {
2510                         if (openLogFile >= 0 &&
2511                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2512                                 XLogFileClose();
2513                         if (openLogFile < 0)
2514                         {
2515                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2516                                 openLogFile = XLogFileOpen(openLogSegNo);
2517                                 openLogOff = 0;
2518                         }
2519
2520                         issue_xlog_fsync(openLogFile, openLogSegNo);
2521                 }
2522
2523                 /* signal that we need to wakeup walsenders later */
2524                 WalSndWakeupRequest();
2525
2526                 LogwrtResult.Flush = LogwrtResult.Write;
2527         }
2528
2529         /*
2530          * Update shared-memory status
2531          *
2532          * We make sure that the shared 'request' values do not fall behind the
2533          * 'result' values.  This is not absolutely essential, but it saves some
2534          * code in a couple of places.
2535          */
2536         {
2537                 /* use volatile pointer to prevent code rearrangement */
2538                 volatile XLogCtlData *xlogctl = XLogCtl;
2539
2540                 SpinLockAcquire(&xlogctl->info_lck);
2541                 xlogctl->LogwrtResult = LogwrtResult;
2542                 if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
2543                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
2544                 if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
2545                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
2546                 SpinLockRelease(&xlogctl->info_lck);
2547         }
2548 }
2549
2550 /*
2551  * Remember the LSN of an asynchronous transaction commit/abort in shared
2552  * memory, and wake the WALWriter if that gives it something to do.
2553  * (Synchronous commits should not come through here.)
2554  */
2555 void
2556 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2557 {
2558         /* go through a volatile pointer to prevent code rearrangement */
2559         volatile XLogCtlData *shared = XLogCtl;
2560         bool            walwriterAsleep;
2561
2562         SpinLockAcquire(&shared->info_lck);
2563         LogwrtResult = shared->LogwrtResult;
2564         walwriterAsleep = shared->WalWriterSleeping;
2565         if (shared->asyncXactLSN < asyncXactLSN)
2566                 shared->asyncXactLSN = asyncXactLSN;
2567         SpinLockRelease(&shared->info_lck);
2568
2569         /*
2570          * A sleeping WALWriter always gets kicked, to bring it out of low-power
2571          * mode.  An awake one is only nudged when there is a full page of WAL
2572          * available for it to write.
2573          */
2574         if (!walwriterAsleep)
2575         {
2576                 /* last completed page boundary at or before the given LSN */
2577                 XLogRecPtr      pageBoundary;
2578
2579                 pageBoundary = asyncXactLSN - (asyncXactLSN % XLOG_BLCKSZ);
2580
2581                 /* already flushed that far?  then there is nothing to do */
2582                 if (pageBoundary <= LogwrtResult.Flush)
2583                         return;
2584         }
2585
2586         /*
2587          * Set the WALWriter's latch (if it has one): either it has a full page
2588          * of WAL to write, or it must leave low-power mode so that this async
2589          * commit reaches disk within the expected amount of time.
2590          */
2591         if (ProcGlobal->walwriterLatch)
2592                 SetLatch(ProcGlobal->walwriterLatch);
2593 }
2594
2595 /*
2596  * Store in shared memory the LSN up to which WAL can be removed because no
2597  * replication slot requires it.
2598  */
2599 void
2600 XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2601 {
2602         /* go through a volatile pointer to prevent code rearrangement */
2603         volatile XLogCtlData *shared = XLogCtl;
2604
2605         SpinLockAcquire(&shared->info_lck);
2606         shared->replicationSlotMinLSN = lsn;
2607         SpinLockRelease(&shared->info_lck);
2608 }
2609
2610
2611 /*
2612  * Fetch the replication-slot minimum LSN from shared memory: the oldest LSN
2613  * that must be retained to satisfy the needs of some replication slot.
2614  */
2615 static XLogRecPtr
2616 XLogGetReplicationSlotMinimumLSN(void)
2617 {
2618         /* go through a volatile pointer to prevent code rearrangement */
2619         volatile XLogCtlData *shared = XLogCtl;
2620         XLogRecPtr      slotMinLSN;
2621
2622         SpinLockAcquire(&shared->info_lck);
2623         slotMinLSN = shared->replicationSlotMinLSN;
2624         SpinLockRelease(&shared->info_lck);
2625         return slotMinLSN;
2626 }
2627
2628 /*
2629  * Advance minRecoveryPoint in control file, and refresh our local copies of it.
2630  *
2631  * If we crash during recovery, we must reach this point again before the
2632  * database is consistent.
2633  *
2634  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2635  * is only updated if it's not already greater than or equal to 'lsn'.
2636  */
2637 static void
2638 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2639 {
2640         /* Quick exit without the lock: updates are disabled, or (unless forced) our local copy already covers 'lsn' */
2641         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2642                 return;
2643
2644         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2645
2646         /* update local copy from the control file, now that we hold ControlFileLock */
2647         minRecoveryPoint = ControlFile->minRecoveryPoint;
2648         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2649
2650         /*
2651          * An invalid minRecoveryPoint means that we need to recover all the WAL,
2652          * i.e., we're doing crash recovery.  We never modify the control file's
2653          * value in that case, so we can short-circuit future checks here too.
2654          */
2655         if (minRecoveryPoint == 0)
2656                 updateMinRecoveryPoint = false;
2657         else if (force || minRecoveryPoint < lsn)
2658         {
2659                 /* use volatile pointer to prevent code rearrangement */
2660                 volatile XLogCtlData *xlogctl = XLogCtl;
2661                 XLogRecPtr      newMinRecoveryPoint;
2662                 TimeLineID      newMinRecoveryPointTLI;
2663
2664                 /*
2665                  * To avoid having to update the control file too often, we update it
2666                  * all the way to the last record being replayed, even though 'lsn'
2667                  * would suffice for correctness.  This also allows the 'force' case
2668                  * to not need a valid 'lsn' value.
2669                  *
2670                  * Another important reason for doing it this way is that the passed
2671                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2672                  * the caller got it from a corrupted heap page.  Accepting such a
2673                  * value as the min recovery point would prevent us from coming up at
2674                  * all.  Instead, we just log a warning and continue with recovery.
2675                  * (See also the comments about corrupt LSNs in XLogFlush.)
2676                  */
2677                 SpinLockAcquire(&xlogctl->info_lck);
2678                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
2679                 newMinRecoveryPointTLI = xlogctl->replayEndTLI;
2680                 SpinLockRelease(&xlogctl->info_lck);
2681
2682                 if (!force && newMinRecoveryPoint < lsn)
2683                         elog(WARNING,
2684                            "xlog min recovery request %X/%X is past current point %X/%X",
2685                                  (uint32) (lsn >> 32), (uint32) lsn,
2686                                  (uint32) (newMinRecoveryPoint >> 32),
2687                                  (uint32) newMinRecoveryPoint);
2688
2689                 /* update control file, but only ever move minRecoveryPoint forward */
2690                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2691                 {
2692                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2693                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2694                         UpdateControlFile();
2695                         minRecoveryPoint = newMinRecoveryPoint;
2696                         minRecoveryPointTLI = newMinRecoveryPointTLI;
2697
2698                         ereport(DEBUG2,
2699                                 (errmsg("updated min recovery point to %X/%X on timeline %u",
2700                                                 (uint32) (minRecoveryPoint >> 32),
2701                                                 (uint32) minRecoveryPoint,
2702                                                 newMinRecoveryPointTLI)));
2703                 }
2704         }
2705         LWLockRelease(ControlFileLock);
2706 }
2707
2708 /*
2709  * Ensure that all XLOG data through the given position is flushed to disk.
2710  *
2711  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2712  * already held, and we try to avoid acquiring it if possible.
2713  */
2714 void
2715 XLogFlush(XLogRecPtr record)
2716 {
2717         XLogRecPtr      WriteRqstPtr;
2718         XLogwrtRqst WriteRqst;
2719
2720         /*
2721          * During REDO, we are reading not writing WAL.  Therefore, instead of
2722          * trying to flush the WAL, we should update minRecoveryPoint instead. We
2723          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2724          * to act this way too, and because when it tries to write the
2725          * end-of-recovery checkpoint, it should indeed flush.
2726          */
2727         if (!XLogInsertAllowed())
2728         {
2729                 UpdateMinRecoveryPoint(record, false);
2730                 return;
2731         }
2732
2733         /* Quick exit if already known flushed */
2734         if (record <= LogwrtResult.Flush)
2735                 return;
2736
2737 #ifdef WAL_DEBUG
2738         if (XLOG_DEBUG)
2739                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2740                          (uint32) (record >> 32), (uint32) record,
2741                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2742                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2743 #endif
2744
2745         START_CRIT_SECTION();
2746
2747         /*
2748          * Since fsync is usually a horribly expensive operation, we try to
2749          * piggyback as much data as we can on each fsync: if we see any more data
2750          * entered into the xlog buffer, we'll write and fsync that too, so that
2751          * the final value of LogwrtResult.Flush is as large as possible. This
2752          * gives us some chance of avoiding another fsync immediately after.
2753          */
2754
2755         /* initialize to given target; may increase below */
2756         WriteRqstPtr = record;
2757
2758         /*
2759          * Now wait until we get the write lock, or someone else does the flush
2760          * for us.
2761          */
2762         for (;;)
2763         {
2764                 /* use volatile pointer to prevent code rearrangement */
2765                 volatile XLogCtlData *xlogctl = XLogCtl;
2766                 XLogRecPtr      insertpos;
2767
2768                 /* read LogwrtResult and update local state */
2769                 SpinLockAcquire(&xlogctl->info_lck);
2770                 if (WriteRqstPtr < xlogctl->LogwrtRqst.Write)
2771                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
2772                 LogwrtResult = xlogctl->LogwrtResult;
2773                 SpinLockRelease(&xlogctl->info_lck);
2774
2775                 /* done already? */
2776                 if (record <= LogwrtResult.Flush)
2777                         break;
2778
2779                 /*
2780                  * Before actually performing the write, wait for all in-flight
2781                  * insertions to the pages we're about to write to finish.
2782                  */
2783                 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2784
2785                 /*
2786                  * Try to get the write lock. If we can't get it immediately, wait
2787                  * until it's released, and recheck if we still need to do the flush
2788                  * or if the backend that held the lock did it for us already. This
2789                  * helps to maintain a good rate of group committing when the system
2790                  * is bottlenecked by the speed of fsyncing.
2791                  */
2792                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2793                 {
2794                         /*
2795                          * The lock is now free, but we didn't acquire it yet. Before we
2796                          * do, loop back to check if someone else flushed the record for
2797                          * us already.
2798                          */
2799                         continue;
2800                 }
2801
2802                 /* Got the lock; recheck whether request is satisfied */
2803                 LogwrtResult = XLogCtl->LogwrtResult;
2804                 if (record <= LogwrtResult.Flush)
2805                 {
2806                         LWLockRelease(WALWriteLock);
2807                         break;
2808                 }
2809
2810                 /*
2811                  * Sleep before flush! By adding a delay here, we may give further
2812                  * backends the opportunity to join the backlog of group commit
2813                  * followers; this can significantly improve transaction throughput,
2814                  * at the risk of increasing transaction latency.
2815                  *
2816                  * We do not sleep if enableFsync is not turned on, nor if there are
2817                  * fewer than CommitSiblings other backends with active transactions.
2818                  */
2819                 if (CommitDelay > 0 && enableFsync &&
2820                         MinimumActiveBackends(CommitSiblings))
2821                 {
2822                         pg_usleep(CommitDelay);
2823
2824                         /*
2825                          * Re-check how far we can now flush the WAL. It's generally not
2826                          * safe to call WaitXLogInsertionsToFinish while holding
2827                          * WALWriteLock, because an in-progress insertion might need to
2828                          * also grab WALWriteLock to make progress. But we know that all
2829                          * the insertions up to insertpos have already finished, because
2830                          * that's what the earlier WaitXLogInsertionsToFinish() returned.
2831                          * We're only calling it again to allow insertpos to be moved
2832                          * further forward, not to actually wait for anyone.
2833                          */
2834                         insertpos = WaitXLogInsertionsToFinish(insertpos);
2835                 }
2836
2837                 /* try to write/flush later additions to XLOG as well */
2838                 WriteRqst.Write = insertpos;
2839                 WriteRqst.Flush = insertpos;
2840
2841                 XLogWrite(WriteRqst, false);
2842
2843                 LWLockRelease(WALWriteLock);
2844                 /* done */
2845                 break;
2846         }
2847
2848         END_CRIT_SECTION();
2849
2850         /* wake up walsenders now that we've released heavily contended locks */
2851         WalSndWakeupProcessRequests();
2852
2853         /*
2854          * If we still haven't flushed to the request point then we have a
2855          * problem; most likely, the requested flush point is past end of XLOG.
2856          * This has been seen to occur when a disk page has a corrupted LSN.
2857          *
2858          * Formerly we treated this as a PANIC condition, but that hurts the
2859          * system's robustness rather than helping it: we do not want to take down
2860          * the whole system due to corruption on one data page.  In particular, if
2861          * the bad page is encountered again during recovery then we would be
2862          * unable to restart the database at all!  (This scenario actually
2863          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
2864          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2865          * the only time we can reach here during recovery is while flushing the
2866          * end-of-recovery checkpoint record, and we don't expect that to have a
2867          * bad LSN.
2868          *
2869          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2870          * since xact.c calls this routine inside a critical section.  However,
2871          * calls from bufmgr.c are not within critical sections and so we will not
2872          * force a restart for a bad LSN on a data page.
2873          */
2874         if (LogwrtResult.Flush < record)
2875                 elog(ERROR,
2876                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2877                          (uint32) (record >> 32), (uint32) record,
2878                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2879 }
2880
2881 /*
2882  * Flush xlog, but without specifying exactly where to flush to.
2883  *
2884  * We normally flush only completed blocks; but if there is nothing to do on
2885  * that basis, we check for unflushed async commits in the current incomplete
2886  * block, and flush through the latest one of those.  Thus, if async commits
2887  * are not being used, we will flush complete blocks only.      We can guarantee
2888  * that async commits reach disk after at most three cycles; normally only
2889  * one or two.  (When flushing complete blocks, we allow XLogWrite to write
2890  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
2891  * difference only with very high load or long wal_writer_delay, but imposes
2892  * one extra cycle for the worst case for async commits.)
2893  *
2894  * This routine is invoked periodically by the background walwriter process.
2895  *
2896  * Returns TRUE if we flushed anything.
2897  */
2898 bool
2899 XLogBackgroundFlush(void)
2900 {
2901         XLogRecPtr      WriteRqstPtr;
2902         bool            flexible = true;
2903         bool            wrote_something = false;
2904
2905         /* XLOG doesn't need flushing during recovery */
2906         if (RecoveryInProgress())
2907                 return false;
2908
2909         /* read LogwrtResult and update local state */
2910         {
2911                 /* use volatile pointer to prevent code rearrangement */
2912                 volatile XLogCtlData *xlogctl = XLogCtl;
2913
2914                 SpinLockAcquire(&xlogctl->info_lck);
2915                 LogwrtResult = xlogctl->LogwrtResult;
2916                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
2917                 SpinLockRelease(&xlogctl->info_lck);
2918         }
2919
2920         /* back off to last completed page boundary */
2921         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2922
2923         /* if we have already flushed that far, consider async commit records */
2924         if (WriteRqstPtr <= LogwrtResult.Flush)
2925         {
2926                 /* use volatile pointer to prevent code rearrangement */
2927                 volatile XLogCtlData *xlogctl = XLogCtl;
2928
2929                 SpinLockAcquire(&xlogctl->info_lck);
2930                 WriteRqstPtr = xlogctl->asyncXactLSN;
2931                 SpinLockRelease(&xlogctl->info_lck);
2932                 flexible = false;               /* ensure it all gets written */
2933         }
2934
2935         /*
2936          * If already known flushed, we're done. Just need to check if we are
2937          * holding an open file handle to a logfile that's no longer in use,
2938          * preventing the file from being deleted.
2939          */
2940         if (WriteRqstPtr <= LogwrtResult.Flush)
2941         {
2942                 if (openLogFile >= 0)
2943                 {
2944                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2945                         {
2946                                 XLogFileClose();
2947                         }
2948                 }
2949                 return false;
2950         }
2951
2952 #ifdef WAL_DEBUG
2953         if (XLOG_DEBUG)
2954                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
2955                          (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
2956                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2957                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2958 #endif
2959
2960         START_CRIT_SECTION();
2961
2962         /* now wait for any in-progress insertions to finish and get write lock */
2963         WaitXLogInsertionsToFinish(WriteRqstPtr);
2964         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2965         LogwrtResult = XLogCtl->LogwrtResult;
2966         if (WriteRqstPtr > LogwrtResult.Flush)
2967         {
2968                 XLogwrtRqst WriteRqst;
2969
2970                 WriteRqst.Write = WriteRqstPtr;
2971                 WriteRqst.Flush = WriteRqstPtr;
2972                 XLogWrite(WriteRqst, flexible);
2973                 wrote_something = true;
2974         }
2975         LWLockRelease(WALWriteLock);
2976
2977         END_CRIT_SECTION();
2978
2979         /* wake up walsenders now that we've released heavily contended locks */
2980         WalSndWakeupProcessRequests();
2981
2982         /*
2983          * Great, done. To take some work off the critical path, try to initialize
2984          * as many of the no-longer-needed WAL buffers for future use as we can.
2985          */
2986         AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
2987
2988         return wrote_something;
2989 }
2990
2991 /*
2992  * Test whether XLOG data has been flushed up to (at least) the given position.
2993  *
2994  * Returns true if a flush is still needed.  (It may be that someone else
2995  * is already in process of flushing that far, however.)
2996  */
2997 bool
2998 XLogNeedsFlush(XLogRecPtr record)
2999 {
3000         /*
3001          * During recovery, we don't flush WAL but update minRecoveryPoint
3002          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3003          * would need to be updated.
3004          */
3005         if (RecoveryInProgress())
3006         {
3007                 /* Quick exit if already known updated */
3008                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3009                         return false;
3010
3011                 /*
3012                  * Update local copy of minRecoveryPoint. But if the lock is busy,
3013                  * just return a conservative guess.
3014                  */
3015                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3016                         return true;
3017                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3018                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3019                 LWLockRelease(ControlFileLock);
3020
3021                 /*
3022                  * An invalid minRecoveryPoint means that we need to recover all the
3023                  * WAL, i.e., we're doing crash recovery.  We never modify the control
3024                  * file's value in that case, so we can short-circuit future checks
3025                  * here too.
3026                  */
3027                 if (minRecoveryPoint == 0)
3028                         updateMinRecoveryPoint = false;
3029
3030                 /* check again */
3031                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3032                         return false;
3033                 else
3034                         return true;
3035         }
3036
3037         /* Quick exit if already known flushed */
3038         if (record <= LogwrtResult.Flush)
3039                 return false;
3040
3041         /* read LogwrtResult and update local state */
3042         {
3043                 /* use volatile pointer to prevent code rearrangement */
3044                 volatile XLogCtlData *xlogctl = XLogCtl;
3045
3046                 SpinLockAcquire(&xlogctl->info_lck);
3047                 LogwrtResult = xlogctl->LogwrtResult;
3048                 SpinLockRelease(&xlogctl->info_lck);
3049         }
3050
3051         /* check again */
3052         if (record <= LogwrtResult.Flush)
3053                 return false;
3054
3055         return true;
3056 }
3057
3058 /*
3059  * Create a new XLOG file segment, or open a pre-existing one.
3060  *
3061  * log, seg: identify segment to be created/opened.
3062  *
3063  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
3064  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
3065  * file was used.
3066  *
3067  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3068  * place.  This should be TRUE except during bootstrap log creation.  The
3069  * caller must *not* hold the lock at call.
3070  *
3071  * Returns FD of opened file.
3072  *
3073  * Note: errors here are ERROR not PANIC because we might or might not be
3074  * inside a critical section (eg, during checkpoint there is no reason to
3075  * take down the system on failure).  They will promote to PANIC if we are
3076  * in a critical section.
3077  */
3078 int
3079 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3080 {
3081         char            path[MAXPGPATH];
3082         char            tmppath[MAXPGPATH];
3083         char       *zbuffer;
3084         XLogSegNo       installed_segno;
3085         int                     max_advance;
3086         int                     fd;
3087         int                     nbytes;
3088
3089         XLogFilePath(path, ThisTimeLineID, logsegno);
3090
3091         /*
3092          * Try to use existent file (checkpoint maker may have created it already)
3093          */
3094         if (*use_existent)
3095         {
3096                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3097                                                    S_IRUSR | S_IWUSR);
3098                 if (fd < 0)
3099                 {
3100                         if (errno != ENOENT)
3101                                 ereport(ERROR,
3102                                                 (errcode_for_file_access(),
3103                                                  errmsg("could not open file \"%s\": %m", path)));
3104                 }
3105                 else
3106                         return fd;
3107         }
3108
3109         /*
3110          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
3111          * another process is doing the same thing.  If so, we will end up
3112          * pre-creating an extra log segment.  That seems OK, and better than
3113          * holding the lock throughout this lengthy process.
3114          */
3115         elog(DEBUG2, "creating and filling new WAL file");
3116
3117         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3118
3119         unlink(tmppath);
3120
3121         /*
3122          * Allocate a buffer full of zeros. This is done before opening the file
3123          * so that we don't leak the file descriptor if palloc fails.
3124          *
3125          * Note: palloc zbuffer, instead of just using a local char array, to
3126          * ensure it is reasonably well-aligned; this may save a few cycles
3127          * transferring data to the kernel.
3128          */
3129         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
3130
3131         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3132         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3133                                            S_IRUSR | S_IWUSR);
3134         if (fd < 0)
3135                 ereport(ERROR,
3136                                 (errcode_for_file_access(),
3137                                  errmsg("could not create file \"%s\": %m", tmppath)));
3138
3139         /*
3140          * Zero-fill the file.  We have to do this the hard way to ensure that all
3141          * the file space has really been allocated --- on platforms that allow
3142          * "holes" in files, just seeking to the end doesn't allocate intermediate
3143          * space.  This way, we know that we have all the space and (after the
3144          * fsync below) that all the indirect blocks are down on disk.  Therefore,
3145          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
3146          * log file.
3147          */
3148         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
3149         {
3150                 errno = 0;
3151                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
3152                 {
3153                         int                     save_errno = errno;
3154
3155                         /*
3156                          * If we fail to make the file, delete it to release disk space
3157                          */
3158                         unlink(tmppath);
3159
3160                         close(fd);
3161
3162                         /* if write didn't set errno, assume problem is no disk space */
3163                         errno = save_errno ? save_errno : ENOSPC;
3164
3165                         ereport(ERROR,
3166                                         (errcode_for_file_access(),
3167                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3168                 }
3169         }
3170         pfree(zbuffer);
3171
3172         if (pg_fsync(fd) != 0)
3173         {
3174                 close(fd);
3175                 ereport(ERROR,
3176                                 (errcode_for_file_access(),
3177                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3178         }
3179
3180         if (close(fd))
3181                 ereport(ERROR,
3182                                 (errcode_for_file_access(),
3183                                  errmsg("could not close file \"%s\": %m", tmppath)));
3184
3185         /*
3186          * Now move the segment into place with its final name.
3187          *
3188          * If caller didn't want to use a pre-existing file, get rid of any
3189          * pre-existing file.  Otherwise, cope with possibility that someone else
3190          * has created the file while we were filling ours: if so, use ours to
3191          * pre-create a future log segment.
3192          */
3193         installed_segno = logsegno;
3194         max_advance = XLOGfileslop;
3195         if (!InstallXLogFileSegment(&installed_segno, tmppath,
3196                                                                 *use_existent, &max_advance,
3197                                                                 use_lock))
3198         {
3199                 /*
3200                  * No need for any more future segments, or InstallXLogFileSegment()
3201                  * failed to rename the file into place. If the rename failed, opening
3202                  * the file below will fail.
3203                  */
3204                 unlink(tmppath);
3205         }
3206
3207         /* Set flag to tell caller there was no existent file */
3208         *use_existent = false;
3209
3210         /* Now open original target segment (might not be file I just made) */
3211         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3212                                            S_IRUSR | S_IWUSR);
3213         if (fd < 0)
3214                 ereport(ERROR,
3215                                 (errcode_for_file_access(),
3216                                  errmsg("could not open file \"%s\": %m", path)));
3217
3218         elog(DEBUG2, "done creating and filling new WAL file");
3219
3220         return fd;
3221 }
3222
3223 /*
3224  * Create a new XLOG file segment by copying a pre-existing one.
3225  *
3226  * destsegno: identify segment to be created.
3227  *
3228  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
3229  *              a different timeline)
3230  *
3231  * Currently this is only used during recovery, and so there are no locking
3232  * considerations.      But we should be just as tense as XLogFileInit to avoid
3233  * emplacing a bogus file.
3234  */
3235 static void
3236 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
3237 {
3238         char            path[MAXPGPATH];
3239         char            tmppath[MAXPGPATH];
3240         char            buffer[XLOG_BLCKSZ];
3241         int                     srcfd;
3242         int                     fd;
3243         int                     nbytes;
3244
3245         /*
3246          * Open the source file
3247          */
3248         XLogFilePath(path, srcTLI, srcsegno);
3249         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
3250         if (srcfd < 0)
3251                 ereport(ERROR,
3252                                 (errcode_for_file_access(),
3253                                  errmsg("could not open file \"%s\": %m", path)));
3254
3255         /*
3256          * Copy into a temp file name.
3257          */
3258         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3259
3260         unlink(tmppath);
3261
3262         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3263         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3264                                                    S_IRUSR | S_IWUSR);
3265         if (fd < 0)
3266                 ereport(ERROR,
3267                                 (errcode_for_file_access(),
3268                                  errmsg("could not create file \"%s\": %m", tmppath)));
3269
3270         /*
3271          * Do the data copying.
3272          */
3273         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
3274         {
3275                 errno = 0;
3276                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3277                 {
3278                         if (errno != 0)
3279                                 ereport(ERROR,
3280                                                 (errcode_for_file_access(),
3281                                                  errmsg("could not read file \"%s\": %m", path)));
3282                         else
3283                                 ereport(ERROR,
3284                                                 (errmsg("not enough data in file \"%s\"", path)));
3285                 }
3286                 errno = 0;
3287                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3288                 {
3289                         int                     save_errno = errno;
3290
3291                         /*
3292                          * If we fail to make the file, delete it to release disk space
3293                          */
3294                         unlink(tmppath);
3295                         /* if write didn't set errno, assume problem is no disk space */
3296                         errno = save_errno ? save_errno : ENOSPC;
3297
3298                         ereport(ERROR,
3299                                         (errcode_for_file_access(),
3300                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3301                 }
3302         }
3303
3304         if (pg_fsync(fd) != 0)
3305                 ereport(ERROR,
3306                                 (errcode_for_file_access(),
3307                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3308
3309         if (CloseTransientFile(fd))
3310                 ereport(ERROR,
3311                                 (errcode_for_file_access(),
3312                                  errmsg("could not close file \"%s\": %m", tmppath)));
3313
3314         CloseTransientFile(srcfd);
3315
3316         /*
3317          * Now move the segment into place with its final name.
3318          */
3319         if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
3320                 elog(ERROR, "InstallXLogFileSegment should not have failed");
3321 }
3322
3323 /*
3324  * Install a new XLOG segment file as a current or future log segment.
3325  *
3326  * This is used both to install a newly-created segment (which has a temp
3327  * filename while it's being created) and to recycle an old segment.
3328  *
3329  * *segno: identify segment to install as (or first possible target).
3330  * When find_free is TRUE, this is modified on return to indicate the
3331  * actual installation location or last segment searched.
3332  *
3333  * tmppath: initial name of file to install.  It will be renamed into place.
3334  *
3335  * find_free: if TRUE, install the new segment at the first empty segno
3336  * number at or after the passed numbers.  If FALSE, install the new segment
3337  * exactly where specified, deleting any existing segment file there.
3338  *
3339  * *max_advance: maximum number of segno slots to advance past the starting
3340  * point.  Fail if no free slot is found in this range.  On return, reduced
3341  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
3342  * when find_free is FALSE.)
3343  *
3344  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3345  * place.  This should be TRUE except during bootstrap log creation.  The
3346  * caller must *not* hold the lock at call.
3347  *
3348  * Returns TRUE if the file was installed successfully.  FALSE indicates that
3349  * max_advance limit was exceeded, or an error occurred while renaming the
3350  * file into place.
3351  */
3352 static bool
3353 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3354                                            bool find_free, int *max_advance,
3355                                            bool use_lock)
3356 {
3357         char            path[MAXPGPATH];
3358         struct stat stat_buf;
3359
3360         XLogFilePath(path, ThisTimeLineID, *segno);
3361
3362         /*
3363          * We want to be sure that only one process does this at a time.
3364          */
3365         if (use_lock)
3366                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3367
3368         if (!find_free)
3369         {
3370                 /* Force installation: get rid of any pre-existing segment file */
3371                 unlink(path);
3372         }
3373         else
3374         {
3375                 /* Find a free slot to put it in */
3376                 while (stat(path, &stat_buf) == 0)
3377                 {
3378                         if (*max_advance <= 0)
3379                         {
3380                                 /* Failed to find a free slot within specified range */
3381                                 if (use_lock)
3382                                         LWLockRelease(ControlFileLock);
3383                                 return false;
3384                         }
3385                         (*segno)++;
3386                         (*max_advance)--;
3387                         XLogFilePath(path, ThisTimeLineID, *segno);
3388                 }
3389         }
3390
3391         /*
3392          * Prefer link() to rename() here just to be really sure that we don't
3393          * overwrite an existing logfile.  However, there shouldn't be one, so
3394          * rename() is an acceptable substitute except for the truly paranoid.
3395          */
3396 #if HAVE_WORKING_LINK
3397         if (link(tmppath, path) < 0)
3398         {
3399                 if (use_lock)
3400                         LWLockRelease(ControlFileLock);
3401                 ereport(LOG,
3402                                 (errcode_for_file_access(),
3403                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
3404                                                 tmppath, path)));
3405                 return false;
3406         }
3407         unlink(tmppath);
3408 #else
3409         if (rename(tmppath, path) < 0)
3410         {
3411                 if (use_lock)
3412                         LWLockRelease(ControlFileLock);
3413                 ereport(LOG,
3414                                 (errcode_for_file_access(),
3415                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
3416                                                 tmppath, path)));
3417                 return false;
3418         }
3419 #endif
3420
3421         if (use_lock)
3422                 LWLockRelease(ControlFileLock);
3423
3424         return true;
3425 }
3426
3427 /*
3428  * Open a pre-existing logfile segment for writing.
3429  */
3430 int
3431 XLogFileOpen(XLogSegNo segno)
3432 {
3433         char            path[MAXPGPATH];
3434         int                     fd;
3435
3436         XLogFilePath(path, ThisTimeLineID, segno);
3437
3438         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3439                                            S_IRUSR | S_IWUSR);
3440         if (fd < 0)
3441                 ereport(PANIC,
3442                                 (errcode_for_file_access(),
3443                                  errmsg("could not open transaction log file \"%s\": %m", path)));
3444
3445         return fd;
3446 }
3447
3448 /*
3449  * Open a logfile segment for reading (during recovery).
3450  *
3451  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3452  * Otherwise, it's assumed to be already available in pg_xlog.
3453  */
3454 static int
3455 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3456                          int source, bool notfoundOk)
3457 {
3458         char            xlogfname[MAXFNAMELEN];
3459         char            activitymsg[MAXFNAMELEN + 16];
3460         char            path[MAXPGPATH];
3461         int                     fd;
3462
3463         XLogFileName(xlogfname, tli, segno);
3464
3465         switch (source)
3466         {
3467                 case XLOG_FROM_ARCHIVE:
3468                         /* Report recovery progress in PS display */
3469                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3470                                          xlogfname);
3471                         set_ps_display(activitymsg, false);
3472
3473                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3474                                                                                                           "RECOVERYXLOG",
3475                                                                                                           XLogSegSize,
3476                                                                                                           InRedo);
3477                         if (!restoredFromArchive)
3478                                 return -1;
3479                         break;
3480
3481                 case XLOG_FROM_PG_XLOG:
3482                 case XLOG_FROM_STREAM:
3483                         XLogFilePath(path, tli, segno);
3484                         restoredFromArchive = false;
3485                         break;
3486
3487                 default:
3488                         elog(ERROR, "invalid XLogFileRead source %d", source);
3489         }
3490
3491         /*
3492          * If the segment was fetched from archival storage, replace the existing
3493          * xlog segment (if any) with the archival version.
3494          */
3495         if (source == XLOG_FROM_ARCHIVE)
3496         {
3497                 KeepFileRestoredFromArchive(path, xlogfname);
3498
3499                 /*
3500                  * Set path to point at the new file in pg_xlog.
3501                  */
3502                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3503         }
3504
3505         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
3506         if (fd >= 0)
3507         {
3508                 /* Success! */
3509                 curFileTLI = tli;
3510
3511                 /* Report recovery progress in PS display */
3512                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3513                                  xlogfname);
3514                 set_ps_display(activitymsg, false);
3515
3516                 /* Track source of data in assorted state variables */
3517                 readSource = source;
3518                 XLogReceiptSource = source;
3519                 /* In FROM_STREAM case, caller tracks receipt time, not me */
3520                 if (source != XLOG_FROM_STREAM)
3521                         XLogReceiptTime = GetCurrentTimestamp();
3522
3523                 return fd;
3524         }
3525         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3526                 ereport(PANIC,
3527                                 (errcode_for_file_access(),
3528                                  errmsg("could not open file \"%s\": %m", path)));
3529         return -1;
3530 }
3531
3532 /*
3533  * Open a logfile segment for reading (during recovery).
3534  *
3535  * This version searches for the segment with any TLI listed in expectedTLEs.
3536  */
3537 static int
3538 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3539 {
3540         char            path[MAXPGPATH];
3541         ListCell   *cell;
3542         int                     fd;
3543         List       *tles;
3544
3545         /*
3546          * Loop looking for a suitable timeline ID: we might need to read any of
3547          * the timelines listed in expectedTLEs.
3548          *
3549          * We expect curFileTLI on entry to be the TLI of the preceding file in
3550          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
3551          * to go backwards; this prevents us from picking up the wrong file when a
3552          * parent timeline extends to higher segment numbers than the child we
3553          * want to read.
3554          *
3555          * If we haven't read the timeline history file yet, read it now, so that
3556          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
3557          * however, unless we actually find a valid segment.  That way if there is
3558          * neither a timeline history file nor a WAL segment in the archive, and
3559          * streaming replication is set up, we'll read the timeline history file
3560          * streamed from the master when we start streaming, instead of recovering
3561          * with a dummy history generated here.
3562          */
3563         if (expectedTLEs)
3564                 tles = expectedTLEs;
3565         else
3566                 tles = readTimeLineHistory(recoveryTargetTLI);
3567
3568         foreach(cell, tles)
3569         {
3570                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3571
3572                 if (tli < curFileTLI)
3573                         break;                          /* don't bother looking at too-old TLIs */
3574
3575                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3576                 {
3577                         fd = XLogFileRead(segno, emode, tli,
3578                                                           XLOG_FROM_ARCHIVE, true);
3579                         if (fd != -1)
3580                         {
3581                                 elog(DEBUG1, "got WAL segment from archive");
3582                                 if (!expectedTLEs)
3583                                         expectedTLEs = tles;
3584                                 return fd;
3585                         }
3586                 }
3587
3588                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
3589                 {
3590                         fd = XLogFileRead(segno, emode, tli,
3591                                                           XLOG_FROM_PG_XLOG, true);
3592                         if (fd != -1)
3593                         {
3594                                 if (!expectedTLEs)
3595                                         expectedTLEs = tles;
3596                                 return fd;
3597                         }
3598                 }
3599         }
3600
3601         /* Couldn't find it.  For simplicity, complain about front timeline */
3602         XLogFilePath(path, recoveryTargetTLI, segno);
3603         errno = ENOENT;
3604         ereport(emode,
3605                         (errcode_for_file_access(),
3606                          errmsg("could not open file \"%s\": %m", path)));
3607         return -1;
3608 }
3609
3610 /*
3611  * Close the current logfile segment for writing.
3612  */
3613 static void
3614 XLogFileClose(void)
3615 {
3616         Assert(openLogFile >= 0);
3617
3618         /*
3619          * WAL segment files will not be re-read in normal operation, so we advise
3620          * the OS to release any cached pages.  But do not do so if WAL archiving
3621          * or streaming is active, because archiver and walsender process could
3622          * use the cache to read the WAL segment.
3623          */
3624 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3625         if (!XLogIsNeeded())
3626                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3627 #endif
3628
3629         if (close(openLogFile))
3630                 ereport(PANIC,
3631                                 (errcode_for_file_access(),
3632                                  errmsg("could not close log file %s: %m",
3633                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3634         openLogFile = -1;
3635 }
3636
3637 /*
3638  * Preallocate log files beyond the specified log endpoint.
3639  *
3640  * XXX this is currently extremely conservative, since it forces only one
3641  * future log segment to exist, and even that only if we are 75% done with
3642  * the current one.  This is only appropriate for very low-WAL-volume systems.
3643  * High-volume systems will be OK once they've built up a sufficient set of
3644  * recycled log segments, but the startup transient is likely to include
3645  * a lot of segment creations by foreground processes, which is not so good.
3646  */
3647 static void
3648 PreallocXlogFiles(XLogRecPtr endptr)
3649 {
3650         XLogSegNo       _logSegNo;
3651         int                     lf;
3652         bool            use_existent;
3653
3654         XLByteToPrevSeg(endptr, _logSegNo);
3655         if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
3656         {
3657                 _logSegNo++;
3658                 use_existent = true;
3659                 lf = XLogFileInit(_logSegNo, &use_existent, true);
3660                 close(lf);
3661                 if (!use_existent)
3662                         CheckpointStats.ckpt_segs_added++;
3663         }
3664 }
3665
3666 /*
3667  * Throws an error if the given log segment has already been removed or
3668  * recycled. The caller should only pass a segment that it knows to have
3669  * existed while the server has been running, as this function always
3670  * succeeds if no WAL segments have been removed since startup.
3671  * 'tli' is only used in the error message.
3672  */
3673 void
3674 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3675 {
3676         /* use volatile pointer to prevent code rearrangement */
3677         volatile XLogCtlData *xlogctl = XLogCtl;
3678         XLogSegNo       lastRemovedSegNo;
3679
3680         SpinLockAcquire(&xlogctl->info_lck);
3681         lastRemovedSegNo = xlogctl->lastRemovedSegNo;
3682         SpinLockRelease(&xlogctl->info_lck);
3683
3684         if (segno <= lastRemovedSegNo)
3685         {
3686                 char            filename[MAXFNAMELEN];
3687
3688                 XLogFileName(filename, tli, segno);
3689                 ereport(ERROR,
3690                                 (errcode_for_file_access(),
3691                                  errmsg("requested WAL segment %s has already been removed",
3692                                                 filename)));
3693         }
3694 }
3695
3696 /*
3697  * Return the last WAL segment removed, or 0 if no segment has been removed
3698  * since startup.
3699  *
3700  * NB: the result can be out of date arbitrarily fast, the caller has to deal
3701  * with that.
3702  */
3703 XLogSegNo
3704 XLogGetLastRemovedSegno(void)
3705 {
3706         /* use volatile pointer to prevent code rearrangement */
3707         volatile XLogCtlData *xlogctl = XLogCtl;
3708         XLogSegNo       lastRemovedSegNo;
3709
3710         SpinLockAcquire(&xlogctl->info_lck);
3711         lastRemovedSegNo = xlogctl->lastRemovedSegNo;
3712         SpinLockRelease(&xlogctl->info_lck);
3713
3714         return lastRemovedSegNo;
3715 }
3716
3717 /*
3718  * Update the last removed segno pointer in shared memory, to reflect
3719  * that the given XLOG file has been removed.
3720  */
3721 static void
3722 UpdateLastRemovedPtr(char *filename)
3723 {
3724         /* use volatile pointer to prevent code rearrangement */
3725         volatile XLogCtlData *xlogctl = XLogCtl;
3726         uint32          tli;
3727         XLogSegNo       segno;
3728
3729         XLogFromFileName(filename, &tli, &segno);
3730
3731         SpinLockAcquire(&xlogctl->info_lck);
3732         if (segno > xlogctl->lastRemovedSegNo)
3733                 xlogctl->lastRemovedSegNo = segno;
3734         SpinLockRelease(&xlogctl->info_lck);
3735 }
3736
3737 /*
3738  * Recycle or remove all log files older or equal to passed segno
3739  *
3740  * endptr is current (or recent) end of xlog; this is used to determine
3741  * whether we want to recycle rather than delete no-longer-wanted log files.
3742  */
3743 static void
3744 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
3745 {
3746         XLogSegNo       endlogSegNo;
3747         int                     max_advance;
3748         DIR                *xldir;
3749         struct dirent *xlde;
3750         char            lastoff[MAXFNAMELEN];
3751         char            path[MAXPGPATH];
3752
3753 #ifdef WIN32
3754         char            newpath[MAXPGPATH];
3755 #endif
3756         struct stat statbuf;
3757
3758         /*
3759          * Initialize info about where to try to recycle to.  We allow recycling
3760          * segments up to XLOGfileslop segments beyond the current XLOG location.
3761          */
3762         XLByteToPrevSeg(endptr, endlogSegNo);
3763         max_advance = XLOGfileslop;
3764
3765         xldir = AllocateDir(XLOGDIR);
3766         if (xldir == NULL)
3767                 ereport(ERROR,
3768                                 (errcode_for_file_access(),
3769                                  errmsg("could not open transaction log directory \"%s\": %m",
3770                                                 XLOGDIR)));
3771
3772         /*
3773          * Construct a filename of the last segment to be kept. The timeline ID
3774          * doesn't matter, we ignore that in the comparison. (During recovery,
3775          * ThisTimeLineID isn't set, so we can't use that.)
3776          */
3777         XLogFileName(lastoff, 0, segno);
3778
3779         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3780                  lastoff);
3781
3782         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3783         {
3784                 /*
3785                  * We ignore the timeline part of the XLOG segment identifiers in
3786                  * deciding whether a segment is still needed.  This ensures that we
3787                  * won't prematurely remove a segment from a parent timeline. We could
3788                  * probably be a little more proactive about removing segments of
3789                  * non-parent timelines, but that would be a whole lot more
3790                  * complicated.
3791                  *
3792                  * We use the alphanumeric sorting property of the filenames to decide
3793                  * which ones are earlier than the lastoff segment.
3794                  */
3795                 if (strlen(xlde->d_name) == 24 &&
3796                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3797                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3798                 {
3799                         if (XLogArchiveCheckDone(xlde->d_name))
3800                         {
3801                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3802
3803                                 /* Update the last removed location in shared memory first */
3804                                 UpdateLastRemovedPtr(xlde->d_name);
3805
3806                                 /*
3807                                  * Before deleting the file, see if it can be recycled as a
3808                                  * future log segment. Only recycle normal files, pg_standby
3809                                  * for example can create symbolic links pointing to a
3810                                  * separate archive directory.
3811                                  */
3812                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
3813                                         InstallXLogFileSegment(&endlogSegNo, path,
3814                                                                                    true, &max_advance, true))
3815                                 {
3816                                         ereport(DEBUG2,
3817                                                         (errmsg("recycled transaction log file \"%s\"",
3818                                                                         xlde->d_name)));
3819                                         CheckpointStats.ckpt_segs_recycled++;
3820                                         /* Needn't recheck that slot on future iterations */
3821                                         if (max_advance > 0)
3822                                         {
3823                                                 endlogSegNo++;
3824                                                 max_advance--;
3825                                         }
3826                                 }
3827                                 else
3828                                 {
3829                                         /* No need for any more future segments... */
3830                                         int                     rc;
3831
3832                                         ereport(DEBUG2,
3833                                                         (errmsg("removing transaction log file \"%s\"",
3834                                                                         xlde->d_name)));
3835
3836 #ifdef WIN32
3837                                         /*
3838                                          * On Windows, if another process (e.g another backend)
3839                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
3840                                          * will succeed, but the file will still show up in
3841                                          * directory listing until the last handle is closed. To
3842                                          * avoid confusing the lingering deleted file for a live
3843                                          * WAL file that needs to be archived, rename it before
3844                                          * deleting it.
3845                                          *
3846                                          * If another process holds the file open without
3847                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
3848                                          * again at the next checkpoint.
3849                                          */
3850                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
3851                                         if (rename(path, newpath) != 0)
3852                                         {
3853                                                 ereport(LOG,
3854                                                                 (errcode_for_file_access(),
3855                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
3856                                                                                 path)));
3857                                                 continue;
3858                                         }
3859                                         rc = unlink(newpath);
3860 #else
3861                                         rc = unlink(path);
3862 #endif
3863                                         if (rc != 0)
3864                                         {
3865                                                 ereport(LOG,
3866                                                                 (errcode_for_file_access(),
3867                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
3868                                                                                 path)));
3869                                                 continue;
3870                                         }
3871                                         CheckpointStats.ckpt_segs_removed++;
3872                                 }
3873
3874                                 XLogArchiveCleanup(xlde->d_name);
3875                         }
3876                 }
3877         }
3878
3879         FreeDir(xldir);
3880 }
3881
3882 /*
3883  * Verify whether pg_xlog and pg_xlog/archive_status exist.
3884  * If the latter does not exist, recreate it.
3885  *
3886  * It is not the goal of this function to verify the contents of these
3887  * directories, but to help in cases where someone has performed a cluster
3888  * copy for PITR purposes but omitted pg_xlog from the copy.
3889  *
3890  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
3891  * policy decision was made not to.  It is fairly common for pg_xlog to be
3892  * a symlink, and if that was the DBA's intent then automatically making a
3893  * plain directory would result in degraded performance with no notice.
3894  */
3895 static void
3896 ValidateXLOGDirectoryStructure(void)
3897 {
3898         char            path[MAXPGPATH];
3899         struct stat stat_buf;
3900
3901         /* Check for pg_xlog; if it doesn't exist, error out */
3902         if (stat(XLOGDIR, &stat_buf) != 0 ||
3903                 !S_ISDIR(stat_buf.st_mode))
3904                 ereport(FATAL,
3905                                 (errmsg("required WAL directory \"%s\" does not exist",
3906                                                 XLOGDIR)));
3907
3908         /* Check for archive_status */
3909         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
3910         if (stat(path, &stat_buf) == 0)
3911         {
3912                 /* Check for weird cases where it exists but isn't a directory */
3913                 if (!S_ISDIR(stat_buf.st_mode))
3914                         ereport(FATAL,
3915                                         (errmsg("required WAL directory \"%s\" does not exist",
3916                                                         path)));
3917         }
3918         else
3919         {
3920                 ereport(LOG,
3921                                 (errmsg("creating missing WAL directory \"%s\"", path)));
3922                 if (mkdir(path, S_IRWXU) < 0)
3923                         ereport(FATAL,
3924                                         (errmsg("could not create missing directory \"%s\": %m",
3925                                                         path)));
3926         }
3927 }
3928
3929 /*
3930  * Remove previous backup history files.  This also retries creation of
3931  * .ready files for any backup history files for which XLogArchiveNotify
3932  * failed earlier.
3933  */
3934 static void
3935 CleanupBackupHistory(void)
3936 {
3937         DIR                *xldir;
3938         struct dirent *xlde;
3939         char            path[MAXPGPATH];
3940
3941         xldir = AllocateDir(XLOGDIR);
3942         if (xldir == NULL)
3943                 ereport(ERROR,
3944                                 (errcode_for_file_access(),
3945                                  errmsg("could not open transaction log directory \"%s\": %m",
3946                                                 XLOGDIR)));
3947
3948         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3949         {
3950                 if (strlen(xlde->d_name) > 24 &&
3951                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3952                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
3953                                    ".backup") == 0)
3954                 {
3955                         if (XLogArchiveCheckDone(xlde->d_name))
3956                         {
3957                                 ereport(DEBUG2,
3958                                 (errmsg("removing transaction log backup history file \"%s\"",
3959                                                 xlde->d_name)));
3960                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3961                                 unlink(path);
3962                                 XLogArchiveCleanup(xlde->d_name);
3963                         }
3964                 }
3965         }
3966
3967         FreeDir(xldir);
3968 }
3969
3970 /*
3971  * Restore a full-page image from a backup block attached to an XLOG record.
3972  *
3973  * lsn: LSN of the XLOG record being replayed
3974  * record: the complete XLOG record
3975  * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1)
3976  * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock
3977  * keep_buffer: TRUE to return the buffer still locked and pinned
3978  *
3979  * Returns the buffer number containing the page.  Note this is not terribly
3980  * useful unless keep_buffer is specified as TRUE.
3981  *
3982  * Note: when a backup block is available in XLOG, we restore it
3983  * unconditionally, even if the page in the database appears newer.
3984  * This is to protect ourselves against database pages that were partially
3985  * or incorrectly written during a crash.  We assume that the XLOG data
3986  * must be good because it has passed a CRC check, while the database
3987  * page might not be.  This will force us to replay all subsequent
3988  * modifications of the page that appear in XLOG, rather than possibly
3989  * ignoring them as already applied, but that's not a huge drawback.
3990  *
3991  * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer,
3992  * else a normal exclusive lock is used.  During crash recovery, that's just
3993  * pro forma because there can't be any regular backends in the system, but
3994  * in hot standby mode the distinction is important.
3995  *
3996  * If 'keep_buffer' is true, return without releasing the buffer lock and pin;
3997  * then caller is responsible for doing UnlockReleaseBuffer() later.  This
3998  * is needed in some cases when replaying XLOG records that touch multiple
3999  * pages, to prevent inconsistent states from being visible to other backends.
4000  * (Again, that's only important in hot standby mode.)
4001  */
4002 Buffer
4003 RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
4004                                    bool get_cleanup_lock, bool keep_buffer)
4005 {
4006         BkpBlock        bkpb;
4007         char       *blk;
4008         int                     i;
4009
4010         /* Locate requested BkpBlock in the record */
4011         blk = (char *) XLogRecGetData(record) + record->xl_len;
4012         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
4013         {
4014                 if (!(record->xl_info & XLR_BKP_BLOCK(i)))
4015                         continue;
4016
4017                 memcpy(&bkpb, blk, sizeof(BkpBlock));
4018                 blk += sizeof(BkpBlock);
4019
4020                 if (i == block_index)
4021                 {
4022                         /* Found it, apply the update */
4023                         return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock,
4024                                                                                           keep_buffer);
4025                 }
4026
4027                 blk += BLCKSZ - bkpb.hole_length;
4028         }
4029
4030         /* Caller specified a bogus block_index */
4031         elog(ERROR, "failed to restore block_index %d", block_index);
4032         return InvalidBuffer;           /* keep compiler quiet */
4033 }
4034
4035 /*
4036  * Workhorse for RestoreBackupBlock usable without an xlog record
4037  *
4038  * Restores a full-page image from BkpBlock and a data pointer.
4039  */
4040 static Buffer
4041 RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
4042                                                    bool get_cleanup_lock, bool keep_buffer)
4043 {
4044         Buffer          buffer;
4045         Page            page;
4046
4047         buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
4048                                                                         RBM_ZERO);
4049         Assert(BufferIsValid(buffer));
4050         if (get_cleanup_lock)
4051                 LockBufferForCleanup(buffer);
4052         else
4053                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4054
4055         page = (Page) BufferGetPage(buffer);
4056
4057         if (bkpb.hole_length == 0)
4058         {
4059                 memcpy((char *) page, blk, BLCKSZ);
4060         }
4061         else
4062         {
4063                 memcpy((char *) page, blk, bkpb.hole_offset);
4064                 /* must zero-fill the hole */
4065                 MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
4066                 memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
4067                            blk + bkpb.hole_offset,
4068                            BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
4069         }
4070
4071         /*
4072          * The checksum value on this page is currently invalid. We don't need to
4073          * reset it here since it will be set before being written.
4074          */
4075
4076         PageSetLSN(page, lsn);
4077         MarkBufferDirty(buffer);
4078
4079         if (!keep_buffer)
4080                 UnlockReleaseBuffer(buffer);
4081
4082         return buffer;
4083 }
4084
4085 /*
4086  * Attempt to read an XLOG record.
4087  *
4088  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
4089  * try to read a record just after the last one previously read.
4090  *
4091  * If no valid record is available, returns NULL, or fails if emode is PANIC.
4092  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4093  * record is available.
4094  *
4095  * The record is copied into readRecordBuf, so that on successful return,
4096  * the returned record pointer always points there.
4097  */
4098 static XLogRecord *
4099 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
4100                    bool fetching_ckpt)
4101 {
4102         XLogRecord *record;
4103         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4104
4105         /* Pass through parameters to XLogPageRead */
4106         private->fetching_ckpt = fetching_ckpt;
4107         private->emode = emode;
4108         private->randAccess = (RecPtr != InvalidXLogRecPtr);
4109
4110         /* This is the first attempt to read this page. */
4111         lastSourceFailed = false;
4112
4113         for (;;)
4114         {
4115                 char       *errormsg;
4116
4117                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
4118                 ReadRecPtr = xlogreader->ReadRecPtr;
4119                 EndRecPtr = xlogreader->EndRecPtr;
4120                 if (record == NULL)
4121                 {
4122                         if (readFile >= 0)
4123                         {
4124                                 close(readFile);
4125                                 readFile = -1;
4126                         }
4127
4128                         /*
4129                          * We only end up here without a message when XLogPageRead()
4130                          * failed - in that case we already logged something. In
4131                          * StandbyMode that only happens if we have been triggered, so we
4132                          * shouldn't loop anymore in that case.
4133                          */
4134                         if (errormsg)
4135                                 ereport(emode_for_corrupt_record(emode,
4136                                                                                                  RecPtr ? RecPtr : EndRecPtr),
4137                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
4138                 }
4139
4140                 /*
4141                  * Check page TLI is one of the expected values.
4142                  */
4143                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4144                 {
4145                         char            fname[MAXFNAMELEN];
4146                         XLogSegNo       segno;
4147                         int32           offset;
4148
4149                         XLByteToSeg(xlogreader->latestPagePtr, segno);
4150                         offset = xlogreader->latestPagePtr % XLogSegSize;
4151                         XLogFileName(fname, xlogreader->readPageTLI, segno);
4152                         ereport(emode_for_corrupt_record(emode,
4153                                                                                          RecPtr ? RecPtr : EndRecPtr),
4154                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4155                                         xlogreader->latestPageTLI,
4156                                         fname,
4157                                         offset)));
4158                         record = NULL;
4159                 }
4160
4161                 if (record)
4162                 {
4163                         /* Great, got a record */
4164                         return record;
4165                 }
4166                 else
4167                 {
4168                         /* No valid record available from this source */
4169                         lastSourceFailed = true;
4170
4171                         /*
4172                          * If archive recovery was requested, but we were still doing
4173                          * crash recovery, switch to archive recovery and retry using the
4174                          * offline archive. We have now replayed all the valid WAL in
4175                          * pg_xlog, so we are presumably now consistent.
4176                          *
4177                          * We require that there's at least some valid WAL present in
4178                          * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
4179                          * from the archive, even if pg_xlog is completely empty, but we'd
4180                          * have no idea how far we'd have to replay to reach consistency.
4181                          * So err on the safe side and give up.
4182                          */
4183                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4184                                 !fetching_ckpt)
4185                         {
4186                                 ereport(DEBUG1,
4187                                                 (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
4188                                 InArchiveRecovery = true;
4189                                 if (StandbyModeRequested)
4190                                         StandbyMode = true;
4191
4192                                 /* initialize minRecoveryPoint to this record */
4193                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4194                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4195                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
4196                                 {
4197                                         ControlFile->minRecoveryPoint = EndRecPtr;
4198                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4199                                 }
4200                                 /* update local copy */
4201                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
4202                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4203
4204                                 UpdateControlFile();
4205                                 LWLockRelease(ControlFileLock);
4206
4207                                 CheckRecoveryConsistency();
4208
4209                                 /*
4210                                  * Before we retry, reset lastSourceFailed and currentSource
4211                                  * so that we will check the archive next.
4212                                  */
4213                                 lastSourceFailed = false;
4214                                 currentSource = 0;
4215
4216                                 continue;
4217                         }
4218
4219                         /* In standby mode, loop back to retry. Otherwise, give up. */
4220                         if (StandbyMode && !CheckForStandbyTrigger())
4221                                 continue;
4222                         else
4223                                 return NULL;
4224                 }
4225         }
4226 }
4227
4228 /*
4229  * Scan for new timelines that might have appeared in the archive since we
4230  * started recovery.
4231  *
4232  * If there are any, the function changes recovery target TLI to the latest
4233  * one and returns 'true'.
4234  */
4235 static bool
4236 rescanLatestTimeLine(void)
4237 {
4238         List       *newExpectedTLEs;
4239         bool            found;
4240         ListCell   *cell;
4241         TimeLineID      newtarget;
4242         TimeLineID      oldtarget = recoveryTargetTLI;
4243         TimeLineHistoryEntry *currentTle = NULL;
4244
4245         newtarget = findNewestTimeLine(recoveryTargetTLI);
4246         if (newtarget == recoveryTargetTLI)
4247         {
4248                 /* No new timelines found */
4249                 return false;
4250         }
4251
4252         /*
4253          * Determine the list of expected TLIs for the new TLI
4254          */
4255
4256         newExpectedTLEs = readTimeLineHistory(newtarget);
4257
4258         /*
4259          * If the current timeline is not part of the history of the new timeline,
4260          * we cannot proceed to it.
4261          */
4262         found = false;
4263         foreach(cell, newExpectedTLEs)
4264         {
4265                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4266
4267                 if (currentTle->tli == recoveryTargetTLI)
4268                 {
4269                         found = true;
4270                         break;
4271                 }
4272         }
4273         if (!found)
4274         {
4275                 ereport(LOG,
4276                                 (errmsg("new timeline %u is not a child of database system timeline %u",
4277                                                 newtarget,
4278                                                 ThisTimeLineID)));
4279                 return false;
4280         }
4281
4282         /*
4283          * The current timeline was found in the history file, but check that the
4284          * next timeline was forked off from it *after* the current recovery
4285          * location.
4286          */
4287         if (currentTle->end < EndRecPtr)
4288         {
4289                 ereport(LOG,
4290                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4291                                                 newtarget,
4292                                                 ThisTimeLineID,
4293                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4294                 return false;
4295         }
4296
4297         /* The new timeline history seems valid. Switch target */
4298         recoveryTargetTLI = newtarget;
4299         list_free_deep(expectedTLEs);
4300         expectedTLEs = newExpectedTLEs;
4301
4302         /*
4303          * As in StartupXLOG(), try to ensure we have all the history files
4304          * between the old target and new target in pg_xlog.
4305          */
4306         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4307
4308         ereport(LOG,
4309                         (errmsg("new target timeline is %u",
4310                                         recoveryTargetTLI)));
4311
4312         return true;
4313 }
4314
4315 /*
4316  * I/O routines for pg_control
4317  *
4318  * *ControlFile is a buffer in shared memory that holds an image of the
4319  * contents of pg_control.      WriteControlFile() initializes pg_control
4320  * given a preloaded buffer, ReadControlFile() loads the buffer from
4321  * the pg_control file (during postmaster or standalone-backend startup),
4322  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4323  *
4324  * For simplicity, WriteControlFile() initializes the fields of pg_control
4325  * that are related to checking backend/database compatibility, and
4326  * ReadControlFile() verifies they are correct.  We could split out the
4327  * I/O and compatibility-check functions, but there seems no need currently.
4328  */
4329 static void
4330 WriteControlFile(void)
4331 {
4332         int                     fd;
4333         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4334
4335         /*
4336          * Initialize version and compatibility-check fields
4337          */
4338         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4339         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4340
4341         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4342         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4343
4344         ControlFile->blcksz = BLCKSZ;
4345         ControlFile->relseg_size = RELSEG_SIZE;
4346         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4347         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4348
4349         ControlFile->nameDataLen = NAMEDATALEN;
4350         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4351
4352         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4353
4354 #ifdef HAVE_INT64_TIMESTAMP
4355         ControlFile->enableIntTimes = true;
4356 #else
4357         ControlFile->enableIntTimes = false;
4358 #endif
4359         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4360         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4361
4362         /* Contents are protected with a CRC */
4363         INIT_CRC32(ControlFile->crc);
4364         COMP_CRC32(ControlFile->crc,
4365                            (char *) ControlFile,
4366                            offsetof(ControlFileData, crc));
4367         FIN_CRC32(ControlFile->crc);
4368
4369         /*
4370          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4371          * excess over sizeof(ControlFileData).  This reduces the odds of
4372          * premature-EOF errors when reading pg_control.  We'll still fail when we
4373          * check the contents of the file, but hopefully with a more specific
4374          * error than "couldn't read pg_control".
4375          */
4376         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4377                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4378
4379         memset(buffer, 0, PG_CONTROL_SIZE);
4380         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4381
4382         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4383                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4384                                            S_IRUSR | S_IWUSR);
4385         if (fd < 0)
4386                 ereport(PANIC,
4387                                 (errcode_for_file_access(),
4388                                  errmsg("could not create control file \"%s\": %m",
4389                                                 XLOG_CONTROL_FILE)));
4390
4391         errno = 0;
4392         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4393         {
4394                 /* if write didn't set errno, assume problem is no disk space */
4395                 if (errno == 0)
4396                         errno = ENOSPC;
4397                 ereport(PANIC,
4398                                 (errcode_for_file_access(),
4399                                  errmsg("could not write to control file: %m")));
4400         }
4401
4402         if (pg_fsync(fd) != 0)
4403                 ereport(PANIC,
4404                                 (errcode_for_file_access(),
4405                                  errmsg("could not fsync control file: %m")));
4406
4407         if (close(fd))
4408                 ereport(PANIC,
4409                                 (errcode_for_file_access(),
4410                                  errmsg("could not close control file: %m")));
4411 }
4412
4413 static void
4414 ReadControlFile(void)
4415 {
4416         pg_crc32        crc;
4417         int                     fd;
4418
4419         /*
4420          * Read data...
4421          */
4422         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4423                                            O_RDWR | PG_BINARY,
4424                                            S_IRUSR | S_IWUSR);
4425         if (fd < 0)
4426                 ereport(PANIC,
4427                                 (errcode_for_file_access(),
4428                                  errmsg("could not open control file \"%s\": %m",
4429                                                 XLOG_CONTROL_FILE)));
4430
4431         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4432                 ereport(PANIC,
4433                                 (errcode_for_file_access(),
4434                                  errmsg("could not read from control file: %m")));
4435
4436         close(fd);
4437
4438         /*
4439          * Check for expected pg_control format version.  If this is wrong, the
4440          * CRC check will likely fail because we'll be checking the wrong number
4441          * of bytes.  Complaining about wrong version will probably be more
4442          * enlightening than complaining about wrong CRC.
4443          */
4444
4445         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4446                 ereport(FATAL,
4447                                 (errmsg("database files are incompatible with server"),
4448                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4449                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4450                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4451                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4452                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4453
4454         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4455                 ereport(FATAL,
4456                                 (errmsg("database files are incompatible with server"),
4457                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4458                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4459                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4460                                  errhint("It looks like you need to initdb.")));
4461
4462         /* Now check the CRC. */
4463         INIT_CRC32(crc);
4464         COMP_CRC32(crc,
4465                            (char *) ControlFile,
4466                            offsetof(ControlFileData, crc));
4467         FIN_CRC32(crc);
4468
4469         if (!EQ_CRC32(crc, ControlFile->crc))
4470                 ereport(FATAL,
4471                                 (errmsg("incorrect checksum in control file")));
4472
4473         /*
4474          * Do compatibility checking immediately.  If the database isn't
4475          * compatible with the backend executable, we want to abort before we can
4476          * possibly do any damage.
4477          */
4478         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4479                 ereport(FATAL,
4480                                 (errmsg("database files are incompatible with server"),
4481                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4482                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4483                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4484                                  errhint("It looks like you need to initdb.")));
4485         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4486                 ereport(FATAL,
4487                                 (errmsg("database files are incompatible with server"),
4488                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4489                                          " but the server was compiled with MAXALIGN %d.",
4490                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4491                                  errhint("It looks like you need to initdb.")));
4492         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4493                 ereport(FATAL,
4494                                 (errmsg("database files are incompatible with server"),
4495                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4496                                  errhint("It looks like you need to initdb.")));
4497         if (ControlFile->blcksz != BLCKSZ)
4498                 ereport(FATAL,
4499                                 (errmsg("database files are incompatible with server"),
4500                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4501                                            " but the server was compiled with BLCKSZ %d.",
4502                                            ControlFile->blcksz, BLCKSZ),
4503                                  errhint("It looks like you need to recompile or initdb.")));
4504         if (ControlFile->relseg_size != RELSEG_SIZE)
4505                 ereport(FATAL,
4506                                 (errmsg("database files are incompatible with server"),
4507                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4508                                   " but the server was compiled with RELSEG_SIZE %d.",
4509                                   ControlFile->relseg_size, RELSEG_SIZE),
4510                                  errhint("It looks like you need to recompile or initdb.")));
4511         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4512                 ereport(FATAL,
4513                                 (errmsg("database files are incompatible with server"),
4514                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4515                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4516                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4517                                  errhint("It looks like you need to recompile or initdb.")));
4518         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4519                 ereport(FATAL,
4520                                 (errmsg("database files are incompatible with server"),
4521                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4522                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4523                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4524                                  errhint("It looks like you need to recompile or initdb.")));
4525         if (ControlFile->nameDataLen != NAMEDATALEN)
4526                 ereport(FATAL,
4527                                 (errmsg("database files are incompatible with server"),
4528                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4529                                   " but the server was compiled with NAMEDATALEN %d.",
4530                                   ControlFile->nameDataLen, NAMEDATALEN),
4531                                  errhint("It looks like you need to recompile or initdb.")));
4532         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4533                 ereport(FATAL,
4534                                 (errmsg("database files are incompatible with server"),
4535                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4536                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4537                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4538                                  errhint("It looks like you need to recompile or initdb.")));
4539         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4540                 ereport(FATAL,
4541                                 (errmsg("database files are incompatible with server"),
4542                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4543                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4544                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4545                                  errhint("It looks like you need to recompile or initdb.")));
4546
4547 #ifdef HAVE_INT64_TIMESTAMP
4548         if (ControlFile->enableIntTimes != true)
4549                 ereport(FATAL,
4550                                 (errmsg("database files are incompatible with server"),
4551                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4552                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4553                                  errhint("It looks like you need to recompile or initdb.")));
4554 #else
4555         if (ControlFile->enableIntTimes != false)
4556                 ereport(FATAL,
4557                                 (errmsg("database files are incompatible with server"),
4558                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4559                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4560                                  errhint("It looks like you need to recompile or initdb.")));
4561 #endif
4562
4563 #ifdef USE_FLOAT4_BYVAL
4564         if (ControlFile->float4ByVal != true)
4565                 ereport(FATAL,
4566                                 (errmsg("database files are incompatible with server"),
4567                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4568                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4569                                  errhint("It looks like you need to recompile or initdb.")));
4570 #else
4571         if (ControlFile->float4ByVal != false)
4572                 ereport(FATAL,
4573                                 (errmsg("database files are incompatible with server"),
4574                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4575                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4576                                  errhint("It looks like you need to recompile or initdb.")));
4577 #endif
4578
4579 #ifdef USE_FLOAT8_BYVAL
4580         if (ControlFile->float8ByVal != true)
4581                 ereport(FATAL,
4582                                 (errmsg("database files are incompatible with server"),
4583                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4584                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4585                                  errhint("It looks like you need to recompile or initdb.")));
4586 #else
4587         if (ControlFile->float8ByVal != false)
4588                 ereport(FATAL,
4589                                 (errmsg("database files are incompatible with server"),
4590                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4591                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4592                                  errhint("It looks like you need to recompile or initdb.")));
4593 #endif
4594
4595         /* Make the initdb settings visible as GUC variables, too */
4596         SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4597                                         PGC_INTERNAL, PGC_S_OVERRIDE);
4598 }
4599
4600 void
4601 UpdateControlFile(void)
4602 {
4603         int                     fd;
4604
4605         INIT_CRC32(ControlFile->crc);
4606         COMP_CRC32(ControlFile->crc,
4607                            (char *) ControlFile,
4608                            offsetof(ControlFileData, crc));
4609         FIN_CRC32(ControlFile->crc);
4610
4611         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4612                                            O_RDWR | PG_BINARY,
4613                                            S_IRUSR | S_IWUSR);
4614         if (fd < 0)
4615                 ereport(PANIC,
4616                                 (errcode_for_file_access(),
4617                                  errmsg("could not open control file \"%s\": %m",
4618                                                 XLOG_CONTROL_FILE)));
4619
4620         errno = 0;
4621         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4622         {
4623                 /* if write didn't set errno, assume problem is no disk space */
4624                 if (errno == 0)
4625                         errno = ENOSPC;
4626                 ereport(PANIC,
4627                                 (errcode_for_file_access(),
4628                                  errmsg("could not write to control file: %m")));
4629         }
4630
4631         if (pg_fsync(fd) != 0)
4632                 ereport(PANIC,
4633                                 (errcode_for_file_access(),
4634                                  errmsg("could not fsync control file: %m")));
4635
4636         if (close(fd))
4637                 ereport(PANIC,
4638                                 (errcode_for_file_access(),
4639                                  errmsg("could not close control file: %m")));
4640 }
4641
4642 /*
4643  * Returns the unique system identifier from control file.
4644  */
4645 uint64
4646 GetSystemIdentifier(void)
4647 {
4648         Assert(ControlFile != NULL);
4649         return ControlFile->system_identifier;
4650 }
4651
4652 /*
4653  * Are checksums enabled for data pages?
4654  */
4655 bool
4656 DataChecksumsEnabled(void)
4657 {
4658         Assert(ControlFile != NULL);
4659         return (ControlFile->data_checksum_version > 0);
4660 }
4661
4662 /*
4663  * Returns a fake LSN for unlogged relations.
4664  *
4665  * Each call generates an LSN that is greater than any previous value
4666  * returned. The current counter value is saved and restored across clean
4667  * shutdowns, but like unlogged relations, does not survive a crash. This can
4668  * be used in lieu of real LSN values returned by XLogInsert, if you need an
4669  * LSN-like increasing sequence of numbers without writing any WAL.
4670  */
4671 XLogRecPtr
4672 GetFakeLSNForUnloggedRel(void)
4673 {
4674         XLogRecPtr      nextUnloggedLSN;
4675
4676         /* use volatile pointer to prevent code rearrangement */
4677         volatile XLogCtlData *xlogctl = XLogCtl;
4678
4679         /* increment the unloggedLSN counter, need SpinLock */
4680         SpinLockAcquire(&xlogctl->ulsn_lck);
4681         nextUnloggedLSN = xlogctl->unloggedLSN++;
4682         SpinLockRelease(&xlogctl->ulsn_lck);
4683
4684         return nextUnloggedLSN;
4685 }
4686
4687 /*
4688  * Auto-tune the number of XLOG buffers.
4689  *
4690  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4691  * a maximum of one XLOG segment (there is little reason to think that more
4692  * is helpful, at least so long as we force an fsync when switching log files)
4693  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4694  * 9.1, when auto-tuning was added).
4695  *
4696  * This should not be called until NBuffers has received its final value.
4697  */
4698 static int
4699 XLOGChooseNumBuffers(void)
4700 {
4701         int                     xbuffers;
4702
4703         xbuffers = NBuffers / 32;
4704         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
4705                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
4706         if (xbuffers < 8)
4707                 xbuffers = 8;
4708         return xbuffers;
4709 }
4710
4711 /*
4712  * GUC check_hook for wal_buffers
4713  */
4714 bool
4715 check_wal_buffers(int *newval, void **extra, GucSource source)
4716 {
4717         /*
4718          * -1 indicates a request for auto-tune.
4719          */
4720         if (*newval == -1)
4721         {
4722                 /*
4723                  * If we haven't yet changed the boot_val default of -1, just let it
4724                  * be.  We'll fix it when XLOGShmemSize is called.
4725                  */
4726                 if (XLOGbuffers == -1)
4727                         return true;
4728
4729                 /* Otherwise, substitute the auto-tune value */
4730                 *newval = XLOGChooseNumBuffers();
4731         }
4732
4733         /*
4734          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4735          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4736          * the case, we just silently treat such values as a request for the
4737          * minimum.  (We could throw an error instead, but that doesn't seem very
4738          * helpful.)
4739          */
4740         if (*newval < 4)
4741                 *newval = 4;
4742
4743         return true;
4744 }
4745
4746 /*
4747  * Initialization of shared memory for XLOG
4748  */
4749 Size
4750 XLOGShmemSize(void)
4751 {
4752         Size            size;
4753
4754         /*
4755          * If the value of wal_buffers is -1, use the preferred auto-tune value.
4756          * This isn't an amazingly clean place to do this, but we must wait till
4757          * NBuffers has received its final value, and must do it before using the
4758          * value of XLOGbuffers to do anything important.
4759          */
4760         if (XLOGbuffers == -1)
4761         {
4762                 char            buf[32];
4763
4764                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
4765                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
4766         }
4767         Assert(XLOGbuffers > 0);
4768
4769         /* XLogCtl */
4770         size = sizeof(XLogCtlData);
4771
4772         /* WAL insertion locks, plus alignment */
4773         size = add_size(size, mul_size(sizeof(WALInsertLockPadded), num_xloginsert_locks + 1));
4774         /* xlblocks array */
4775         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4776         /* extra alignment padding for XLOG I/O buffers */
4777         size = add_size(size, XLOG_BLCKSZ);
4778         /* and the buffers themselves */
4779         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4780
4781         /*
4782          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4783          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4784          * routine again below to compute the actual allocation size.
4785          */
4786
4787         return size;
4788 }
4789
4790 void
4791 XLOGShmemInit(void)
4792 {
4793         bool            foundCFile,
4794                                 foundXLog;
4795         char       *allocptr;
4796         int                     i;
4797
4798         ControlFile = (ControlFileData *)
4799                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4800         XLogCtl = (XLogCtlData *)
4801                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4802
4803         if (foundCFile || foundXLog)
4804         {
4805                 /* both should be present or neither */
4806                 Assert(foundCFile && foundXLog);
4807                 return;
4808         }
4809         memset(XLogCtl, 0, sizeof(XLogCtlData));
4810
4811         /*
4812          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4813          * multiple of the alignment for same, so no extra alignment padding is
4814          * needed here.
4815          */
4816         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
4817         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
4818         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4819         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
4820
4821
4822         /* WAL insertion locks. Ensure they're aligned to the full padded size */
4823         allocptr += sizeof(WALInsertLockPadded) -
4824                 ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
4825         WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
4826                 (WALInsertLockPadded *) allocptr;
4827         allocptr += sizeof(WALInsertLockPadded) * num_xloginsert_locks;
4828
4829         XLogCtl->Insert.WALInsertLockTrancheId = LWLockNewTrancheId();
4830
4831         XLogCtl->Insert.WALInsertLockTranche.name = "WALInsertLocks";
4832         XLogCtl->Insert.WALInsertLockTranche.array_base = WALInsertLocks;
4833         XLogCtl->Insert.WALInsertLockTranche.array_stride = sizeof(WALInsertLockPadded);
4834
4835         LWLockRegisterTranche(XLogCtl->Insert.WALInsertLockTrancheId, &XLogCtl->Insert.WALInsertLockTranche);
4836         for (i = 0; i < num_xloginsert_locks; i++)
4837         {
4838                 LWLockInitialize(&WALInsertLocks[i].l.lock,
4839                                                  XLogCtl->Insert.WALInsertLockTrancheId);
4840                 WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
4841         }
4842
4843         /*
4844          * Align the start of the page buffers to a full xlog block size boundary.
4845          * This simplifies some calculations in XLOG insertion. It is also required
4846          * for O_DIRECT.
4847          */
4848         allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
4849         XLogCtl->pages = allocptr;
4850         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
4851
4852         /*
4853          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
4854          * in additional info.)
4855          */
4856         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4857         XLogCtl->SharedRecoveryInProgress = true;
4858         XLogCtl->SharedHotStandbyActive = false;
4859         XLogCtl->WalWriterSleeping = false;
4860
4861         SpinLockInit(&XLogCtl->Insert.insertpos_lck);
4862         SpinLockInit(&XLogCtl->info_lck);
4863         SpinLockInit(&XLogCtl->ulsn_lck);
4864         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
4865
4866         /*
4867          * If we are not in bootstrap mode, pg_control should already exist. Read
4868          * and validate it immediately (see comments in ReadControlFile() for the
4869          * reasons why).
4870          */
4871         if (!IsBootstrapProcessingMode())
4872                 ReadControlFile();
4873 }
4874
4875 /*
4876  * This func must be called ONCE on system install.  It creates pg_control
4877  * and the initial XLOG segment.
4878  */
4879 void
4880 BootStrapXLOG(void)
4881 {
4882         CheckPoint      checkPoint;
4883         char       *buffer;
4884         XLogPageHeader page;
4885         XLogLongPageHeader longpage;
4886         XLogRecord *record;
4887         bool            use_existent;
4888         uint64          sysidentifier;
4889         struct timeval tv;
4890         pg_crc32        crc;
4891
4892         /*
4893          * Select a hopefully-unique system identifier code for this installation.
4894          * We use the result of gettimeofday(), including the fractional seconds
4895          * field, as being about as unique as we can easily get.  (Think not to
4896          * use random(), since it hasn't been seeded and there's no portable way
4897          * to seed it other than the system clock value...)  The upper half of the
4898          * uint64 value is just the tv_sec part, while the lower half is the XOR
4899          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
4900          * unnecessarily if "uint64" is really only 32 bits wide.  A person
4901          * knowing this encoding can determine the initialization time of the
4902          * installation, which could perhaps be useful sometimes.
4903          */
4904         gettimeofday(&tv, NULL);
4905         sysidentifier = ((uint64) tv.tv_sec) << 32;
4906         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
4907
4908         /* First timeline ID is always 1 */
4909         ThisTimeLineID = 1;
4910
4911         /* page buffer must be aligned suitably for O_DIRECT */
4912         buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
4913         page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
4914         memset(page, 0, XLOG_BLCKSZ);
4915
4916         /*
4917          * Set up information for the initial checkpoint record
4918          *
4919          * The initial checkpoint record is written to the beginning of the WAL
4920          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
4921          * used, so that we can use 0/0 to mean "before any valid WAL segment".
4922          */
4923         checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
4924         checkPoint.ThisTimeLineID = ThisTimeLineID;
4925         checkPoint.PrevTimeLineID = ThisTimeLineID;
4926         checkPoint.fullPageWrites = fullPageWrites;
4927         checkPoint.nextXidEpoch = 0;
4928         checkPoint.nextXid = FirstNormalTransactionId;
4929         checkPoint.nextOid = FirstBootstrapObjectId;
4930         checkPoint.nextMulti = FirstMultiXactId;
4931         checkPoint.nextMultiOffset = 0;
4932         checkPoint.oldestXid = FirstNormalTransactionId;
4933         checkPoint.oldestXidDB = TemplateDbOid;
4934         checkPoint.oldestMulti = FirstMultiXactId;
4935         checkPoint.oldestMultiDB = TemplateDbOid;
4936         checkPoint.time = (pg_time_t) time(NULL);
4937         checkPoint.oldestActiveXid = InvalidTransactionId;
4938
4939         ShmemVariableCache->nextXid = checkPoint.nextXid;
4940         ShmemVariableCache->nextOid = checkPoint.nextOid;
4941         ShmemVariableCache->oidCount = 0;
4942         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4943         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
4944         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
4945
4946         /* Set up the XLOG page header */
4947         page->xlp_magic = XLOG_PAGE_MAGIC;
4948         page->xlp_info = XLP_LONG_HEADER;
4949         page->xlp_tli = ThisTimeLineID;
4950         page->xlp_pageaddr = XLogSegSize;
4951         longpage = (XLogLongPageHeader) page;
4952         longpage->xlp_sysid = sysidentifier;
4953         longpage->xlp_seg_size = XLogSegSize;
4954         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4955
4956         /* Insert the initial checkpoint record */
4957         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
4958         record->xl_prev = 0;
4959         record->xl_xid = InvalidTransactionId;
4960         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
4961         record->xl_len = sizeof(checkPoint);
4962         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4963         record->xl_rmid = RM_XLOG_ID;
4964         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
4965
4966         INIT_CRC32(crc);
4967         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
4968         COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
4969         FIN_CRC32(crc);
4970         record->xl_crc = crc;
4971
4972         /* Create first XLOG segment file */
4973         use_existent = false;
4974         openLogFile = XLogFileInit(1, &use_existent, false);
4975
4976         /* Write the first page with the initial record */
4977         errno = 0;
4978         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4979         {
4980                 /* if write didn't set errno, assume problem is no disk space */
4981                 if (errno == 0)
4982                         errno = ENOSPC;
4983                 ereport(PANIC,
4984                                 (errcode_for_file_access(),
4985                           errmsg("could not write bootstrap transaction log file: %m")));
4986         }
4987
4988         if (pg_fsync(openLogFile) != 0)
4989                 ereport(PANIC,
4990                                 (errcode_for_file_access(),
4991                           errmsg("could not fsync bootstrap transaction log file: %m")));
4992
4993         if (close(openLogFile))
4994                 ereport(PANIC,
4995                                 (errcode_for_file_access(),
4996                           errmsg("could not close bootstrap transaction log file: %m")));
4997
4998         openLogFile = -1;
4999
5000         /* Now create pg_control */
5001
5002         memset(ControlFile, 0, sizeof(ControlFileData));
5003         /* Initialize pg_control status fields */
5004         ControlFile->system_identifier = sysidentifier;
5005         ControlFile->state = DB_SHUTDOWNED;
5006         ControlFile->time = checkPoint.time;
5007         ControlFile->checkPoint = checkPoint.redo;
5008         ControlFile->checkPointCopy = checkPoint;
5009         ControlFile->unloggedLSN = 1;
5010
5011         /* Set important parameter values for use when replaying WAL */
5012         ControlFile->MaxConnections = MaxConnections;
5013         ControlFile->max_worker_processes = max_worker_processes;
5014         ControlFile->max_prepared_xacts = max_prepared_xacts;
5015         ControlFile->max_locks_per_xact = max_locks_per_xact;
5016         ControlFile->wal_level = wal_level;
5017         ControlFile->wal_log_hints = wal_log_hints;
5018         ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5019
5020         /* some additional ControlFile fields are set in WriteControlFile() */
5021
5022         WriteControlFile();
5023
5024         /* Bootstrap the commit log, too */
5025         BootStrapCLOG();
5026         BootStrapSUBTRANS();
5027         BootStrapMultiXact();
5028
5029         pfree(buffer);
5030 }
5031
5032 static char *
5033 str_time(pg_time_t tnow)
5034 {
5035         static char buf[128];
5036
5037         pg_strftime(buf, sizeof(buf),
5038                                 "%Y-%m-%d %H:%M:%S %Z",
5039                                 pg_localtime(&tnow, log_timezone));
5040
5041         return buf;
5042 }
5043
5044 /*
5045  * See if there is a recovery command file (recovery.conf), and if so
5046  * read in parameters for archive recovery and XLOG streaming.
5047  *
5048  * The file is parsed using the main configuration parser.
5049  */
5050 static void
5051 readRecoveryCommandFile(void)
5052 {
5053         FILE       *fd;
5054         TimeLineID      rtli = 0;
5055         bool            rtliGiven = false;
5056         ConfigVariable *item,
5057                            *head = NULL,
5058                            *tail = NULL;
5059
5060         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5061         if (fd == NULL)
5062         {
5063                 if (errno == ENOENT)
5064                         return;                         /* not there, so no archive recovery */
5065                 ereport(FATAL,
5066                                 (errcode_for_file_access(),
5067                                  errmsg("could not open recovery command file \"%s\": %m",
5068                                                 RECOVERY_COMMAND_FILE)));
5069         }
5070
5071         /*
5072          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5073          * no need to check the return value.
5074          */
5075         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5076
5077         FreeFile(fd);
5078
5079         for (item = head; item; item = item->next)
5080         {
5081                 if (strcmp(item->name, "restore_command") == 0)
5082                 {
5083                         recoveryRestoreCommand = pstrdup(item->value);
5084                         ereport(DEBUG2,
5085                                         (errmsg_internal("restore_command = '%s'",
5086                                                                          recoveryRestoreCommand)));
5087                 }
5088                 else if (strcmp(item->name, "recovery_end_command") == 0)
5089                 {
5090                         recoveryEndCommand = pstrdup(item->value);
5091                         ereport(DEBUG2,
5092                                         (errmsg_internal("recovery_end_command = '%s'",
5093                                                                          recoveryEndCommand)));
5094                 }
5095                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5096                 {
5097                         archiveCleanupCommand = pstrdup(item->value);
5098                         ereport(DEBUG2,
5099                                         (errmsg_internal("archive_cleanup_command = '%s'",
5100                                                                          archiveCleanupCommand)));
5101                 }
5102                 else if (strcmp(item->name, "pause_at_recovery_target") == 0)
5103                 {
5104                         if (!parse_bool(item->value, &recoveryPauseAtTarget))
5105                                 ereport(ERROR,
5106                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5107                                                  errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
5108                         ereport(DEBUG2,
5109                                         (errmsg_internal("pause_at_recovery_target = '%s'",
5110                                                                          item->value)));
5111                 }
5112                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5113                 {
5114                         rtliGiven = true;
5115                         if (strcmp(item->value, "latest") == 0)
5116                                 rtli = 0;
5117                         else
5118                         {
5119                                 errno = 0;
5120                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5121                                 if (errno == EINVAL || errno == ERANGE)
5122                                         ereport(FATAL,
5123                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5124                                                                         item->value)));
5125                         }
5126                         if (rtli)
5127                                 ereport(DEBUG2,
5128                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
5129                         else
5130                                 ereport(DEBUG2,
5131                                          (errmsg_internal("recovery_target_timeline = latest")));
5132                 }
5133                 else if (strcmp(item->name, "recovery_target_xid") == 0)
5134                 {
5135                         errno = 0;
5136                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5137                         if (errno == EINVAL || errno == ERANGE)
5138                                 ereport(FATAL,
5139                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
5140                                                  item->value)));
5141                         ereport(DEBUG2,
5142                                         (errmsg_internal("recovery_target_xid = %u",
5143                                                                          recoveryTargetXid)));
5144                         recoveryTarget = RECOVERY_TARGET_XID;
5145                 }
5146                 else if (strcmp(item->name, "recovery_target_time") == 0)
5147                 {
5148                         recoveryTarget = RECOVERY_TARGET_TIME;
5149
5150                         /*
5151                          * Convert the time string given by the user to TimestampTz form.
5152                          */
5153                         recoveryTargetTime =
5154                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5155                                                                                                 CStringGetDatum(item->value),
5156                                                                                                 ObjectIdGetDatum(InvalidOid),
5157                                                                                                                 Int32GetDatum(-1)));
5158                         ereport(DEBUG2,
5159                                         (errmsg_internal("recovery_target_time = '%s'",
5160                                                                    timestamptz_to_str(recoveryTargetTime))));
5161                 }
5162                 else if (strcmp(item->name, "recovery_target_name") == 0)
5163                 {
5164                         recoveryTarget = RECOVERY_TARGET_NAME;
5165
5166                         recoveryTargetName = pstrdup(item->value);
5167                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5168                                 ereport(FATAL,
5169                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5170                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
5171                                                                 MAXFNAMELEN - 1)));
5172
5173                         ereport(DEBUG2,
5174                                         (errmsg_internal("recovery_target_name = '%s'",
5175                                                                          recoveryTargetName)));
5176                 }
5177                 else if (strcmp(item->name, "recovery_target") == 0)
5178                 {
5179                         if (strcmp(item->value, "immediate") == 0)
5180                                 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
5181                         else
5182                                 ereport(ERROR,
5183                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5184                                                  errmsg("invalid recovery_target parameter"),
5185                                                  errhint("The only allowed value is 'immediate'")));
5186                         ereport(DEBUG2,
5187                                         (errmsg_internal("recovery_target = '%s'",
5188                                                                          item->value)));
5189                 }
5190                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5191                 {
5192                         /*
5193                          * does nothing if a recovery_target is not also set
5194                          */
5195                         if (!parse_bool(item->value, &recoveryTargetInclusive))
5196                                 ereport(ERROR,
5197                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5198                                                  errmsg("parameter \"%s\" requires a Boolean value",
5199                                                                 "recovery_target_inclusive")));
5200                         ereport(DEBUG2,
5201                                         (errmsg_internal("recovery_target_inclusive = %s",
5202                                                                          item->value)));
5203                 }
5204                 else if (strcmp(item->name, "standby_mode") == 0)
5205                 {
5206                         if (!parse_bool(item->value, &StandbyModeRequested))
5207                                 ereport(ERROR,
5208                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5209                                                  errmsg("parameter \"%s\" requires a Boolean value",
5210                                                                 "standby_mode")));
5211                         ereport(DEBUG2,
5212                                         (errmsg_internal("standby_mode = '%s'", item->value)));
5213                 }
5214                 else if (strcmp(item->name, "primary_conninfo") == 0)
5215                 {
5216                         PrimaryConnInfo = pstrdup(item->value);
5217                         ereport(DEBUG2,
5218                                         (errmsg_internal("primary_conninfo = '%s'",
5219                                                                          PrimaryConnInfo)));
5220                 }
5221                 else if (strcmp(item->name, "primary_slotname") == 0)
5222                 {
5223                         ReplicationSlotValidateName(item->value, ERROR);
5224                         PrimarySlotName = pstrdup(item->value);
5225                         ereport(DEBUG2,
5226                                         (errmsg_internal("primary_slotname = '%s'",
5227                                                                          PrimarySlotName)));
5228                 }
5229                 else if (strcmp(item->name, "trigger_file") == 0)
5230                 {
5231                         TriggerFile = pstrdup(item->value);
5232                         ereport(DEBUG2,
5233                                         (errmsg_internal("trigger_file = '%s'",
5234                                                                          TriggerFile)));
5235                 }
5236                 else if (strcmp(item->name, "min_recovery_apply_delay") == 0)
5237                 {
5238                         const char *hintmsg;
5239
5240                         if (!parse_int(item->value, &min_recovery_apply_delay, GUC_UNIT_MS,
5241                                         &hintmsg))
5242                                 ereport(ERROR,
5243                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5244                                                  errmsg("parameter \"%s\" requires a temporal value", "min_recovery_apply_delay"),
5245                                                  hintmsg ? errhint("%s", _(hintmsg)) : 0));
5246                         ereport(DEBUG2,
5247                                         (errmsg("min_recovery_apply_delay = '%s'", item->value)));
5248                 }
5249                 else
5250                         ereport(FATAL,
5251                                         (errmsg("unrecognized recovery parameter \"%s\"",
5252                                                         item->name)));
5253         }
5254
5255         /*
5256          * Check for compulsory parameters
5257          */
5258         if (StandbyModeRequested)
5259         {
5260                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5261                         ereport(WARNING,
5262                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5263                                                         RECOVERY_COMMAND_FILE),
5264                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
5265         }
5266         else
5267         {
5268                 if (recoveryRestoreCommand == NULL)
5269                         ereport(FATAL,
5270                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5271                                                         RECOVERY_COMMAND_FILE)));
5272         }
5273
5274         /* Enable fetching from archive recovery area */
5275         ArchiveRecoveryRequested = true;
5276
5277         /*
5278          * If user specified recovery_target_timeline, validate it or compute the
5279          * "latest" value.      We can't do this until after we've gotten the restore
5280          * command and set InArchiveRecovery, because we need to fetch timeline
5281          * history files from the archive.
5282          */
5283         if (rtliGiven)
5284         {
5285                 if (rtli)
5286                 {
5287                         /* Timeline 1 does not have a history file, all else should */
5288                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5289                                 ereport(FATAL,
5290                                                 (errmsg("recovery target timeline %u does not exist",
5291                                                                 rtli)));
5292                         recoveryTargetTLI = rtli;
5293                         recoveryTargetIsLatest = false;
5294                 }
5295                 else
5296                 {
5297                         /* We start the "latest" search from pg_control's timeline */
5298                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5299                         recoveryTargetIsLatest = true;
5300                 }
5301         }
5302
5303         FreeConfigVariables(head);
5304 }
5305
5306 /*
5307  * Exit archive-recovery state
5308  */
5309 static void
5310 exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
5311 {
5312         char            recoveryPath[MAXPGPATH];
5313         char            xlogpath[MAXPGPATH];
5314
5315         /*
5316          * We are no longer in archive recovery state.
5317          */
5318         InArchiveRecovery = false;
5319
5320         /*
5321          * Update min recovery point one last time.
5322          */
5323         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5324
5325         /*
5326          * If the ending log segment is still open, close it (to avoid problems on
5327          * Windows with trying to rename or delete an open file).
5328          */
5329         if (readFile >= 0)
5330         {
5331                 close(readFile);
5332                 readFile = -1;
5333         }
5334
5335         /*
5336          * If we are establishing a new timeline, we have to copy data from the
5337          * last WAL segment of the old timeline to create a starting WAL segment
5338          * for the new timeline.
5339          *
5340          * Notify the archiver that the last WAL segment of the old timeline is
5341          * ready to copy to archival storage. Otherwise, it is not archived for a
5342          * while.
5343          */
5344         if (endTLI != ThisTimeLineID)
5345         {
5346                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo);
5347
5348                 if (XLogArchivingActive())
5349                 {
5350                         XLogFileName(xlogpath, endTLI, endLogSegNo);
5351                         XLogArchiveNotify(xlogpath);
5352                 }
5353         }
5354
5355         /*
5356          * Let's just make real sure there are not .ready or .done flags posted
5357          * for the new segment.
5358          */
5359         XLogFileName(xlogpath, ThisTimeLineID, endLogSegNo);
5360         XLogArchiveCleanup(xlogpath);
5361
5362         /*
5363          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5364          * of it.
5365          */
5366         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5367         unlink(recoveryPath);           /* ignore any error */
5368
5369         /* Get rid of any remaining recovered timeline-history file, too */
5370         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5371         unlink(recoveryPath);           /* ignore any error */
5372
5373         /*
5374          * Rename the config file out of the way, so that we don't accidentally
5375          * re-enter archive recovery mode in a subsequent crash.
5376          */
5377         unlink(RECOVERY_COMMAND_DONE);
5378         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5379                 ereport(FATAL,
5380                                 (errcode_for_file_access(),
5381                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5382                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5383
5384         ereport(LOG,
5385                         (errmsg("archive recovery complete")));
5386 }
5387
5388 /*
5389  * Extract timestamp from WAL record.
5390  *
5391  * If the record contains a timestamp, returns true, and saves the timestamp
5392  * in *recordXtime. If the record type has no timestamp, returns false.
5393  * Currently, only transaction commit/abort records and restore points contain
5394  * timestamps.
5395  */
5396 static bool
5397 getRecordTimestamp(XLogRecord *record, TimestampTz *recordXtime)
5398 {
5399         uint8           record_info = record->xl_info & ~XLR_INFO_MASK;
5400
5401         if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5402         {
5403                 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5404                 return true;
5405         }
5406         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
5407         {
5408                 *recordXtime = ((xl_xact_commit_compact *) XLogRecGetData(record))->xact_time;
5409                 return true;
5410         }
5411         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
5412         {
5413                 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5414                 return true;
5415         }
5416         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
5417         {
5418                 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5419                 return true;
5420         }
5421         return false;
5422 }
5423
5424 /*
5425  * For point-in-time recovery, this function decides whether we want to
5426  * stop applying the XLOG before the current record.
5427  *
5428  * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
5429  * information is saved in recoveryStopXid et al for use in annotating the
5430  * new timeline's history file.
5431  */
5432 static bool
5433 recoveryStopsBefore(XLogRecord *record)
5434 {
5435         bool            stopsHere = false;
5436         uint8           record_info;
5437         bool            isCommit;
5438         TimestampTz recordXtime = 0;
5439
5440         /* Check if we should stop as soon as reaching consistency */
5441         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5442         {
5443                 ereport(LOG,
5444                                 (errmsg("recovery stopping after reaching consistency")));
5445
5446                 recoveryStopAfter = false;
5447                 recoveryStopXid = InvalidTransactionId;
5448                 recoveryStopTime = 0;
5449                 recoveryStopName[0] = '\0';
5450                 return true;
5451         }
5452
5453         /* Otherwise we only consider stopping before COMMIT or ABORT records. */
5454         if (record->xl_rmid != RM_XACT_ID)
5455                 return false;
5456         record_info = record->xl_info & ~XLR_INFO_MASK;
5457         if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5458                 isCommit = true;
5459         else if (record_info == XLOG_XACT_ABORT)
5460                 isCommit = false;
5461         else
5462                 return false;
5463
5464         if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
5465         {
5466                 /*
5467                  * There can be only one transaction end record with this exact
5468                  * transactionid
5469                  *
5470                  * when testing for an xid, we MUST test for equality only, since
5471                  * transactions are numbered in the order they start, not the order
5472                  * they complete. A higher numbered xid will complete before you
5473                  * about 50% of the time...
5474                  */
5475                 stopsHere = (record->xl_xid == recoveryTargetXid);
5476         }
5477
5478         if (recoveryTarget == RECOVERY_TARGET_TIME &&
5479                 getRecordTimestamp(record, &recordXtime))
5480         {
5481                 /*
5482                  * There can be many transactions that share the same commit time, so
5483                  * we stop after the last one, if we are inclusive, or stop at the
5484                  * first one if we are exclusive
5485                  */
5486                 if (recoveryTargetInclusive)
5487                         stopsHere = (recordXtime > recoveryTargetTime);
5488                 else
5489                         stopsHere = (recordXtime >= recoveryTargetTime);
5490         }
5491
5492         if (stopsHere)
5493         {
5494                 recoveryStopAfter = false;
5495                 recoveryStopXid = record->xl_xid;
5496                 recoveryStopTime = recordXtime;
5497                 recoveryStopName[0] = '\0';
5498
5499                 if (isCommit)
5500                 {
5501                         ereport(LOG,
5502                                         (errmsg("recovery stopping before commit of transaction %u, time %s",
5503                                                         recoveryStopXid,
5504                                                         timestamptz_to_str(recoveryStopTime))));
5505                 }
5506                 else
5507                 {
5508                         ereport(LOG,
5509                                         (errmsg("recovery stopping before abort of transaction %u, time %s",
5510                                                         recoveryStopXid,
5511                                                         timestamptz_to_str(recoveryStopTime))));
5512                 }
5513         }
5514
5515         return stopsHere;
5516 }
5517
5518 /*
5519  * Same as recoveryStopsBefore, but called after applying the record.
5520  *
5521  * We also track the timestamp of the latest applied COMMIT/ABORT
5522  * record in XLogCtl->recoveryLastXTime.
5523  */
5524 static bool
5525 recoveryStopsAfter(XLogRecord *record)
5526 {
5527         uint8           record_info;
5528         TimestampTz recordXtime;
5529
5530         record_info = record->xl_info & ~XLR_INFO_MASK;
5531
5532         /*
5533          * There can be many restore points that share the same name; we stop
5534          * at the first one.
5535          */
5536         if (recoveryTarget == RECOVERY_TARGET_NAME &&
5537                 record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5538         {
5539                 xl_restore_point *recordRestorePointData;
5540
5541                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5542
5543                 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5544                 {
5545                         recoveryStopAfter = true;
5546                         recoveryStopXid = InvalidTransactionId;
5547                         (void) getRecordTimestamp(record, &recoveryStopTime);
5548                         strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5549
5550                         ereport(LOG,
5551                                         (errmsg("recovery stopping at restore point \"%s\", time %s",
5552                                                         recoveryStopName,
5553                                                         timestamptz_to_str(recoveryStopTime))));
5554                         return true;
5555                 }
5556         }
5557
5558         if (record->xl_rmid == RM_XACT_ID &&
5559                 (record_info == XLOG_XACT_COMMIT_COMPACT ||
5560                  record_info == XLOG_XACT_COMMIT ||
5561                  record_info == XLOG_XACT_ABORT))
5562         {
5563                 /* Update the last applied transaction timestamp */
5564                 if (getRecordTimestamp(record, &recordXtime))
5565                         SetLatestXTime(recordXtime);
5566
5567                 /*
5568                  * There can be only one transaction end record with this exact
5569                  * transactionid
5570                  *
5571                  * when testing for an xid, we MUST test for equality only, since
5572                  * transactions are numbered in the order they start, not the order
5573                  * they complete. A higher numbered xid will complete before you about
5574                  * 50% of the time...
5575                  */
5576                 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
5577                         record->xl_xid == recoveryTargetXid)
5578                 {
5579                         recoveryStopAfter = true;
5580                         recoveryStopXid = record->xl_xid;
5581                         recoveryStopTime = recordXtime;
5582                         recoveryStopName[0] = '\0';
5583
5584                         if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5585                         {
5586                                 ereport(LOG,
5587                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5588                                                                 recoveryStopXid,
5589                                                                 timestamptz_to_str(recoveryStopTime))));
5590                         }
5591                         else if (record_info == XLOG_XACT_ABORT)
5592                         {
5593                                 ereport(LOG,
5594                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5595                                                                 recoveryStopXid,
5596                                                                 timestamptz_to_str(recoveryStopTime))));
5597                         }
5598                         return true;
5599                 }
5600         }
5601
5602         /* Check if we should stop as soon as reaching consistency */
5603         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5604         {
5605                 ereport(LOG,
5606                                 (errmsg("recovery stopping after reaching consistency")));
5607
5608                 recoveryStopAfter = true;
5609                 recoveryStopXid = InvalidTransactionId;
5610                 recoveryStopTime = 0;
5611                 recoveryStopName[0] = '\0';
5612                 return true;
5613         }
5614
5615         return false;
5616 }
5617
5618 /*
5619  * Wait until shared recoveryPause flag is cleared.
5620  *
5621  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5622  * Probably not worth the trouble though.  This state shouldn't be one that
5623  * anyone cares about server power consumption in.
5624  */
5625 static void
5626 recoveryPausesHere(void)
5627 {
5628         /* Don't pause unless users can connect! */
5629         if (!LocalHotStandbyActive)
5630                 return;
5631
5632         ereport(LOG,
5633                         (errmsg("recovery has paused"),
5634                          errhint("Execute pg_xlog_replay_resume() to continue.")));
5635
5636         while (RecoveryIsPaused())
5637         {
5638                 pg_usleep(1000000L);    /* 1000 ms */
5639                 HandleStartupProcInterrupts();
5640         }
5641 }
5642
5643 bool
5644 RecoveryIsPaused(void)
5645 {
5646         /* use volatile pointer to prevent code rearrangement */
5647         volatile XLogCtlData *xlogctl = XLogCtl;
5648         bool            recoveryPause;
5649
5650         SpinLockAcquire(&xlogctl->info_lck);
5651         recoveryPause = xlogctl->recoveryPause;
5652         SpinLockRelease(&xlogctl->info_lck);
5653
5654         return recoveryPause;
5655 }
5656
5657 void
5658 SetRecoveryPause(bool recoveryPause)
5659 {
5660         /* use volatile pointer to prevent code rearrangement */
5661         volatile XLogCtlData *xlogctl = XLogCtl;
5662
5663         SpinLockAcquire(&xlogctl->info_lck);
5664         xlogctl->recoveryPause = recoveryPause;
5665         SpinLockRelease(&xlogctl->info_lck);
5666 }
5667
5668 /*
5669  * When min_recovery_apply_delay is set, we wait long enough to make sure
5670  * certain record types are applied at least that interval behind the master.
5671  *
5672  * Returns true if we waited.
5673  *
5674  * Note that the delay is calculated between the WAL record log time and
5675  * the current time on standby. We would prefer to keep track of when this
5676  * standby received each WAL record, which would allow a more consistent
5677  * approach and one not affected by time synchronisation issues, but that
5678  * is significantly more effort and complexity for little actual gain in
5679  * usability.
5680  */
5681 static bool
5682 recoveryApplyDelay(XLogRecord *record)
5683 {
5684         uint8           record_info;
5685         TimestampTz xtime;
5686         long            secs;
5687         int                     microsecs;
5688
5689         /* nothing to do if no delay configured */
5690         if (min_recovery_apply_delay == 0)
5691                 return false;
5692
5693         /*
5694          * Is it a COMMIT record?
5695          *
5696          * We deliberately choose not to delay aborts since they have no effect
5697          * on MVCC. We already allow replay of records that don't have a
5698          * timestamp, so there is already opportunity for issues caused by early
5699          * conflicts on standbys.
5700          */
5701         record_info = record->xl_info & ~XLR_INFO_MASK;
5702         if (!(record->xl_rmid == RM_XACT_ID &&
5703                   (record_info == XLOG_XACT_COMMIT_COMPACT ||
5704                    record_info == XLOG_XACT_COMMIT)))
5705                 return false;
5706
5707         if (!getRecordTimestamp(record, &xtime))
5708                 return false;
5709
5710         recoveryDelayUntilTime =
5711                 TimestampTzPlusMilliseconds(xtime, min_recovery_apply_delay);
5712
5713         /*
5714          * Exit without arming the latch if it's already past time to apply this
5715          * record
5716          */
5717         TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
5718                                                 &secs, &microsecs);
5719         if (secs <= 0 && microsecs <=0)
5720                 return false;
5721
5722         while (true)
5723         {
5724                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
5725
5726                 /* might change the trigger file's location */
5727                 HandleStartupProcInterrupts();
5728
5729                 if (CheckForStandbyTrigger())
5730                         break;
5731
5732                 /*
5733                  * Wait for difference between GetCurrentTimestamp() and
5734                  * recoveryDelayUntilTime
5735                  */
5736                 TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
5737                                                         &secs, &microsecs);
5738
5739                 if (secs <= 0 && microsecs <=0)
5740                         break;
5741
5742                 elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
5743                         secs, microsecs / 1000);
5744
5745                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
5746                                         WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
5747                                         secs * 1000L + microsecs / 1000);
5748         }
5749         return true;
5750 }
5751
5752 /*
5753  * Save timestamp of latest processed commit/abort record.
5754  *
5755  * We keep this in XLogCtl, not a simple static variable, so that it can be
5756  * seen by processes other than the startup process.  Note in particular
5757  * that CreateRestartPoint is executed in the checkpointer.
5758  */
5759 static void
5760 SetLatestXTime(TimestampTz xtime)
5761 {
5762         /* use volatile pointer to prevent code rearrangement */
5763         volatile XLogCtlData *xlogctl = XLogCtl;
5764
5765         SpinLockAcquire(&xlogctl->info_lck);
5766         xlogctl->recoveryLastXTime = xtime;
5767         SpinLockRelease(&xlogctl->info_lck);
5768 }
5769
5770 /*
5771  * Fetch timestamp of latest processed commit/abort record.
5772  */
5773 TimestampTz
5774 GetLatestXTime(void)
5775 {
5776         /* use volatile pointer to prevent code rearrangement */
5777         volatile XLogCtlData *xlogctl = XLogCtl;
5778         TimestampTz xtime;
5779
5780         SpinLockAcquire(&xlogctl->info_lck);
5781         xtime = xlogctl->recoveryLastXTime;
5782         SpinLockRelease(&xlogctl->info_lck);
5783
5784         return xtime;
5785 }
5786
5787 /*
5788  * Save timestamp of the next chunk of WAL records to apply.
5789  *
5790  * We keep this in XLogCtl, not a simple static variable, so that it can be
5791  * seen by all backends.
5792  */
5793 static void
5794 SetCurrentChunkStartTime(TimestampTz xtime)
5795 {
5796         /* use volatile pointer to prevent code rearrangement */
5797         volatile XLogCtlData *xlogctl = XLogCtl;
5798
5799         SpinLockAcquire(&xlogctl->info_lck);
5800         xlogctl->currentChunkStartTime = xtime;
5801         SpinLockRelease(&xlogctl->info_lck);
5802 }
5803
5804 /*
5805  * Fetch timestamp of latest processed commit/abort record.
5806  * Startup process maintains an accurate local copy in XLogReceiptTime
5807  */
5808 TimestampTz
5809 GetCurrentChunkReplayStartTime(void)
5810 {
5811         /* use volatile pointer to prevent code rearrangement */
5812         volatile XLogCtlData *xlogctl = XLogCtl;
5813         TimestampTz xtime;
5814
5815         SpinLockAcquire(&xlogctl->info_lck);
5816         xtime = xlogctl->currentChunkStartTime;
5817         SpinLockRelease(&xlogctl->info_lck);
5818
5819         return xtime;
5820 }
5821
5822 /*
5823  * Returns time of receipt of current chunk of XLOG data, as well as
5824  * whether it was received from streaming replication or from archives.
5825  */
5826 void
5827 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
5828 {
5829         /*
5830          * This must be executed in the startup process, since we don't export the
5831          * relevant state to shared memory.
5832          */
5833         Assert(InRecovery);
5834
5835         *rtime = XLogReceiptTime;
5836         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
5837 }
5838
/*
 * Raise an ERROR if the integer setting currValue is below minValue.
 *
 * param_name is the name of a configuration parameter, so it is passed
 * through to the message as-is and does not require translation.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
	do \
	{ \
		if ((minValue) > (currValue)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("hot standby is not possible because " \
							"%s = %d is a lower setting than on the master server " \
							"(its value was %d)", \
							(param_name), \
							(currValue), \
							(minValue)))); \
	} while (0)
5855
5856 /*
5857  * Check to see if required parameters are set high enough on this server
5858  * for various aspects of recovery operation.
5859  */
5860 static void
5861 CheckRequiredParameterValues(void)
5862 {
5863         /*
5864          * For archive recovery, the WAL must be generated with at least 'archive'
5865          * wal_level.
5866          */
5867         if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
5868         {
5869                 ereport(WARNING,
5870                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
5871                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
5872         }
5873
5874         /*
5875          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
5876          * we must have at least as many backend slots as the primary.
5877          */
5878         if (ArchiveRecoveryRequested && EnableHotStandby)
5879         {
5880                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
5881                         ereport(ERROR,
5882                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" or higher on the master server"),
5883                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
5884
5885                 /* We ignore autovacuum_max_workers when we make this test. */
5886                 RecoveryRequiresIntParameter("max_connections",
5887                                                                          MaxConnections,
5888                                                                          ControlFile->MaxConnections);
5889                 RecoveryRequiresIntParameter("max_worker_processes",
5890                                                                          max_worker_processes,
5891                                                                          ControlFile->max_worker_processes);
5892                 RecoveryRequiresIntParameter("max_prepared_transactions",
5893                                                                          max_prepared_xacts,
5894                                                                          ControlFile->max_prepared_xacts);
5895                 RecoveryRequiresIntParameter("max_locks_per_transaction",
5896                                                                          max_locks_per_xact,
5897                                                                          ControlFile->max_locks_per_xact);
5898         }
5899 }
5900
5901 /*
5902  * This must be called ONCE during postmaster or standalone-backend startup
5903  */
5904 void
5905 StartupXLOG(void)
5906 {
5907         XLogCtlInsert *Insert;
5908         CheckPoint      checkPoint;
5909         bool            wasShutdown;
5910         bool            reachedStopPoint = false;
5911         bool            haveBackupLabel = false;
5912         XLogRecPtr      RecPtr,
5913                                 checkPointLoc,
5914                                 EndOfLog;
5915         XLogSegNo       endLogSegNo;
5916         TimeLineID      PrevTimeLineID;
5917         XLogRecord *record;
5918         TransactionId oldestActiveXID;
5919         bool            backupEndRequired = false;
5920         bool            backupFromStandby = false;
5921         DBState         dbstate_at_startup;
5922         XLogReaderState *xlogreader;
5923         XLogPageReadPrivate private;
5924         bool            fast_promoted = false;
5925
5926         /*
5927          * Read control file and check XLOG status looks valid.
5928          *
5929          * Note: in most control paths, *ControlFile is already valid and we need
5930          * not do ReadControlFile() here, but might as well do it to be sure.
5931          */
5932         ReadControlFile();
5933
5934         if (ControlFile->state < DB_SHUTDOWNED ||
5935                 ControlFile->state > DB_IN_PRODUCTION ||
5936                 !XRecOffIsValid(ControlFile->checkPoint))
5937                 ereport(FATAL,
5938                                 (errmsg("control file contains invalid data")));
5939
5940         if (ControlFile->state == DB_SHUTDOWNED)
5941         {
5942                 /* This is the expected case, so don't be chatty in standalone mode */
5943                 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
5944                                 (errmsg("database system was shut down at %s",
5945                                                 str_time(ControlFile->time))));
5946         }
5947         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
5948                 ereport(LOG,
5949                                 (errmsg("database system was shut down in recovery at %s",
5950                                                 str_time(ControlFile->time))));
5951         else if (ControlFile->state == DB_SHUTDOWNING)
5952                 ereport(LOG,
5953                                 (errmsg("database system shutdown was interrupted; last known up at %s",
5954                                                 str_time(ControlFile->time))));
5955         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
5956                 ereport(LOG,
5957                    (errmsg("database system was interrupted while in recovery at %s",
5958                                    str_time(ControlFile->time)),
5959                         errhint("This probably means that some data is corrupted and"
5960                                         " you will have to use the last backup for recovery.")));
5961         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
5962                 ereport(LOG,
5963                                 (errmsg("database system was interrupted while in recovery at log time %s",
5964                                                 str_time(ControlFile->checkPointCopy.time)),
5965                                  errhint("If this has occurred more than once some data might be corrupted"
5966                           " and you might need to choose an earlier recovery target.")));
5967         else if (ControlFile->state == DB_IN_PRODUCTION)
5968                 ereport(LOG,
5969                           (errmsg("database system was interrupted; last known up at %s",
5970                                           str_time(ControlFile->time))));
5971
5972         /* This is just to allow attaching to startup process with a debugger */
5973 #ifdef XLOG_REPLAY_DELAY
5974         if (ControlFile->state != DB_SHUTDOWNED)
5975                 pg_usleep(60000000L);
5976 #endif
5977
5978         /*
5979          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
5980          * someone has performed a copy for PITR, these directories may have been
5981          * excluded and need to be re-created.
5982          */
5983         ValidateXLOGDirectoryStructure();
5984
5985         /*
5986          * Clear out any old relcache cache files.      This is *necessary* if we do
5987          * any WAL replay, since that would probably result in the cache files
5988          * being out of sync with database reality.  In theory we could leave them
5989          * in place if the database had been cleanly shut down, but it seems
5990          * safest to just remove them always and let them be rebuilt during the
5991          * first backend startup.
5992          */
5993         RelationCacheInitFileRemove();
5994
5995         /*
5996          * Initialize on the assumption we want to recover to the latest timeline
5997          * that's active according to pg_control.
5998          */
5999         if (ControlFile->minRecoveryPointTLI >
6000                 ControlFile->checkPointCopy.ThisTimeLineID)
6001                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6002         else
6003                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6004
6005         /*
6006          * Check for recovery control file, and if so set up state for offline
6007          * recovery
6008          */
6009         readRecoveryCommandFile();
6010
6011         /*
6012          * Save archive_cleanup_command in shared memory so that other processes
6013          * can see it.
6014          */
6015         strlcpy(XLogCtl->archiveCleanupCommand,
6016                         archiveCleanupCommand ? archiveCleanupCommand : "",
6017                         sizeof(XLogCtl->archiveCleanupCommand));
6018
6019         if (ArchiveRecoveryRequested)
6020         {
6021                 if (StandbyModeRequested)
6022                         ereport(LOG,
6023                                         (errmsg("entering standby mode")));
6024                 else if (recoveryTarget == RECOVERY_TARGET_XID)
6025                         ereport(LOG,
6026                                         (errmsg("starting point-in-time recovery to XID %u",
6027                                                         recoveryTargetXid)));
6028                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6029                         ereport(LOG,
6030                                         (errmsg("starting point-in-time recovery to %s",
6031                                                         timestamptz_to_str(recoveryTargetTime))));
6032                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6033                         ereport(LOG,
6034                                         (errmsg("starting point-in-time recovery to \"%s\"",
6035                                                         recoveryTargetName)));
6036                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6037                         ereport(LOG,
6038                                         (errmsg("starting point-in-time recovery to earliest consistent point")));
6039                 else
6040                         ereport(LOG,
6041                                         (errmsg("starting archive recovery")));
6042         }
6043
6044         /*
6045          * Take ownership of the wakeup latch if we're going to sleep during
6046          * recovery.
6047          */
6048         if (StandbyModeRequested)
6049                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6050
6051         /* Set up XLOG reader facility */
6052         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6053         xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
6054         if (!xlogreader)
6055                 ereport(ERROR,
6056                                 (errcode(ERRCODE_OUT_OF_MEMORY),
6057                                  errmsg("out of memory"),
6058                         errdetail("Failed while allocating an XLog reading processor.")));
6059         xlogreader->system_identifier = ControlFile->system_identifier;
6060
6061         if (read_backup_label(&checkPointLoc, &backupEndRequired,
6062                                                   &backupFromStandby))
6063         {
6064                 /*
6065                  * Archive recovery was requested, and thanks to the backup label
6066                  * file, we know how far we need to replay to reach consistency. Enter
6067                  * archive recovery directly.
6068                  */
6069                 InArchiveRecovery = true;
6070                 if (StandbyModeRequested)
6071                         StandbyMode = true;
6072
6073                 /*
6074                  * When a backup_label file is present, we want to roll forward from
6075                  * the checkpoint it identifies, rather than using pg_control.
6076                  */
6077                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6078                 if (record != NULL)
6079                 {
6080                         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6081                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6082                         ereport(DEBUG1,
6083                                         (errmsg("checkpoint record is at %X/%X",
6084                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6085                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
6086
6087                         /*
6088                          * Make sure that REDO location exists. This may not be the case
6089                          * if there was a crash during an online backup, which left a
6090                          * backup_label around that references a WAL segment that's
6091                          * already been archived.
6092                          */
6093                         if (checkPoint.redo < checkPointLoc)
6094                         {
6095                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6096                                         ereport(FATAL,
6097                                                         (errmsg("could not find redo location referenced by checkpoint record"),
6098                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6099                         }
6100                 }
6101                 else
6102                 {
6103                         ereport(FATAL,
6104                                         (errmsg("could not locate required checkpoint record"),
6105                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6106                         wasShutdown = false;    /* keep compiler quiet */
6107                 }
6108                 /* set flag to delete it later */
6109                 haveBackupLabel = true;
6110         }
6111         else
6112         {
6113                 /*
6114                  * It's possible that archive recovery was requested, but we don't
6115                  * know how far we need to replay the WAL before we reach consistency.
6116                  * This can happen for example if a base backup is taken from a
6117                  * running server using an atomic filesystem snapshot, without calling
6118                  * pg_start/stop_backup. Or if you just kill a running master server
6119                  * and put it into archive recovery by creating a recovery.conf file.
6120                  *
6121                  * Our strategy in that case is to perform crash recovery first,
6122                  * replaying all the WAL present in pg_xlog, and only enter archive
6123                  * recovery after that.
6124                  *
6125                  * But usually we already know how far we need to replay the WAL (up
6126                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
6127                  * end-of-backup record), and we can enter archive recovery directly.
6128                  */
6129                 if (ArchiveRecoveryRequested &&
6130                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6131                          ControlFile->backupEndRequired ||
6132                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6133                          ControlFile->state == DB_SHUTDOWNED))
6134                 {
6135                         InArchiveRecovery = true;
6136                         if (StandbyModeRequested)
6137                                 StandbyMode = true;
6138                 }
6139
6140                 /*
6141                  * Get the last valid checkpoint record.  If the latest one according
6142                  * to pg_control is broken, try the next-to-last one.
6143                  */
6144                 checkPointLoc = ControlFile->checkPoint;
6145                 RedoStartLSN = ControlFile->checkPointCopy.redo;
6146                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6147                 if (record != NULL)
6148                 {
6149                         ereport(DEBUG1,
6150                                         (errmsg("checkpoint record is at %X/%X",
6151                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6152                 }
6153                 else if (StandbyMode)
6154                 {
6155                         /*
6156                          * The last valid checkpoint record required for a streaming
6157                          * recovery exists in neither standby nor the primary.
6158                          */
6159                         ereport(PANIC,
6160                                         (errmsg("could not locate a valid checkpoint record")));
6161                 }
6162                 else
6163                 {
6164                         checkPointLoc = ControlFile->prevCheckPoint;
6165                         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
6166                         if (record != NULL)
6167                         {
6168                                 ereport(LOG,
6169                                                 (errmsg("using previous checkpoint record at %X/%X",
6170                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6171                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
6172                         }
6173                         else
6174                                 ereport(PANIC,
6175                                          (errmsg("could not locate a valid checkpoint record")));
6176                 }
6177                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6178                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6179         }
6180
6181         /*
6182          * If the location of the checkpoint record is not on the expected
6183          * timeline in the history of the requested timeline, we cannot proceed:
6184          * the backup is not part of the history of the requested timeline.
6185          */
6186         Assert(expectedTLEs);           /* was initialized by reading checkpoint
6187                                                                  * record */
6188         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6189                 checkPoint.ThisTimeLineID)
6190         {
6191                 XLogRecPtr      switchpoint;
6192
6193                 /*
6194                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
6195                  * not in expectedTLEs at all.
6196                  */
6197                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6198                 ereport(FATAL,
6199                                 (errmsg("requested timeline %u is not a child of this server's history",
6200                                                 recoveryTargetTLI),
6201                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6202                                                    (uint32) (ControlFile->checkPoint >> 32),
6203                                                    (uint32) ControlFile->checkPoint,
6204                                                    ControlFile->checkPointCopy.ThisTimeLineID,
6205                                                    (uint32) (switchpoint >> 32),
6206                                                    (uint32) switchpoint)));
6207         }
6208
6209         /*
6210          * The min recovery point should be part of the requested timeline's
6211          * history, too.
6212          */
6213         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6214           tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6215                 ControlFile->minRecoveryPointTLI)
6216                 ereport(FATAL,
6217                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6218                                                 recoveryTargetTLI,
6219                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
6220                                                 (uint32) ControlFile->minRecoveryPoint,
6221                                                 ControlFile->minRecoveryPointTLI)));
6222
6223         LastRec = RecPtr = checkPointLoc;
6224
6225         ereport(DEBUG1,
6226                         (errmsg("redo record is at %X/%X; shutdown %s",
6227                                   (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6228                                         wasShutdown ? "TRUE" : "FALSE")));
6229         ereport(DEBUG1,
6230                         (errmsg("next transaction ID: %u/%u; next OID: %u",
6231                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
6232                                         checkPoint.nextOid)));
6233         ereport(DEBUG1,
6234                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
6235                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6236         ereport(DEBUG1,
6237                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
6238                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
6239         ereport(DEBUG1,
6240                         (errmsg("oldest MultiXactId: %u, in database %u",
6241                                         checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6242         if (!TransactionIdIsNormal(checkPoint.nextXid))
6243                 ereport(PANIC,
6244                                 (errmsg("invalid next transaction ID")));
6245
6246         /* initialize shared memory variables from the checkpoint record */
6247         ShmemVariableCache->nextXid = checkPoint.nextXid;
6248         ShmemVariableCache->nextOid = checkPoint.nextOid;
6249         ShmemVariableCache->oidCount = 0;
6250         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6251         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6252         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
6253         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6254         XLogCtl->ckptXid = checkPoint.nextXid;
6255
6256         /*
6257          * Initialize replication slots, before there's a chance to remove
6258          * required resources.
6259          */
6260         StartupReplicationSlots(checkPoint.redo);
6261
6262         /*
6263          * Start up logical state; it needs to be set up now so we have proper
6264          * data during crash recovery.
6265          */
6266         StartupReorderBuffer();
6267
6268         /*
6269          * Startup MultiXact.  We need to do this early for two reasons: one
6270          * is that we might try to access multixacts when we do tuple freezing,
6271          * and the other is we need its state initialized because we attempt
6272          * truncation during restartpoints.
6273          */
6274         StartupMultiXact();
6275
6276         /*
6277          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6278          * control file. On recovery, all unlogged relations are blown away, so
6279          * the unlogged LSN counter can be reset too.
6280          */
6281         if (ControlFile->state == DB_SHUTDOWNED)
6282                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6283         else
6284                 XLogCtl->unloggedLSN = 1;
6285
6286         /*
6287          * We must replay WAL entries using the same TimeLineID they were created
6288          * under, so temporarily adopt the TLI indicated by the checkpoint (see
6289          * also xlog_redo()).
6290          */
6291         ThisTimeLineID = checkPoint.ThisTimeLineID;
6292
6293         /*
6294          * Copy any missing timeline history files between 'now' and the recovery
6295          * target timeline from archive to pg_xlog. While we don't need those
6296          * files ourselves - the history file of the recovery target timeline
6297          * covers all the previous timelines in the history too - a cascading
6298          * standby server might be interested in them. Or, if you archive the WAL
6299          * from this server to a different archive than the master, it'd be good
6300          * for all the history files to get archived there after failover, so that
6301          * you can use one of the old timelines as a PITR target. Timeline history
6302          * files are small, so it's better to copy them unnecessarily than not
6303          * copy them and regret later.
6304          */
6305         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6306
6307         lastFullPageWrites = checkPoint.fullPageWrites;
6308
6309         RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6310
6311         if (RecPtr < checkPoint.redo)
6312                 ereport(PANIC,
6313                                 (errmsg("invalid redo in checkpoint record")));
6314
6315         /*
6316          * Check whether we need to force recovery from WAL.  If it appears to
6317          * have been a clean shutdown and we did not have a recovery.conf file,
6318          * then assume no recovery needed.
6319          */
6320         if (checkPoint.redo < RecPtr)
6321         {
6322                 if (wasShutdown)
6323                         ereport(PANIC,
6324                                         (errmsg("invalid redo record in shutdown checkpoint")));
6325                 InRecovery = true;
6326         }
6327         else if (ControlFile->state != DB_SHUTDOWNED)
6328                 InRecovery = true;
6329         else if (ArchiveRecoveryRequested)
6330         {
6331                 /* force recovery due to presence of recovery.conf */
6332                 InRecovery = true;
6333         }
6334
6335         /* REDO */
6336         if (InRecovery)
6337         {
6338                 int                     rmid;
6339
6340                 /* use volatile pointer to prevent code rearrangement */
6341                 volatile XLogCtlData *xlogctl = XLogCtl;
6342
6343                 /*
6344                  * Update pg_control to show that we are recovering and to show the
6345                  * selected checkpoint as the place we are starting from. We also mark
6346                  * pg_control with any minimum recovery stop point obtained from a
6347                  * backup history file.
6348                  */
6349                 dbstate_at_startup = ControlFile->state;
6350                 if (InArchiveRecovery)
6351                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6352                 else
6353                 {
6354                         ereport(LOG,
6355                                         (errmsg("database system was not properly shut down; "
6356                                                         "automatic recovery in progress")));
6357                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6358                                 ereport(LOG,
6359                                                 (errmsg("crash recovery starts in timeline %u "
6360                                                                 "and has target timeline %u",
6361                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
6362                                                                 recoveryTargetTLI)));
6363                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6364                 }
6365                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6366                 ControlFile->checkPoint = checkPointLoc;
6367                 ControlFile->checkPointCopy = checkPoint;
6368                 if (InArchiveRecovery)
6369                 {
6370                         /* initialize minRecoveryPoint if not set yet */
6371                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
6372                         {
6373                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6374                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6375                         }
6376                 }
6377
6378                 /*
6379                  * Set backupStartPoint if we're starting recovery from a base backup.
6380                  *
6381                  * Set backupEndPoint and use minRecoveryPoint as the backup end
6382                  * location if we're starting recovery from a base backup which was
6383                  * taken from the standby. In this case, the database system status in
6384                  * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If it does not,
6385                  * the backup is corrupted, so we cancel recovery.
6386                  */
6387                 if (haveBackupLabel)
6388                 {
6389                         ControlFile->backupStartPoint = checkPoint.redo;
6390                         ControlFile->backupEndRequired = backupEndRequired;
6391
6392                         if (backupFromStandby)
6393                         {
6394                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
6395                                         ereport(FATAL,
6396                                                         (errmsg("backup_label contains data inconsistent with control file"),
6397                                                          errhint("This means that the backup is corrupted and you will "
6398                                                            "have to use another backup for recovery.")));
6399                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6400                         }
6401                 }
6402                 ControlFile->time = (pg_time_t) time(NULL);
6403                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6404                 UpdateControlFile();
6405
6406                 /* initialize our local copy of minRecoveryPoint */
6407                 minRecoveryPoint = ControlFile->minRecoveryPoint;
6408                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6409
6410                 /*
6411                  * Reset pgstat data, because it may be invalid after recovery.
6412                  */
6413                 pgstat_reset_all();
6414
6415                 /*
6416                  * If there was a backup label file, it's done its job and the info
6417                  * has now been propagated into pg_control.  We must get rid of the
6418                  * label file so that if we crash during recovery, we'll pick up at
6419                  * the latest recovery restartpoint instead of going all the way back
6420                  * to the backup start point.  It seems prudent though to just rename
6421                  * the file out of the way rather than delete it completely.
6422                  */
6423                 if (haveBackupLabel)
6424                 {
6425                         unlink(BACKUP_LABEL_OLD);
6426                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
6427                                 ereport(FATAL,
6428                                                 (errcode_for_file_access(),
6429                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
6430                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
6431                 }
6432
6433                 /* Check that the GUCs used to generate the WAL allow recovery */
6434                 CheckRequiredParameterValues();
6435
6436                 /*
6437                  * We're in recovery, so unlogged relations may be trashed and must be
6438                  * reset.  This should be done BEFORE allowing Hot Standby
6439                  * connections, so that read-only backends don't try to read whatever
6440                  * garbage is left over from before.
6441                  */
6442                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6443
6444                 /*
6445                  * Likewise, delete any saved transaction snapshot files that got left
6446                  * behind by crashed backends.
6447                  */
6448                 DeleteAllExportedSnapshotFiles();
6449
6450                 /*
6451                  * Initialize for Hot Standby, if enabled. We won't let backends in
6452                  * yet, not until we've reached the min recovery point specified in
6453                  * control file and we've established a recovery snapshot from a
6454                  * running-xacts WAL record.
6455                  */
6456                 if (ArchiveRecoveryRequested && EnableHotStandby)
6457                 {
6458                         TransactionId *xids;
6459                         int                     nxids;
6460
6461                         ereport(DEBUG1,
6462                                         (errmsg("initializing for hot standby")));
6463
6464                         InitRecoveryTransactionEnvironment();
6465
6466                         if (wasShutdown)
6467                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6468                         else
6469                                 oldestActiveXID = checkPoint.oldestActiveXid;
6470                         Assert(TransactionIdIsValid(oldestActiveXID));
6471
6472                         /* Tell procarray about the range of xids it has to deal with */
6473                         ProcArrayInitRecovery(ShmemVariableCache->nextXid);
6474
6475                         /*
6476                          * Startup commit log and subtrans only. MultiXact has already
6477                          * been started up and other SLRUs are not maintained during
6478                          * recovery and need not be started yet.
6479                          */
6480                         StartupCLOG();
6481                         StartupSUBTRANS(oldestActiveXID);
6482
6483                         /*
6484                          * If we're beginning at a shutdown checkpoint, we know that
6485                          * nothing was running on the master at this point. So fake-up an
6486                          * empty running-xacts record and use that here and now. Recover
6487                          * additional standby state for prepared transactions.
6488                          */
6489                         if (wasShutdown)
6490                         {
6491                                 RunningTransactionsData running;
6492                                 TransactionId latestCompletedXid;
6493
6494                                 /*
6495                                  * Construct a RunningTransactions snapshot representing a
6496                                  * shut down server, with only prepared transactions still
6497                                  * alive. We're never overflowed at this point because all
6498                                  * subxids are listed with their parent prepared transactions.
6499                                  */
6500                                 running.xcnt = nxids;
6501                                 running.subxcnt = 0;
6502                                 running.subxid_overflow = false;
6503                                 running.nextXid = checkPoint.nextXid;
6504                                 running.oldestRunningXid = oldestActiveXID;
6505                                 latestCompletedXid = checkPoint.nextXid;
6506                                 TransactionIdRetreat(latestCompletedXid);
6507                                 Assert(TransactionIdIsNormal(latestCompletedXid));
6508                                 running.latestCompletedXid = latestCompletedXid;
6509                                 running.xids = xids;
6510
6511                                 ProcArrayApplyRecoveryInfo(&running);
6512
6513                                 StandbyRecoverPreparedTransactions(false);
6514                         }
6515                 }
6516
6517                 /* Initialize resource managers */
6518                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6519                 {
6520                         if (RmgrTable[rmid].rm_startup != NULL)
6521                                 RmgrTable[rmid].rm_startup();
6522                 }
6523
6524                 /*
6525                  * Initialize shared variables for tracking progress of WAL replay,
6526                  * as if we had just replayed the record before the REDO location
6527                  * (or the checkpoint record itself, if it's a shutdown checkpoint).
6528                  */
6529                 SpinLockAcquire(&xlogctl->info_lck);
6530                 if (checkPoint.redo < RecPtr)
6531                         xlogctl->replayEndRecPtr = checkPoint.redo;
6532                 else
6533                         xlogctl->replayEndRecPtr = EndRecPtr;
6534                 xlogctl->replayEndTLI = ThisTimeLineID;
6535                 xlogctl->lastReplayedEndRecPtr = xlogctl->replayEndRecPtr;
6536                 xlogctl->lastReplayedTLI = xlogctl->replayEndTLI;
6537                 xlogctl->recoveryLastXTime = 0;
6538                 xlogctl->currentChunkStartTime = 0;
6539                 xlogctl->recoveryPause = false;
6540                 SpinLockRelease(&xlogctl->info_lck);
6541
6542                 /* Also ensure XLogReceiptTime has a sane value */
6543                 XLogReceiptTime = GetCurrentTimestamp();
6544
6545                 /*
6546                  * Let postmaster know we've started redo now, so that it can launch
6547                  * checkpointer to perform restartpoints.  We don't bother during
6548                  * crash recovery as restartpoints can only be performed during
6549                  * archive recovery.  And we'd like to keep crash recovery simple, to
6550                  * avoid introducing bugs that could affect you when recovering after
6551                  * crash.
6552                  *
6553                  * After this point, we can no longer assume that we're the only
6554                  * process in addition to postmaster!  Also, fsync requests are
6555                  * subsequently to be handled by the checkpointer, not locally.
6556                  */
6557                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
6558                 {
6559                         PublishStartupProcessInformation();
6560                         SetForwardFsyncRequests();
6561                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6562                         bgwriterLaunched = true;
6563                 }
6564
6565                 /*
6566                  * Allow read-only connections immediately if we're consistent
6567                  * already.
6568                  */
6569                 CheckRecoveryConsistency();
6570
6571                 /*
6572                  * Find the first record that logically follows the checkpoint --- it
6573                  * might physically precede it, though.
6574                  */
6575                 if (checkPoint.redo < RecPtr)
6576                 {
6577                         /* back up to find the record */
6578                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
6579                 }
6580                 else
6581                 {
6582                         /* just have to read next record after CheckPoint */
6583                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6584                 }
6585
6586                 if (record != NULL)
6587                 {
6588                         ErrorContextCallback errcallback;
6589                         TimestampTz xtime;
6590
6591                         InRedo = true;
6592
6593                         ereport(LOG,
6594                                         (errmsg("redo starts at %X/%X",
6595                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6596
6597                         /*
6598                          * main redo apply loop
6599                          */
6600                         do
6601                         {
6602                                 bool            switchedTLI = false;
6603
6604 #ifdef WAL_DEBUG
6605                                 if (XLOG_DEBUG ||
6606                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6607                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6608                                 {
6609                                         StringInfoData buf;
6610
6611                                         initStringInfo(&buf);
6612                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6613                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
6614                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
6615                                         xlog_outrec(&buf, record);
6616                                         appendStringInfoString(&buf, " - ");
6617                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
6618                                                                                                            record->xl_info,
6619                                                                                                          XLogRecGetData(record));
6620                                         elog(LOG, "%s", buf.data);
6621                                         pfree(buf.data);
6622                                 }
6623 #endif
6624
6625                                 /* Handle interrupt signals of startup process */
6626                                 HandleStartupProcInterrupts();
6627
6628                                 /*
6629                                  * Pause WAL replay, if requested by a hot-standby session via
6630                                  * SetRecoveryPause().
6631                                  *
6632                                  * Note that we intentionally don't take the info_lck spinlock
6633                                  * here.  We might therefore read a slightly stale value of
6634                                  * the recoveryPause flag, but it can't be very stale (no
6635                                  * worse than the last spinlock we did acquire).  Since a
6636                                  * pause request is a pretty asynchronous thing anyway,
6637                                  * possibly responding to it one WAL record later than we
6638                                  * otherwise would is a minor issue, so it doesn't seem worth
6639                                  * adding another spinlock cycle to prevent that.
6640                                  */
6641                                 if (xlogctl->recoveryPause)
6642                                         recoveryPausesHere();
6643
6644                                 /*
6645                                  * Have we reached our recovery target?
6646                                  */
6647                                 if (recoveryStopsBefore(record))
6648                                 {
6649                                         reachedStopPoint = true;        /* see below */
6650                                         break;
6651                                 }
6652
6653                                 /*
6654                                  * If we've been asked to lag the master, wait on
6655                                  * latch until enough time has passed.
6656                                  */
6657                                 if (recoveryApplyDelay(record))
6658                                 {
6659                                         /*
6660                                          * We test for paused recovery again here. If
6661                                          * user sets delayed apply, it may be because
6662                                          * they expect to pause recovery in case of
6663                                          * problems, so we must test again here otherwise
6664                                          * pausing during the delay-wait wouldn't work.
6665                                          */
6666                                         if (xlogctl->recoveryPause)
6667                                                 recoveryPausesHere();
6668                                 }
6669
6670                                 /* Setup error traceback support for ereport() */
6671                                 errcallback.callback = rm_redo_error_callback;
6672                                 errcallback.arg = (void *) record;
6673                                 errcallback.previous = error_context_stack;
6674                                 error_context_stack = &errcallback;
6675
6676                                 /*
6677                                  * ShmemVariableCache->nextXid must be beyond record's xid.
6678                                  *
6679                                  * We don't expect anyone else to modify nextXid, hence we
6680                                  * don't need to hold a lock while examining it.  We still
6681                                  * acquire the lock to modify it, though.
6682                                  */
6683                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
6684                                                                                                  ShmemVariableCache->nextXid))
6685                                 {
6686                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
6687                                         ShmemVariableCache->nextXid = record->xl_xid;
6688                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
6689                                         LWLockRelease(XidGenLock);
6690                                 }
6691
6692                                 /*
6693                                  * Before replaying this record, check if this record causes
6694                                  * the current timeline to change. The record is already
6695                                  * considered to be part of the new timeline, so we update
6696                                  * ThisTimeLineID before replaying it. That's important so
6697                                  * that replayEndTLI, which is recorded as the minimum
6698                                  * recovery point's TLI if recovery stops after this record,
6699                                  * is set correctly.
6700                                  */
6701                                 if (record->xl_rmid == RM_XLOG_ID)
6702                                 {
6703                                         TimeLineID      newTLI = ThisTimeLineID;
6704                                         TimeLineID      prevTLI = ThisTimeLineID;
6705                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
6706
6707                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
6708                                         {
6709                                                 CheckPoint      checkPoint;
6710
6711                                                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6712                                                 newTLI = checkPoint.ThisTimeLineID;
6713                                                 prevTLI = checkPoint.PrevTimeLineID;
6714                                         }
6715                                         else if (info == XLOG_END_OF_RECOVERY)
6716                                         {
6717                                                 xl_end_of_recovery xlrec;
6718
6719                                                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
6720                                                 newTLI = xlrec.ThisTimeLineID;
6721                                                 prevTLI = xlrec.PrevTimeLineID;
6722                                         }
6723
6724                                         if (newTLI != ThisTimeLineID)
6725                                         {
6726                                                 /* Check that it's OK to switch to this TLI */
6727                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
6728
6729                                                 /* Following WAL records should be run with new TLI */
6730                                                 ThisTimeLineID = newTLI;
6731                                                 switchedTLI = true;
6732                                         }
6733                                 }
6734
6735                                 /*
6736                                  * Update shared replayEndRecPtr before replaying this record,
6737                                  * so that XLogFlush will update minRecoveryPoint correctly.
6738                                  */
6739                                 SpinLockAcquire(&xlogctl->info_lck);
6740                                 xlogctl->replayEndRecPtr = EndRecPtr;
6741                                 xlogctl->replayEndTLI = ThisTimeLineID;
6742                                 SpinLockRelease(&xlogctl->info_lck);
6743
6744                                 /*
6745                                  * If we are attempting to enter Hot Standby mode, process
6746                                  * XIDs we see
6747                                  */
6748                                 if (standbyState >= STANDBY_INITIALIZED &&
6749                                         TransactionIdIsValid(record->xl_xid))
6750                                         RecordKnownAssignedTransactionIds(record->xl_xid);
6751
6752                                 /* Now apply the WAL record itself */
6753                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
6754
6755                                 /* Pop the error context stack */
6756                                 error_context_stack = errcallback.previous;
6757
6758                                 /*
6759                                  * Update lastReplayedEndRecPtr after this record has been
6760                                  * successfully replayed.
6761                                  */
6762                                 SpinLockAcquire(&xlogctl->info_lck);
6763                                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
6764                                 xlogctl->lastReplayedTLI = ThisTimeLineID;
6765                                 SpinLockRelease(&xlogctl->info_lck);
6766
6767                                 /* Remember this record as the last-applied one */
6768                                 LastRec = ReadRecPtr;
6769
6770                                 /* Allow read-only connections if we're consistent now */
6771                                 CheckRecoveryConsistency();
6772
6773                                 /*
6774                                  * If this record was a timeline switch, wake up any
6775                                  * walsenders to notice that we are on a new timeline.
6776                                  */
6777                                 if (switchedTLI && AllowCascadeReplication())
6778                                         WalSndWakeup();
6779
6780                                 /* Exit loop if we reached inclusive recovery target */
6781                                 if (recoveryStopsAfter(record))
6782                                 {
6783                                         reachedStopPoint = true;
6784                                         break;
6785                                 }
6786
6787                                 /* Else, try to fetch the next WAL record */
6788                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6789                         } while (record != NULL);
6790
6791                         /*
6792                          * end of main redo apply loop
6793                          */
6794
6795                         if (recoveryPauseAtTarget && reachedStopPoint)
6796                         {
6797                                 SetRecoveryPause(true);
6798                                 recoveryPausesHere();
6799                         }
6800
6801                         /* Allow resource managers to do any required cleanup. */
6802                         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6803                         {
6804                                 if (RmgrTable[rmid].rm_cleanup != NULL)
6805                                         RmgrTable[rmid].rm_cleanup();
6806                         }
6807
6808                         ereport(LOG,
6809                                         (errmsg("redo done at %X/%X",
6810                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6811                         xtime = GetLatestXTime();
6812                         if (xtime)
6813                                 ereport(LOG,
6814                                          (errmsg("last completed transaction was at log time %s",
6815                                                          timestamptz_to_str(xtime))));
6816                         InRedo = false;
6817                 }
6818                 else
6819                 {
6820                         /* there are no WAL records following the checkpoint */
6821                         ereport(LOG,
6822                                         (errmsg("redo is not required")));
6823                 }
6824         }
6825
6826         /*
6827          * Kill WAL receiver, if it's still running, before we continue to write
6828          * the startup checkpoint record. It will trump over the checkpoint and
6829          * subsequent records if it's still alive when we start writing WAL.
6830          */
6831         ShutdownWalRcv();
6832
6833         /*
6834          * We don't need the latch anymore. It's not strictly necessary to disown
6835          * it, but let's do it for the sake of tidiness.
6836          */
6837         if (StandbyModeRequested)
6838                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
6839
6840         /*
6841          * We are now done reading the xlog from stream. Turn off streaming
6842          * recovery to force fetching the files (which would be required at end of
6843          * recovery, e.g., timeline history file) from archive or pg_xlog.
6844          */
6845         StandbyMode = false;
6846
6847         /*
6848          * Re-fetch the last valid or last applied record, so we can identify the
6849          * exact endpoint of what we consider the valid portion of WAL.
6850          */
6851         record = ReadRecord(xlogreader, LastRec, PANIC, false);
6852         EndOfLog = EndRecPtr;
6853         XLByteToPrevSeg(EndOfLog, endLogSegNo);
6854
6855         /*
6856          * Complain if we did not roll forward far enough to render the backup
6857          * dump consistent.  Note: it is indeed okay to look at the local variable
6858          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
6859          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
6860          * advanced beyond the WAL we processed.
6861          */
6862         if (InRecovery &&
6863                 (EndOfLog < minRecoveryPoint ||
6864                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
6865         {
6866                 if (reachedStopPoint)
6867                 {
6868                         /* stopped because of stop request */
6869                         ereport(FATAL,
6870                                         (errmsg("requested recovery stop point is before consistent recovery point")));
6871                 }
6872
6873                 /*
6874                  * Ran off end of WAL before reaching end-of-backup WAL record, or
6875                  * minRecoveryPoint. That's usually a bad sign, indicating that you
6876                  * tried to recover from an online backup but never called
6877                  * pg_stop_backup(), or you didn't archive all the WAL up to that
6878                  * point. However, this also happens in crash recovery, if the system
6879                  * crashes while an online backup is in progress. We must not treat
6880                  * that as an error, or the database will refuse to start up.
6881                  */
6882                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
6883                 {
6884                         if (ControlFile->backupEndRequired)
6885                                 ereport(FATAL,
6886                                                 (errmsg("WAL ends before end of online backup"),
6887                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
6888                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
6889                                 ereport(FATAL,
6890                                                 (errmsg("WAL ends before end of online backup"),
6891                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
6892                         else
6893                                 ereport(FATAL,
6894                                           (errmsg("WAL ends before consistent recovery point")));
6895                 }
6896         }
6897
6898         /*
6899          * Consider whether we need to assign a new timeline ID.
6900          *
6901          * If we are doing an archive recovery, we always assign a new ID.      This
6902          * handles a couple of issues.  If we stopped short of the end of WAL
6903          * during recovery, then we are clearly generating a new timeline and must
6904          * assign it a unique new ID.  Even if we ran to the end, modifying the
6905          * current last segment is problematic because it may result in trying to
6906          * overwrite an already-archived copy of that segment, and we encourage
6907          * DBAs to make their archive_commands reject that.  We can dodge the
6908          * problem by making the new active segment have a new timeline ID.
6909          *
6910          * In a normal crash recovery, we can just extend the timeline we were in.
6911          */
6912         PrevTimeLineID = ThisTimeLineID;
6913         if (ArchiveRecoveryRequested)
6914         {
6915                 char            reason[200];
6916
6917                 Assert(InArchiveRecovery);
6918
6919                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
6920                 ereport(LOG,
6921                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
6922
6923                 /*
6924                  * Create a comment for the history file to explain why and where
6925                  * timeline changed.
6926                  */
6927                 if (recoveryTarget == RECOVERY_TARGET_XID)
6928                         snprintf(reason, sizeof(reason),
6929                                          "%s transaction %u",
6930                                          recoveryStopAfter ? "after" : "before",
6931                                          recoveryStopXid);
6932                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6933                         snprintf(reason, sizeof(reason),
6934                                          "%s %s\n",
6935                                          recoveryStopAfter ? "after" : "before",
6936                                          timestamptz_to_str(recoveryStopTime));
6937                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6938                         snprintf(reason, sizeof(reason),
6939                                          "at restore point \"%s\"",
6940                                          recoveryStopName);
6941                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6942                         snprintf(reason, sizeof(reason), "reached consistency");
6943                 else
6944                         snprintf(reason, sizeof(reason), "no recovery target specified");
6945
6946                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
6947                                                          EndRecPtr, reason);
6948         }
6949
6950         /* Save the selected TimeLineID in shared memory, too */
6951         XLogCtl->ThisTimeLineID = ThisTimeLineID;
6952         XLogCtl->PrevTimeLineID = PrevTimeLineID;
6953
6954         /*
6955          * We are now done reading the old WAL.  Turn off archive fetching if it
6956          * was active, and make a writable copy of the last WAL segment. (Note
6957          * that we also have a copy of the last block of the old WAL in readBuf;
6958          * we will use that below.)
6959          */
6960         if (ArchiveRecoveryRequested)
6961                 exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
6962
6963         /*
6964          * Prepare to write WAL starting at EndOfLog position, and init xlog
6965          * buffer cache using the block containing the last record from the
6966          * previous incarnation.
6967          */
6968         openLogSegNo = endLogSegNo;
6969         openLogFile = XLogFileOpen(openLogSegNo);
6970         openLogOff = 0;
6971         Insert = &XLogCtl->Insert;
6972         Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
6973         Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
6974
6975         /*
6976          * Tricky point here: readBuf contains the *last* block that the LastRec
6977          * record spans, not the one it starts in.      The last block is indeed the
6978          * one we want to use.
6979          */
6980         if (EndOfLog % XLOG_BLCKSZ != 0)
6981         {
6982                 char       *page;
6983                 int                     len;
6984                 int                     firstIdx;
6985                 XLogRecPtr      pageBeginPtr;
6986
6987                 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
6988                 Assert(readOff == pageBeginPtr % XLogSegSize);
6989
6990                 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
6991
6992                 /* Copy the valid part of the last block, and zero the rest */
6993                 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
6994                 len = EndOfLog % XLOG_BLCKSZ;
6995                 memcpy(page, xlogreader->readBuf, len);
6996                 memset(page + len, 0, XLOG_BLCKSZ - len);
6997
6998                 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
6999                 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7000         }
7001         else
7002         {
7003                 /*
7004                  * There is no partial block to copy. Just set InitializedUpTo,
7005                  * and let the first attempt to insert a log record to initialize
7006                  * the next buffer.
7007                  */
7008                 XLogCtl->InitializedUpTo = EndOfLog;
7009         }
7010
7011         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7012
7013         XLogCtl->LogwrtResult = LogwrtResult;
7014
7015         XLogCtl->LogwrtRqst.Write = EndOfLog;
7016         XLogCtl->LogwrtRqst.Flush = EndOfLog;
7017
7018         /* Pre-scan prepared transactions to find out the range of XIDs present */
7019         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7020
7021         /*
7022          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7023          * record before resource manager writes cleanup WAL records or checkpoint
7024          * record is written.
7025          */
7026         Insert->fullPageWrites = lastFullPageWrites;
7027         LocalSetXLogInsertAllowed();
7028         UpdateFullPageWrites();
7029         LocalXLogInsertAllowed = -1;
7030
7031         if (InRecovery)
7032         {
7033                 /*
7034                  * Perform a checkpoint to update all our recovery activity to disk.
7035                  *
7036                  * Note that we write a shutdown checkpoint rather than an on-line
7037                  * one. This is not particularly critical, but since we may be
7038                  * assigning a new TLI, using a shutdown checkpoint allows us to have
7039                  * the rule that TLI only changes in shutdown checkpoints, which
7040                  * allows some extra error checking in xlog_redo.
7041                  *
7042                  * In fast promotion, only create a lightweight end-of-recovery record
7043                  * instead of a full checkpoint. A checkpoint is requested later,
7044                  * after we're fully out of recovery mode and already accepting
7045                  * queries.
7046                  */
7047                 if (bgwriterLaunched)
7048                 {
7049                         if (fast_promote)
7050                         {
7051                                 checkPointLoc = ControlFile->prevCheckPoint;
7052
7053                                 /*
7054                                  * Confirm the last checkpoint is available for us to recover
7055                                  * from if we fail. Note that we don't check for the secondary
7056                                  * checkpoint since that isn't available in most base backups.
7057                                  */
7058                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7059                                 if (record != NULL)
7060                                 {
7061                                         fast_promoted = true;
7062
7063                                         /*
7064                                          * Insert a special WAL record to mark the end of
7065                                          * recovery, since we aren't doing a checkpoint. That
7066                                          * means that the checkpointer process may likely be in
7067                                          * the middle of a time-smoothed restartpoint and could
7068                                          * continue to be for minutes after this. That sounds
7069                                          * strange, but the effect is roughly the same and it
7070                                          * would be stranger to try to come out of the
7071                                          * restartpoint and then checkpoint. We request a
7072                                          * checkpoint later anyway, just for safety.
7073                                          */
7074                                         CreateEndOfRecoveryRecord();
7075                                 }
7076                         }
7077
7078                         if (!fast_promoted)
7079                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7080                                                                   CHECKPOINT_IMMEDIATE |
7081                                                                   CHECKPOINT_WAIT);
7082                 }
7083                 else
7084                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7085
7086                 /*
7087                  * And finally, execute the recovery_end_command, if any.
7088                  */
7089                 if (recoveryEndCommand)
7090                         ExecuteRecoveryCommand(recoveryEndCommand,
7091                                                                    "recovery_end_command",
7092                                                                    true);
7093         }
7094
7095         /*
7096          * Preallocate additional log files, if wanted.
7097          */
7098         PreallocXlogFiles(EndOfLog);
7099
7100         /*
7101          * Reset initial contents of unlogged relations.  This has to be done
7102          * AFTER recovery is complete so that any unlogged relations created
7103          * during recovery also get picked up.
7104          */
7105         if (InRecovery)
7106                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7107
7108         /*
7109          * Okay, we're officially UP.
7110          */
7111         InRecovery = false;
7112
7113         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7114         ControlFile->state = DB_IN_PRODUCTION;
7115         ControlFile->time = (pg_time_t) time(NULL);
7116         UpdateControlFile();
7117         LWLockRelease(ControlFileLock);
7118
7119         /* start the archive_timeout timer running */
7120         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7121
7122         /* also initialize latestCompletedXid, to nextXid - 1 */
7123         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7124         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7125         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7126         LWLockRelease(ProcArrayLock);
7127
7128         /*
7129          * Start up the commit log and subtrans, if not already done for hot
7130          * standby.
7131          */
7132         if (standbyState == STANDBY_DISABLED)
7133         {
7134                 StartupCLOG();
7135                 StartupSUBTRANS(oldestActiveXID);
7136         }
7137
7138         /*
7139          * Perform end of recovery actions for any SLRUs that need it.
7140          */
7141         TrimCLOG();
7142         TrimMultiXact();
7143
7144         /* Reload shared-memory state for prepared transactions */
7145         RecoverPreparedTransactions();
7146
7147         /*
7148          * Shutdown the recovery environment. This must occur after
7149          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
7150          */
7151         if (standbyState != STANDBY_DISABLED)
7152                 ShutdownRecoveryTransactionEnvironment();
7153
7154         /* Shut down xlogreader */
7155         if (readFile >= 0)
7156         {
7157                 close(readFile);
7158                 readFile = -1;
7159         }
7160         XLogReaderFree(xlogreader);
7161
7162         /*
7163          * If any of the critical GUCs have changed, log them before we allow
7164          * backends to write WAL.
7165          */
7166         LocalSetXLogInsertAllowed();
7167         XLogReportParameters();
7168
7169         /*
7170          * All done.  Allow backends to write WAL.      (Although the bool flag is
7171          * probably atomic in itself, we use the info_lck here to ensure that
7172          * there are no race conditions concerning visibility of other recent
7173          * updates to shared memory.)
7174          */
7175         {
7176                 /* use volatile pointer to prevent code rearrangement */
7177                 volatile XLogCtlData *xlogctl = XLogCtl;
7178
7179                 SpinLockAcquire(&xlogctl->info_lck);
7180                 xlogctl->SharedRecoveryInProgress = false;
7181                 SpinLockRelease(&xlogctl->info_lck);
7182         }
7183
7184         /*
7185          * If there were cascading standby servers connected to us, nudge any wal
7186          * sender processes to notice that we've been promoted.
7187          */
7188         WalSndWakeup();
7189
7190         /*
7191          * If this was a fast promotion, request an (online) checkpoint now. This
7192          * isn't required for consistency, but the last restartpoint might be far
7193          * back, and in case of a crash, recovering from it might take a longer
7194          * than is appropriate now that we're not in standby mode anymore.
7195          */
7196         if (fast_promoted)
7197                 RequestCheckpoint(CHECKPOINT_FORCE);
7198 }
7199
7200 /*
7201  * Checks if recovery has reached a consistent state. When consistency is
7202  * reached and we have a valid starting standby snapshot, tell postmaster
7203  * that it can start accepting read-only connections.
7204  */
7205 static void
7206 CheckRecoveryConsistency(void)
7207 {
7208         XLogRecPtr lastReplayedEndRecPtr;
7209
7210         /*
7211          * During crash recovery, we don't reach a consistent state until we've
7212          * replayed all the WAL.
7213          */
7214         if (XLogRecPtrIsInvalid(minRecoveryPoint))
7215                 return;
7216
7217         /*
7218          * assume that we are called in the startup process, and hence don't need
7219          * a lock to read lastReplayedEndRecPtr
7220          */
7221         lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
7222
7223         /*
7224          * Have we reached the point where our base backup was completed?
7225          */
7226         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7227                 ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
7228         {
7229                 /*
7230                  * We have reached the end of base backup, as indicated by pg_control.
7231                  * The data on disk is now consistent. Reset backupStartPoint and
7232                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
7233                  * allow starting up at an earlier point even if recovery is stopped
7234                  * and restarted soon after this.
7235                  */
7236                 elog(DEBUG1, "end of backup reached");
7237
7238                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7239
7240                 if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
7241                         ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
7242
7243                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
7244                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
7245                 ControlFile->backupEndRequired = false;
7246                 UpdateControlFile();
7247
7248                 LWLockRelease(ControlFileLock);
7249         }
7250
7251         /*
7252          * Have we passed our safe starting point? Note that minRecoveryPoint is
7253          * known to be incorrectly set if ControlFile->backupEndRequired, until
7254          * the XLOG_BACKUP_RECORD arrives to advise us of the correct
7255          * minRecoveryPoint. All we know prior to that is that we're not
7256          * consistent yet.
7257          */
7258         if (!reachedConsistency && !ControlFile->backupEndRequired &&
7259                 minRecoveryPoint <= lastReplayedEndRecPtr &&
7260                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7261         {
7262                 /*
7263                  * Check to see if the XLOG sequence contained any unresolved
7264                  * references to uninitialized pages.
7265                  */
7266                 XLogCheckInvalidPages();
7267
7268                 reachedConsistency = true;
7269                 ereport(LOG,
7270                                 (errmsg("consistent recovery state reached at %X/%X",
7271                                                 (uint32) (lastReplayedEndRecPtr >> 32),
7272                                                 (uint32) lastReplayedEndRecPtr)));
7273         }
7274
7275         /*
7276          * Have we got a valid starting snapshot that will allow queries to be
7277          * run? If so, we can tell postmaster that the database is consistent now,
7278          * enabling connections.
7279          */
7280         if (standbyState == STANDBY_SNAPSHOT_READY &&
7281                 !LocalHotStandbyActive &&
7282                 reachedConsistency &&
7283                 IsUnderPostmaster)
7284         {
7285                 /* use volatile pointer to prevent code rearrangement */
7286                 volatile XLogCtlData *xlogctl = XLogCtl;
7287
7288                 SpinLockAcquire(&xlogctl->info_lck);
7289                 xlogctl->SharedHotStandbyActive = true;
7290                 SpinLockRelease(&xlogctl->info_lck);
7291
7292                 LocalHotStandbyActive = true;
7293
7294                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7295         }
7296 }
7297
7298 /*
7299  * Is the system still in recovery?
7300  *
7301  * Unlike testing InRecovery, this works in any process that's connected to
7302  * shared memory.
7303  *
7304  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7305  * variables the first time we see that recovery is finished.
7306  */
7307 bool
7308 RecoveryInProgress(void)
7309 {
7310         /*
7311          * We check shared state each time only until we leave recovery mode. We
7312          * can't re-enter recovery, so there's no need to keep checking after the
7313          * shared variable has once been seen false.
7314          */
7315         if (!LocalRecoveryInProgress)
7316                 return false;
7317         else
7318         {
7319                 /*
7320                  * use volatile pointer to make sure we make a fresh read of the
7321                  * shared variable.
7322                  */
7323                 volatile XLogCtlData *xlogctl = XLogCtl;
7324
7325                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7326
7327                 /*
7328                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
7329                  * is finished. InitPostgres() relies upon this behaviour to ensure
7330                  * that InitXLOGAccess() is called at backend startup.  (If you change
7331                  * this, see also LocalSetXLogInsertAllowed.)
7332                  */
7333                 if (!LocalRecoveryInProgress)
7334                 {
7335                         /*
7336                          * If we just exited recovery, make sure we read TimeLineID and
7337                          * RedoRecPtr after SharedRecoveryInProgress (for machines with
7338                          * weak memory ordering).
7339                          */
7340                         pg_memory_barrier();
7341                         InitXLOGAccess();
7342                 }
7343                 /*
7344                  * Note: We don't need a memory barrier when we're still in recovery.
7345                  * We might exit recovery immediately after return, so the caller
7346                  * can't rely on 'true' meaning that we're still in recovery anyway.
7347                  */
7348
7349                 return LocalRecoveryInProgress;
7350         }
7351 }
7352
7353 /*
7354  * Is HotStandby active yet? This is only important in special backends
7355  * since normal backends won't ever be able to connect until this returns
7356  * true. Postmaster knows this by way of signal, not via shared memory.
7357  *
7358  * Unlike testing standbyState, this works in any process that's connected to
7359  * shared memory.  (And note that standbyState alone doesn't tell the truth
7360  * anyway.)
7361  */
7362 bool
7363 HotStandbyActive(void)
7364 {
7365         /*
7366          * We check shared state each time only until Hot Standby is active. We
7367          * can't de-activate Hot Standby, so there's no need to keep checking
7368          * after the shared variable has once been seen true.
7369          */
7370         if (LocalHotStandbyActive)
7371                 return true;
7372         else
7373         {
7374                 /* use volatile pointer to prevent code rearrangement */
7375                 volatile XLogCtlData *xlogctl = XLogCtl;
7376
7377                 /* spinlock is essential on machines with weak memory ordering! */
7378                 SpinLockAcquire(&xlogctl->info_lck);
7379                 LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
7380                 SpinLockRelease(&xlogctl->info_lck);
7381
7382                 return LocalHotStandbyActive;
7383         }
7384 }
7385
7386 /*
7387  * Like HotStandbyActive(), but to be used only in WAL replay code,
7388  * where we don't need to ask any other process what the state is.
7389  */
7390 bool
7391 HotStandbyActiveInReplay(void)
7392 {
7393         Assert(AmStartupProcess());
7394         return LocalHotStandbyActive;
7395 }
7396
7397 /*
7398  * Is this process allowed to insert new WAL records?
7399  *
7400  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7401  * But we also have provisions for forcing the result "true" or "false"
7402  * within specific processes regardless of the global state.
7403  */
7404 bool
7405 XLogInsertAllowed(void)
7406 {
7407         /*
7408          * If value is "unconditionally true" or "unconditionally false", just
7409          * return it.  This provides the normal fast path once recovery is known
7410          * done.
7411          */
7412         if (LocalXLogInsertAllowed >= 0)
7413                 return (bool) LocalXLogInsertAllowed;
7414
7415         /*
7416          * Else, must check to see if we're still in recovery.
7417          */
7418         if (RecoveryInProgress())
7419                 return false;
7420
7421         /*
7422          * On exit from recovery, reset to "unconditionally true", since there is
7423          * no need to keep checking.
7424          */
7425         LocalXLogInsertAllowed = 1;
7426         return true;
7427 }
7428
7429 /*
7430  * Make XLogInsertAllowed() return true in the current process only.
7431  *
7432  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
7433  * and even call LocalSetXLogInsertAllowed() again after that.
7434  */
7435 static void
7436 LocalSetXLogInsertAllowed(void)
7437 {
7438         Assert(LocalXLogInsertAllowed == -1);
7439         LocalXLogInsertAllowed = 1;
7440
7441         /* Initialize as RecoveryInProgress() would do when switching state */
7442         InitXLOGAccess();
7443 }
7444
7445 /*
7446  * Subroutine to try to fetch and validate a prior checkpoint record.
7447  *
7448  * whichChkpt identifies the checkpoint (merely for reporting purposes).
7449  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7450  */
7451 static XLogRecord *
7452 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
7453                                          int whichChkpt, bool report)
7454 {
7455         XLogRecord *record;
7456
7457         if (!XRecOffIsValid(RecPtr))
7458         {
7459                 if (!report)
7460                         return NULL;
7461
7462                 switch (whichChkpt)
7463                 {
7464                         case 1:
7465                                 ereport(LOG,
7466                                 (errmsg("invalid primary checkpoint link in control file")));
7467                                 break;
7468                         case 2:
7469                                 ereport(LOG,
7470                                                 (errmsg("invalid secondary checkpoint link in control file")));
7471                                 break;
7472                         default:
7473                                 ereport(LOG,
7474                                    (errmsg("invalid checkpoint link in backup_label file")));
7475                                 break;
7476                 }
7477                 return NULL;
7478         }
7479
7480         record = ReadRecord(xlogreader, RecPtr, LOG, true);
7481
7482         if (record == NULL)
7483         {
7484                 if (!report)
7485                         return NULL;
7486
7487                 switch (whichChkpt)
7488                 {
7489                         case 1:
7490                                 ereport(LOG,
7491                                                 (errmsg("invalid primary checkpoint record")));
7492                                 break;
7493                         case 2:
7494                                 ereport(LOG,
7495                                                 (errmsg("invalid secondary checkpoint record")));
7496                                 break;
7497                         default:
7498                                 ereport(LOG,
7499                                                 (errmsg("invalid checkpoint record")));
7500                                 break;
7501                 }
7502                 return NULL;
7503         }
7504         if (record->xl_rmid != RM_XLOG_ID)
7505         {
7506                 switch (whichChkpt)
7507                 {
7508                         case 1:
7509                                 ereport(LOG,
7510                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
7511                                 break;
7512                         case 2:
7513                                 ereport(LOG,
7514                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
7515                                 break;
7516                         default:
7517                                 ereport(LOG,
7518                                 (errmsg("invalid resource manager ID in checkpoint record")));
7519                                 break;
7520                 }
7521                 return NULL;
7522         }
7523         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
7524                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
7525         {
7526                 switch (whichChkpt)
7527                 {
7528                         case 1:
7529                                 ereport(LOG,
7530                                    (errmsg("invalid xl_info in primary checkpoint record")));
7531                                 break;
7532                         case 2:
7533                                 ereport(LOG,
7534                                  (errmsg("invalid xl_info in secondary checkpoint record")));
7535                                 break;
7536                         default:
7537                                 ereport(LOG,
7538                                                 (errmsg("invalid xl_info in checkpoint record")));
7539                                 break;
7540                 }
7541                 return NULL;
7542         }
7543         if (record->xl_len != sizeof(CheckPoint) ||
7544                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
7545         {
7546                 switch (whichChkpt)
7547                 {
7548                         case 1:
7549                                 ereport(LOG,
7550                                         (errmsg("invalid length of primary checkpoint record")));
7551                                 break;
7552                         case 2:
7553                                 ereport(LOG,
7554                                   (errmsg("invalid length of secondary checkpoint record")));
7555                                 break;
7556                         default:
7557                                 ereport(LOG,
7558                                                 (errmsg("invalid length of checkpoint record")));
7559                                 break;
7560                 }
7561                 return NULL;
7562         }
7563         return record;
7564 }
7565
7566 /*
7567  * This must be called during startup of a backend process, except that
7568  * it need not be called in a standalone backend (which does StartupXLOG
7569  * instead).  We need to initialize the local copies of ThisTimeLineID and
7570  * RedoRecPtr.
7571  *
7572  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
7573  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
7574  * unnecessary however, since the postmaster itself never touches XLOG anyway.
7575  */
7576 void
7577 InitXLOGAccess(void)
7578 {
7579         /* ThisTimeLineID doesn't change so we need no lock to copy it */
7580         ThisTimeLineID = XLogCtl->ThisTimeLineID;
7581         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
7582
7583         /* Initialize our copy of WALInsertLocks and register the tranche */
7584         WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
7585         LWLockRegisterTranche(XLogCtl->Insert.WALInsertLockTrancheId,
7586                                                   &XLogCtl->Insert.WALInsertLockTranche);
7587
7588         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
7589         (void) GetRedoRecPtr();
7590 }
7591
7592 /*
7593  * Return the current Redo pointer from shared memory.
7594  *
7595  * As a side-effect, the local RedoRecPtr copy is updated.
7596  */
7597 XLogRecPtr
7598 GetRedoRecPtr(void)
7599 {
7600         /* use volatile pointer to prevent code rearrangement */
7601         volatile XLogCtlData *xlogctl = XLogCtl;
7602         XLogRecPtr ptr;
7603
7604         /*
7605          * The possibly not up-to-date copy in XlogCtl is enough. Even if we
7606          * grabbed a WAL insertion lock to read the master copy, someone might
7607          * update it just after we've released the lock.
7608          */
7609         SpinLockAcquire(&xlogctl->info_lck);
7610         ptr = xlogctl->RedoRecPtr;
7611         SpinLockRelease(&xlogctl->info_lck);
7612
7613         if (RedoRecPtr < ptr)
7614                 RedoRecPtr = ptr;
7615
7616         return RedoRecPtr;
7617 }
7618
7619 /*
7620  * GetInsertRecPtr -- Returns the current insert position.
7621  *
7622  * NOTE: The value *actually* returned is the position of the last full
7623  * xlog page. It lags behind the real insert position by at most 1 page.
7624  * For that, we don't need to scan through WAL insertion locks, and an
7625  * approximation is enough for the current usage of this function.
7626  */
7627 XLogRecPtr
7628 GetInsertRecPtr(void)
7629 {
7630         /* use volatile pointer to prevent code rearrangement */
7631         volatile XLogCtlData *xlogctl = XLogCtl;
7632         XLogRecPtr      recptr;
7633
7634         SpinLockAcquire(&xlogctl->info_lck);
7635         recptr = xlogctl->LogwrtRqst.Write;
7636         SpinLockRelease(&xlogctl->info_lck);
7637
7638         return recptr;
7639 }
7640
7641 /*
7642  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
7643  * position known to be fsync'd to disk.
7644  */
7645 XLogRecPtr
7646 GetFlushRecPtr(void)
7647 {
7648         /* use volatile pointer to prevent code rearrangement */
7649         volatile XLogCtlData *xlogctl = XLogCtl;
7650         XLogRecPtr      recptr;
7651
7652         SpinLockAcquire(&xlogctl->info_lck);
7653         recptr = xlogctl->LogwrtResult.Flush;
7654         SpinLockRelease(&xlogctl->info_lck);
7655
7656         return recptr;
7657 }
7658
7659 /*
7660  * Get the time of the last xlog segment switch
7661  */
7662 pg_time_t
7663 GetLastSegSwitchTime(void)
7664 {
7665         pg_time_t       result;
7666
7667         /* Need WALWriteLock, but shared lock is sufficient */
7668         LWLockAcquire(WALWriteLock, LW_SHARED);
7669         result = XLogCtl->lastSegSwitchTime;
7670         LWLockRelease(WALWriteLock);
7671
7672         return result;
7673 }
7674
7675 /*
7676  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
7677  *
7678  * This is exported for use by code that would like to have 64-bit XIDs.
7679  * We don't really support such things, but all XIDs within the system
7680  * can be presumed "close to" the result, and thus the epoch associated
7681  * with them can be determined.
7682  */
7683 void
7684 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
7685 {
7686         uint32          ckptXidEpoch;
7687         TransactionId ckptXid;
7688         TransactionId nextXid;
7689
7690         /* Must read checkpoint info first, else have race condition */
7691         {
7692                 /* use volatile pointer to prevent code rearrangement */
7693                 volatile XLogCtlData *xlogctl = XLogCtl;
7694
7695                 SpinLockAcquire(&xlogctl->info_lck);
7696                 ckptXidEpoch = xlogctl->ckptXidEpoch;
7697                 ckptXid = xlogctl->ckptXid;
7698                 SpinLockRelease(&xlogctl->info_lck);
7699         }
7700
7701         /* Now fetch current nextXid */
7702         nextXid = ReadNewTransactionId();
7703
7704         /*
7705          * nextXid is certainly logically later than ckptXid.  So if it's
7706          * numerically less, it must have wrapped into the next epoch.
7707          */
7708         if (nextXid < ckptXid)
7709                 ckptXidEpoch++;
7710
7711         *xid = nextXid;
7712         *epoch = ckptXidEpoch;
7713 }
7714
7715 /*
7716  * This must be called ONCE during postmaster or standalone-backend shutdown
7717  */
7718 void
7719 ShutdownXLOG(int code, Datum arg)
7720 {
7721         /* Don't be chatty in standalone mode */
7722         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
7723                         (errmsg("shutting down")));
7724
7725         if (RecoveryInProgress())
7726                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7727         else
7728         {
7729                 /*
7730                  * If archiving is enabled, rotate the last XLOG file so that all the
7731                  * remaining records are archived (postmaster wakes up the archiver
7732                  * process one more time at the end of shutdown). The checkpoint
7733                  * record will go to the next XLOG file and won't be archived (yet).
7734                  */
7735                 if (XLogArchivingActive() && XLogArchiveCommandSet())
7736                         RequestXLogSwitch();
7737
7738                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7739         }
7740         ShutdownCLOG();
7741         ShutdownSUBTRANS();
7742         ShutdownMultiXact();
7743
7744         /* Don't be chatty in standalone mode */
7745         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
7746                         (errmsg("database system is shut down")));
7747 }
7748
7749 /*
7750  * Log start of a checkpoint.
7751  */
7752 static void
7753 LogCheckpointStart(int flags, bool restartpoint)
7754 {
7755         const char *msg;
7756
7757         /*
7758          * XXX: This is hopelessly untranslatable. We could call gettext_noop for
7759          * the main message, but what about all the flags?
7760          */
7761         if (restartpoint)
7762                 msg = "restartpoint starting:%s%s%s%s%s%s%s";
7763         else
7764                 msg = "checkpoint starting:%s%s%s%s%s%s%s";
7765
7766         elog(LOG, msg,
7767                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
7768                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
7769                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
7770                  (flags & CHECKPOINT_FORCE) ? " force" : "",
7771                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
7772                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
7773                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
7774 }
7775
7776 /*
7777  * Log end of a checkpoint.
7778  */
7779 static void
7780 LogCheckpointEnd(bool restartpoint)
7781 {
7782         long            write_secs,
7783                                 sync_secs,
7784                                 total_secs,
7785                                 longest_secs,
7786                                 average_secs;
7787         int                     write_usecs,
7788                                 sync_usecs,
7789                                 total_usecs,
7790                                 longest_usecs,
7791                                 average_usecs;
7792         uint64          average_sync_time;
7793
7794         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
7795
7796         TimestampDifference(CheckpointStats.ckpt_write_t,
7797                                                 CheckpointStats.ckpt_sync_t,
7798                                                 &write_secs, &write_usecs);
7799
7800         TimestampDifference(CheckpointStats.ckpt_sync_t,
7801                                                 CheckpointStats.ckpt_sync_end_t,
7802                                                 &sync_secs, &sync_usecs);
7803
7804         /* Accumulate checkpoint timing summary data, in milliseconds. */
7805         BgWriterStats.m_checkpoint_write_time +=
7806                 write_secs * 1000 + write_usecs / 1000;
7807         BgWriterStats.m_checkpoint_sync_time +=
7808                 sync_secs * 1000 + sync_usecs / 1000;
7809
7810         /*
7811          * All of the published timing statistics are accounted for.  Only
7812          * continue if a log message is to be written.
7813          */
7814         if (!log_checkpoints)
7815                 return;
7816
7817         TimestampDifference(CheckpointStats.ckpt_start_t,
7818                                                 CheckpointStats.ckpt_end_t,
7819                                                 &total_secs, &total_usecs);
7820
7821         /*
7822          * Timing values returned from CheckpointStats are in microseconds.
7823          * Convert to the second plus microsecond form that TimestampDifference
7824          * returns for homogeneous printing.
7825          */
7826         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
7827         longest_usecs = CheckpointStats.ckpt_longest_sync -
7828                 (uint64) longest_secs *1000000;
7829
7830         average_sync_time = 0;
7831         if (CheckpointStats.ckpt_sync_rels > 0)
7832                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
7833                         CheckpointStats.ckpt_sync_rels;
7834         average_secs = (long) (average_sync_time / 1000000);
7835         average_usecs = average_sync_time - (uint64) average_secs *1000000;
7836
7837         if (restartpoint)
7838                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
7839                          "%d transaction log file(s) added, %d removed, %d recycled; "
7840                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
7841                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
7842                          CheckpointStats.ckpt_bufs_written,
7843                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7844                          CheckpointStats.ckpt_segs_added,
7845                          CheckpointStats.ckpt_segs_removed,
7846                          CheckpointStats.ckpt_segs_recycled,
7847                          write_secs, write_usecs / 1000,
7848                          sync_secs, sync_usecs / 1000,
7849                          total_secs, total_usecs / 1000,
7850                          CheckpointStats.ckpt_sync_rels,
7851                          longest_secs, longest_usecs / 1000,
7852                          average_secs, average_usecs / 1000);
7853         else
7854                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
7855                          "%d transaction log file(s) added, %d removed, %d recycled; "
7856                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
7857                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
7858                          CheckpointStats.ckpt_bufs_written,
7859                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7860                          CheckpointStats.ckpt_segs_added,
7861                          CheckpointStats.ckpt_segs_removed,
7862                          CheckpointStats.ckpt_segs_recycled,
7863                          write_secs, write_usecs / 1000,
7864                          sync_secs, sync_usecs / 1000,
7865                          total_secs, total_usecs / 1000,
7866                          CheckpointStats.ckpt_sync_rels,
7867                          longest_secs, longest_usecs / 1000,
7868                          average_secs, average_usecs / 1000);
7869 }
7870
7871 /*
7872  * Perform a checkpoint --- either during shutdown, or on-the-fly
7873  *
7874  * flags is a bitwise OR of the following:
7875  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
7876  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
7877  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
7878  *              ignoring checkpoint_completion_target parameter.
7879  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
7880  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
7881  *              CHECKPOINT_END_OF_RECOVERY).
7882  *
7883  * Note: flags contains other bits, of interest here only for logging purposes.
7884  * In particular note that this routine is synchronous and does not pay
7885  * attention to CHECKPOINT_WAIT.
7886  *
7887  * If !shutdown then we are writing an online checkpoint. This is a very special
7888  * kind of operation and WAL record because the checkpoint action occurs over
7889  * a period of time yet logically occurs at just a single LSN. The logical
7890  * position of the WAL record (redo ptr) is the same or earlier than the
7891  * physical position. When we replay WAL we locate the checkpoint via its
7892  * physical position then read the redo ptr and actually start replay at the
7893  * earlier logical position. Note that we don't write *anything* to WAL at
7894  * the logical position, so that location could be any other kind of WAL record.
7895  * All of this mechanism allows us to continue working while we checkpoint.
7896  * As a result, timing of actions is critical here and be careful to note that
7897  * this function will likely take minutes to execute on a busy system.
7898  */
7899 void
7900 CreateCheckPoint(int flags)
7901 {
7902         /* use volatile pointer to prevent code rearrangement */
7903         volatile XLogCtlData *xlogctl = XLogCtl;
7904         bool            shutdown;
7905         CheckPoint      checkPoint;
7906         XLogRecPtr      recptr;
7907         XLogCtlInsert *Insert = &XLogCtl->Insert;
7908         XLogRecData rdata;
7909         uint32          freespace;
7910         XLogSegNo       _logSegNo;
7911         XLogRecPtr      curInsert;
7912         VirtualTransactionId *vxids;
7913         int                     nvxids;
7914
7915         /*
7916          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
7917          * issued at a different time.
7918          */
7919         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
7920                 shutdown = true;
7921         else
7922                 shutdown = false;
7923
7924         /* sanity check */
7925         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
7926                 elog(ERROR, "can't create a checkpoint during recovery");
7927
7928         /*
7929          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
7930          * (This is just pro forma, since in the present system structure there is
7931          * only one process that is allowed to issue checkpoints at any given
7932          * time.)
7933          */
7934         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
7935
7936         /*
7937          * Prepare to accumulate statistics.
7938          *
7939          * Note: because it is possible for log_checkpoints to change while a
7940          * checkpoint proceeds, we always accumulate stats, even if
7941          * log_checkpoints is currently off.
7942          */
7943         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7944         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7945
7946         /*
7947          * Use a critical section to force system panic if we have trouble.
7948          */
7949         START_CRIT_SECTION();
7950
7951         if (shutdown)
7952         {
7953                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7954                 ControlFile->state = DB_SHUTDOWNING;
7955                 ControlFile->time = (pg_time_t) time(NULL);
7956                 UpdateControlFile();
7957                 LWLockRelease(ControlFileLock);
7958         }
7959
7960         /*
7961          * Let smgr prepare for checkpoint; this has to happen before we determine
7962          * the REDO pointer.  Note that smgr must not do anything that'd have to
7963          * be undone if we decide no checkpoint is needed.
7964          */
7965         smgrpreckpt();
7966
7967         /* Begin filling in the checkpoint WAL record */
7968         MemSet(&checkPoint, 0, sizeof(checkPoint));
7969         checkPoint.time = (pg_time_t) time(NULL);
7970
7971         /*
7972          * For Hot Standby, derive the oldestActiveXid before we fix the redo
7973          * pointer. This allows us to begin accumulating changes to assemble our
7974          * starting snapshot of locks and transactions.
7975          */
7976         if (!shutdown && XLogStandbyInfoActive())
7977                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
7978         else
7979                 checkPoint.oldestActiveXid = InvalidTransactionId;
7980
7981         /*
7982          * We must block concurrent insertions while examining insert state to
7983          * determine the checkpoint REDO pointer.
7984          */
7985         WALInsertLockAcquireExclusive();
7986         curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
7987
7988         /*
7989          * If this isn't a shutdown or forced checkpoint, and we have not inserted
7990          * any XLOG records since the start of the last checkpoint, skip the
7991          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
7992          * when the system is idle. That wastes log space, and more importantly it
7993          * exposes us to possible loss of both current and previous checkpoint
7994          * records if the machine crashes just as we're writing the update.
7995          * (Perhaps it'd make even more sense to checkpoint only when the previous
7996          * checkpoint record is in a different xlog page?)
7997          *
7998          * We have to make two tests to determine that nothing has happened since
7999          * the start of the last checkpoint: current insertion point must match
8000          * the end of the last checkpoint record, and its redo pointer must point
8001          * to itself.
8002          */
8003         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8004                                   CHECKPOINT_FORCE)) == 0)
8005         {
8006                 if (curInsert == ControlFile->checkPoint +
8007                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
8008                         ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
8009                 {
8010                         WALInsertLockRelease();
8011                         LWLockRelease(CheckpointLock);
8012                         END_CRIT_SECTION();
8013                         return;
8014                 }
8015         }
8016
8017         /*
8018          * An end-of-recovery checkpoint is created before anyone is allowed to
8019          * write WAL. To allow us to write the checkpoint record, temporarily
8020          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
8021          * initialized, which we need here and in AdvanceXLInsertBuffer.)
8022          */
8023         if (flags & CHECKPOINT_END_OF_RECOVERY)
8024                 LocalSetXLogInsertAllowed();
8025
8026         checkPoint.ThisTimeLineID = ThisTimeLineID;
8027         if (flags & CHECKPOINT_END_OF_RECOVERY)
8028                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8029         else
8030                 checkPoint.PrevTimeLineID = ThisTimeLineID;
8031
8032         checkPoint.fullPageWrites = Insert->fullPageWrites;
8033
8034         /*
8035          * Compute new REDO record ptr = location of next XLOG record.
8036          *
8037          * NB: this is NOT necessarily where the checkpoint record itself will be,
8038          * since other backends may insert more XLOG records while we're off doing
8039          * the buffer flush work.  Those XLOG records are logically after the
8040          * checkpoint, even though physically before it.  Got that?
8041          */
8042         freespace = INSERT_FREESPACE(curInsert);
8043         if (freespace == 0)
8044         {
8045                 if (curInsert % XLogSegSize == 0)
8046                         curInsert += SizeOfXLogLongPHD;
8047                 else
8048                         curInsert += SizeOfXLogShortPHD;
8049         }
8050         checkPoint.redo = curInsert;
8051
8052         /*
8053          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8054          * must be done while holding all the insertion locks.
8055          *
8056          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8057          * pointing past where it really needs to point.  This is okay; the only
8058          * consequence is that XLogInsert might back up whole buffers that it
8059          * didn't really need to.  We can't postpone advancing RedoRecPtr because
8060          * XLogInserts that happen while we are dumping buffers must assume that
8061          * their buffer changes are not included in the checkpoint.
8062          */
8063         RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
8064
8065         /*
8066          * Now we can release the WAL insertion locks, allowing other xacts to
8067          * proceed while we are flushing disk buffers.
8068          */
8069         WALInsertLockRelease();
8070
8071         /* Update the info_lck-protected copy of RedoRecPtr as well */
8072         SpinLockAcquire(&xlogctl->info_lck);
8073         xlogctl->RedoRecPtr = checkPoint.redo;
8074         SpinLockRelease(&xlogctl->info_lck);
8075
8076         /*
8077          * If enabled, log checkpoint start.  We postpone this until now so as not
8078          * to log anything if we decided to skip the checkpoint.
8079          */
8080         if (log_checkpoints)
8081                 LogCheckpointStart(flags, false);
8082
8083         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8084
8085         /*
8086          * In some cases there are groups of actions that must all occur on one
8087          * side or the other of a checkpoint record. Before flushing the
8088          * checkpoint record we must explicitly wait for any backend currently
8089          * performing those groups of actions.
8090          *
8091          * One example is end of transaction, so we must wait for any transactions
8092          * that are currently in commit critical sections.      If an xact inserted
8093          * its commit record into XLOG just before the REDO point, then a crash
8094          * restart from the REDO point would not replay that record, which means
8095          * that our flushing had better include the xact's update of pg_clog.  So
8096          * we wait till he's out of his commit critical section before proceeding.
8097          * See notes in RecordTransactionCommit().
8098          *
8099          * Because we've already released the insertion locks, this test is a bit
8100          * fuzzy: it is possible that we will wait for xacts we didn't really need
8101          * to wait for.  But the delay should be short and it seems better to make
8102          * checkpoint take a bit longer than to hold off insertions longer than
8103          * necessary.
8104          * (In fact, the whole reason we have this issue is that xact.c does
8105          * commit record XLOG insertion and clog update as two separate steps
8106          * protected by different locks, but again that seems best on grounds of
8107          * minimizing lock contention.)
8108          *
8109          * A transaction that has not yet set delayChkpt when we look cannot be at
8110          * risk, since he's not inserted his commit record yet; and one that's
8111          * already cleared it is not at risk either, since he's done fixing clog
8112          * and we will correctly flush the update below.  So we cannot miss any
8113          * xacts we need to wait for.
8114          */
8115         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8116         if (nvxids > 0)
8117         {
8118                 do
8119                 {
8120                         pg_usleep(10000L);      /* wait for 10 msec */
8121                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8122         }
8123         pfree(vxids);
8124
8125         /*
8126          * Get the other info we need for the checkpoint record.
8127          */
8128         LWLockAcquire(XidGenLock, LW_SHARED);
8129         checkPoint.nextXid = ShmemVariableCache->nextXid;
8130         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8131         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8132         LWLockRelease(XidGenLock);
8133
8134         /* Increase XID epoch if we've wrapped around since last checkpoint */
8135         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8136         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8137                 checkPoint.nextXidEpoch++;
8138
8139         LWLockAcquire(OidGenLock, LW_SHARED);
8140         checkPoint.nextOid = ShmemVariableCache->nextOid;
8141         if (!shutdown)
8142                 checkPoint.nextOid += ShmemVariableCache->oidCount;
8143         LWLockRelease(OidGenLock);
8144
8145         MultiXactGetCheckptMulti(shutdown,
8146                                                          &checkPoint.nextMulti,
8147                                                          &checkPoint.nextMultiOffset,
8148                                                          &checkPoint.oldestMulti,
8149                                                          &checkPoint.oldestMultiDB);
8150
8151         /*
8152          * Having constructed the checkpoint record, ensure all shmem disk buffers
8153          * and commit-log buffers are flushed to disk.
8154          *
8155          * This I/O could fail for various reasons.  If so, we will fail to
8156          * complete the checkpoint, but there is no reason to force a system
8157          * panic. Accordingly, exit critical section while doing it.
8158          */
8159         END_CRIT_SECTION();
8160
8161         CheckPointGuts(checkPoint.redo, flags);
8162
8163         /*
8164          * Take a snapshot of running transactions and write this to WAL. This
8165          * allows us to reconstruct the state of running transactions during
8166          * archive recovery, if required. Skip, if this info disabled.
8167          *
8168          * If we are shutting down, or Startup process is completing crash
8169          * recovery we don't need to write running xact data.
8170          */
8171         if (!shutdown && XLogStandbyInfoActive())
8172                 LogStandbySnapshot();
8173
8174         START_CRIT_SECTION();
8175
8176         /*
8177          * Now insert the checkpoint record into XLOG.
8178          */
8179         rdata.data = (char *) (&checkPoint);
8180         rdata.len = sizeof(checkPoint);
8181         rdata.buffer = InvalidBuffer;
8182         rdata.next = NULL;
8183
8184         recptr = XLogInsert(RM_XLOG_ID,
8185                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
8186                                                 XLOG_CHECKPOINT_ONLINE,
8187                                                 &rdata);
8188
8189         XLogFlush(recptr);
8190
8191         /*
8192          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
8193          * overwritten at next startup.  No-one should even try, this just allows
8194          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
8195          * to just temporarily disable writing until the system has exited
8196          * recovery.
8197          */
8198         if (shutdown)
8199         {
8200                 if (flags & CHECKPOINT_END_OF_RECOVERY)
8201                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
8202                 else
8203                         LocalXLogInsertAllowed = 0; /* never again write WAL */
8204         }
8205
8206         /*
8207          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
8208          * = end of actual checkpoint record.
8209          */
8210         if (shutdown && checkPoint.redo != ProcLastRecPtr)
8211                 ereport(PANIC,
8212                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
8213
8214         /*
8215          * Select point at which we can truncate the log, which we base on the
8216          * prior checkpoint's earliest info.
8217          */
8218         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8219
8220         /*
8221          * Update the control file.
8222          */
8223         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8224         if (shutdown)
8225                 ControlFile->state = DB_SHUTDOWNED;
8226         ControlFile->prevCheckPoint = ControlFile->checkPoint;
8227         ControlFile->checkPoint = ProcLastRecPtr;
8228         ControlFile->checkPointCopy = checkPoint;
8229         ControlFile->time = (pg_time_t) time(NULL);
8230         /* crash recovery should always recover to the end of WAL */
8231         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
8232         ControlFile->minRecoveryPointTLI = 0;
8233
8234         /*
8235          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
8236          * unused on non-shutdown checkpoints, but seems useful to store it always
8237          * for debugging purposes.
8238          */
8239         SpinLockAcquire(&XLogCtl->ulsn_lck);
8240         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
8241         SpinLockRelease(&XLogCtl->ulsn_lck);
8242
8243         UpdateControlFile();
8244         LWLockRelease(ControlFileLock);
8245
8246         /* Update shared-memory copy of checkpoint XID/epoch */
8247         {
8248                 /* use volatile pointer to prevent code rearrangement */
8249                 volatile XLogCtlData *xlogctl = XLogCtl;
8250
8251                 SpinLockAcquire(&xlogctl->info_lck);
8252                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
8253                 xlogctl->ckptXid = checkPoint.nextXid;
8254                 SpinLockRelease(&xlogctl->info_lck);
8255         }
8256
8257         /*
8258          * We are now done with critical updates; no need for system panic if we
8259          * have trouble while fooling with old log segments.
8260          */
8261         END_CRIT_SECTION();
8262
8263         /*
8264          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
8265          */
8266         smgrpostckpt();
8267
8268         /*
8269          * Delete old log files (those no longer needed even for previous
8270          * checkpoint or the standbys in XLOG streaming).
8271          */
8272         if (_logSegNo)
8273         {
8274                 KeepLogSeg(recptr, &_logSegNo);
8275                 _logSegNo--;
8276                 RemoveOldXlogFiles(_logSegNo, recptr);
8277         }
8278
8279         /*
8280          * Make more log segments if needed.  (Do this after recycling old log
8281          * segments, since that may supply some of the needed files.)
8282          */
8283         if (!shutdown)
8284                 PreallocXlogFiles(recptr);
8285
8286         /*
8287          * Truncate pg_subtrans if possible.  We can throw away all data before
8288          * the oldest XMIN of any running transaction.  No future transaction will
8289          * attempt to reference any pg_subtrans entry older than that (see Asserts
8290          * in subtrans.c).      During recovery, though, we mustn't do this because
8291          * StartupSUBTRANS hasn't been called yet.
8292          */
8293         if (!RecoveryInProgress())
8294                 TruncateSUBTRANS(GetOldestXmin(NULL, false));
8295
8296         /* Real work is done, but log and update stats before releasing lock. */
8297         LogCheckpointEnd(false);
8298
8299         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
8300                                                                          NBuffers,
8301                                                                          CheckpointStats.ckpt_segs_added,
8302                                                                          CheckpointStats.ckpt_segs_removed,
8303                                                                          CheckpointStats.ckpt_segs_recycled);
8304
8305         LWLockRelease(CheckpointLock);
8306 }
8307
8308 /*
8309  * Mark the end of recovery in WAL though without running a full checkpoint.
8310  * We can expect that a restartpoint is likely to be in progress as we
8311  * do this, though we are unwilling to wait for it to complete. So be
8312  * careful to avoid taking the CheckpointLock anywhere here.
8313  *
8314  * CreateRestartPoint() allows for the case where recovery may end before
8315  * the restartpoint completes so there is no concern of concurrent behaviour.
8316  */
8317 void
8318 CreateEndOfRecoveryRecord(void)
8319 {
8320         xl_end_of_recovery xlrec;
8321         XLogRecData rdata;
8322         XLogRecPtr      recptr;
8323
8324         /* sanity check */
8325         if (!RecoveryInProgress())
8326                 elog(ERROR, "can only be used to end recovery");
8327
8328         xlrec.end_time = time(NULL);
8329
8330         WALInsertLockAcquireExclusive();
8331         xlrec.ThisTimeLineID = ThisTimeLineID;
8332         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8333         WALInsertLockRelease();
8334
8335         LocalSetXLogInsertAllowed();
8336
8337         START_CRIT_SECTION();
8338
8339         rdata.data = (char *) &xlrec;
8340         rdata.len = sizeof(xl_end_of_recovery);
8341         rdata.buffer = InvalidBuffer;
8342         rdata.next = NULL;
8343
8344         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
8345
8346         XLogFlush(recptr);
8347
8348         /*
8349          * Update the control file so that crash recovery can follow the timeline
8350          * changes to this point.
8351          */
8352         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8353         ControlFile->time = (pg_time_t) xlrec.end_time;
8354         ControlFile->minRecoveryPoint = recptr;
8355         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8356         UpdateControlFile();
8357         LWLockRelease(ControlFileLock);
8358
8359         END_CRIT_SECTION();
8360
8361         LocalXLogInsertAllowed = -1;    /* return to "check" state */
8362 }
8363
8364 /*
8365  * Flush all data in shared memory to disk, and fsync
8366  *
8367  * This is the common code shared between regular checkpoints and
8368  * recovery restartpoints.
8369  */
8370 static void
8371 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
8372 {
8373         CheckPointCLOG();
8374         CheckPointSUBTRANS();
8375         CheckPointMultiXact();
8376         CheckPointPredicate();
8377         CheckPointRelationMap();
8378         CheckPointReplicationSlots();
8379         CheckPointSnapBuild();
8380         CheckPointLogicalRewriteHeap();
8381         CheckPointBuffers(flags);       /* performs all required fsyncs */
8382         /* We deliberately delay 2PC checkpointing as long as possible */
8383         CheckPointTwoPhase(checkPointRedo);
8384 }
8385
8386 /*
8387  * Save a checkpoint for recovery restart if appropriate
8388  *
8389  * This function is called each time a checkpoint record is read from XLOG.
8390  * It must determine whether the checkpoint represents a safe restartpoint or
8391  * not.  If so, the checkpoint record is stashed in shared memory so that
8392  * CreateRestartPoint can consult it.  (Note that the latter function is
8393  * executed by the checkpointer, while this one will be executed by the
8394  * startup process.)
8395  */
8396 static void
8397 RecoveryRestartPoint(const CheckPoint *checkPoint)
8398 {
8399         /* use volatile pointer to prevent code rearrangement */
8400         volatile XLogCtlData *xlogctl = XLogCtl;
8401
8402         /*
8403          * Also refrain from creating a restartpoint if we have seen any
8404          * references to non-existent pages. Restarting recovery from the
8405          * restartpoint would not see the references, so we would lose the
8406          * cross-check that the pages belonged to a relation that was dropped
8407          * later.
8408          */
8409         if (XLogHaveInvalidPages())
8410         {
8411                 elog(trace_recovery(DEBUG2),
8412                          "could not record restart point at %X/%X because there "
8413                          "are unresolved references to invalid pages",
8414                          (uint32) (checkPoint->redo >> 32),
8415                          (uint32) checkPoint->redo);
8416                 return;
8417         }
8418
8419         /*
8420          * Copy the checkpoint record to shared memory, so that checkpointer can
8421          * work out the next time it wants to perform a restartpoint.
8422          */
8423         SpinLockAcquire(&xlogctl->info_lck);
8424         xlogctl->lastCheckPointRecPtr = ReadRecPtr;
8425         xlogctl->lastCheckPoint = *checkPoint;
8426         SpinLockRelease(&xlogctl->info_lck);
8427 }
8428
8429 /*
8430  * Establish a restartpoint if possible.
8431  *
8432  * This is similar to CreateCheckPoint, but is used during WAL recovery
8433  * to establish a point from which recovery can roll forward without
8434  * replaying the entire recovery log.
8435  *
8436  * Returns true if a new restartpoint was established. We can only establish
8437  * a restartpoint if we have replayed a safe checkpoint record since last
8438  * restartpoint.
8439  */
8440 bool
8441 CreateRestartPoint(int flags)
8442 {
8443         XLogRecPtr      lastCheckPointRecPtr;
8444         CheckPoint      lastCheckPoint;
8445         XLogSegNo       _logSegNo;
8446         TimestampTz xtime;
8447
8448         /* use volatile pointer to prevent code rearrangement */
8449         volatile XLogCtlData *xlogctl = XLogCtl;
8450
8451         /*
8452          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
8453          * happens at a time.
8454          */
8455         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8456
8457         /* Get a local copy of the last safe checkpoint record. */
8458         SpinLockAcquire(&xlogctl->info_lck);
8459         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
8460         lastCheckPoint = xlogctl->lastCheckPoint;
8461         SpinLockRelease(&xlogctl->info_lck);
8462
8463         /*
8464          * Check that we're still in recovery mode. It's ok if we exit recovery
8465          * mode after this check, the restart point is valid anyway.
8466          */
8467         if (!RecoveryInProgress())
8468         {
8469                 ereport(DEBUG2,
8470                           (errmsg("skipping restartpoint, recovery has already ended")));
8471                 LWLockRelease(CheckpointLock);
8472                 return false;
8473         }
8474
8475         /*
8476          * If the last checkpoint record we've replayed is already our last
8477          * restartpoint, we can't perform a new restart point. We still update
8478          * minRecoveryPoint in that case, so that if this is a shutdown restart
8479          * point, we won't start up earlier than before. That's not strictly
8480          * necessary, but when hot standby is enabled, it would be rather weird if
8481          * the database opened up for read-only connections at a point-in-time
8482          * before the last shutdown. Such time travel is still possible in case of
8483          * immediate shutdown, though.
8484          *
8485          * We don't explicitly advance minRecoveryPoint when we do create a
8486          * restartpoint. It's assumed that flushing the buffers will do that as a
8487          * side-effect.
8488          */
8489         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
8490                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
8491         {
8492                 ereport(DEBUG2,
8493                                 (errmsg("skipping restartpoint, already performed at %X/%X",
8494                                                 (uint32) (lastCheckPoint.redo >> 32),
8495                                                 (uint32) lastCheckPoint.redo)));
8496
8497                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
8498                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8499                 {
8500                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8501                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8502                         ControlFile->time = (pg_time_t) time(NULL);
8503                         UpdateControlFile();
8504                         LWLockRelease(ControlFileLock);
8505                 }
8506                 LWLockRelease(CheckpointLock);
8507                 return false;
8508         }
8509
8510         /*
8511          * Update the shared RedoRecPtr so that the startup process can calculate
8512          * the number of segments replayed since last restartpoint, and request a
8513          * restartpoint if it exceeds checkpoint_segments.
8514          *
8515          * Like in CreateCheckPoint(), hold off insertions to update it, although
8516          * during recovery this is just pro forma, because no WAL insertions are
8517          * happening.
8518          */
8519         WALInsertLockAcquireExclusive();
8520         xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
8521         WALInsertLockRelease();
8522
8523         /* Also update the info_lck-protected copy */
8524         SpinLockAcquire(&xlogctl->info_lck);
8525         xlogctl->RedoRecPtr = lastCheckPoint.redo;
8526         SpinLockRelease(&xlogctl->info_lck);
8527
8528         /*
8529          * Prepare to accumulate statistics.
8530          *
8531          * Note: because it is possible for log_checkpoints to change while a
8532          * checkpoint proceeds, we always accumulate stats, even if
8533          * log_checkpoints is currently off.
8534          */
8535         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8536         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8537
8538         if (log_checkpoints)
8539                 LogCheckpointStart(flags, true);
8540
8541         CheckPointGuts(lastCheckPoint.redo, flags);
8542
8543         /*
8544          * Select point at which we can truncate the xlog, which we base on the
8545          * prior checkpoint's earliest info.
8546          */
8547         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8548
8549         /*
8550          * Update pg_control, using current time.  Check that it still shows
8551          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
8552          * this is a quick hack to make sure nothing really bad happens if somehow
8553          * we get here after the end-of-recovery checkpoint.
8554          */
8555         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8556         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
8557                 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
8558         {
8559                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
8560                 ControlFile->checkPoint = lastCheckPointRecPtr;
8561                 ControlFile->checkPointCopy = lastCheckPoint;
8562                 ControlFile->time = (pg_time_t) time(NULL);
8563                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8564                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8565                 UpdateControlFile();
8566         }
8567         LWLockRelease(ControlFileLock);
8568
8569         /*
8570          * Due to an historical accident multixact truncations are not WAL-logged,
8571          * but just performed everytime the mxact horizon is increased. So, unless
8572          * we explicitly execute truncations on a standby it will never clean out
8573          * /pg_multixact which obviously is bad, both because it uses space and
8574          * because we can wrap around into pre-existing data...
8575          *
8576          * We can only do the truncation here, after the UpdateControlFile()
8577          * above, because we've now safely established a restart point, that
8578          * guarantees we will not need need to access those multis.
8579          *
8580          * It's probably worth improving this.
8581          */
8582         TruncateMultiXact(lastCheckPoint.oldestMulti);
8583
8584         /*
8585          * Delete old log files (those no longer needed even for previous
8586          * checkpoint/restartpoint) to prevent the disk holding the xlog from
8587          * growing full.
8588          */
8589         if (_logSegNo)
8590         {
8591                 XLogRecPtr      receivePtr;
8592                 XLogRecPtr      replayPtr;
8593                 TimeLineID      replayTLI;
8594                 XLogRecPtr      endptr;
8595
8596                 /*
8597                  * Get the current end of xlog replayed or received, whichever is
8598                  * later.
8599                  */
8600                 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
8601                 replayPtr = GetXLogReplayRecPtr(&replayTLI);
8602                 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
8603
8604                 KeepLogSeg(endptr, &_logSegNo);
8605                 _logSegNo--;
8606
8607                 /*
8608                  * Try to recycle segments on a useful timeline. If we've been promoted
8609                  * since the beginning of this restartpoint, use the new timeline
8610                  * chosen at end of recovery (RecoveryInProgress() sets ThisTimeLineID
8611                  * in that case). If we're still in recovery, use the timeline we're
8612                  * currently replaying.
8613                  *
8614                  * There is no guarantee that the WAL segments will be useful on the
8615                  * current timeline; if recovery proceeds to a new timeline right
8616                  * after this, the pre-allocated WAL segments on this timeline will
8617                  * not be used, and will go wasted until recycled on the next
8618                  * restartpoint. We'll live with that.
8619                  */
8620                 if (RecoveryInProgress())
8621                         ThisTimeLineID = replayTLI;
8622
8623                 RemoveOldXlogFiles(_logSegNo, endptr);
8624
8625                 /*
8626                  * Make more log segments if needed.  (Do this after recycling old log
8627                  * segments, since that may supply some of the needed files.)
8628                  */
8629                 PreallocXlogFiles(endptr);
8630
8631                 /*
8632                  * ThisTimeLineID is normally not set when we're still in recovery.
8633                  * However, recycling/preallocating segments above needed
8634                  * ThisTimeLineID to determine which timeline to install the segments
8635                  * on. Reset it now, to restore the normal state of affairs for
8636                  * debugging purposes.
8637                  */
8638                 if (RecoveryInProgress())
8639                         ThisTimeLineID = 0;
8640         }
8641
8642         /*
8643          * Truncate pg_subtrans if possible.  We can throw away all data before
8644          * the oldest XMIN of any running transaction.  No future transaction will
8645          * attempt to reference any pg_subtrans entry older than that (see Asserts
8646          * in subtrans.c).      When hot standby is disabled, though, we mustn't do
8647          * this because StartupSUBTRANS hasn't been called yet.
8648          */
8649         if (EnableHotStandby)
8650                 TruncateSUBTRANS(GetOldestXmin(NULL, false));
8651
8652         /* Real work is done, but log and update before releasing lock. */
8653         LogCheckpointEnd(true);
8654
8655         xtime = GetLatestXTime();
8656         ereport((log_checkpoints ? LOG : DEBUG2),
8657                         (errmsg("recovery restart point at %X/%X",
8658                  (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
8659                    xtime ? errdetail("last completed transaction was at log time %s",
8660                                                          timestamptz_to_str(xtime)) : 0));
8661
8662         LWLockRelease(CheckpointLock);
8663
8664         /*
8665          * Finally, execute archive_cleanup_command, if any.
8666          */
8667         if (XLogCtl->archiveCleanupCommand[0])
8668                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
8669                                                            "archive_cleanup_command",
8670                                                            false);
8671
8672         return true;
8673 }
8674
8675 /*
8676  * Retreat *logSegNo to the last segment that we need to retain because of
8677  * either wal_keep_segments or replication slots.
8678  *
8679  * This is calculated by subtracting wal_keep_segments from the given xlog
8680  * location, recptr and by making sure that that result is below the
8681  * requirement of replication slots.
8682  */
8683 static void
8684 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
8685 {
8686         XLogSegNo       segno;
8687         XLogRecPtr      keep;
8688
8689         XLByteToSeg(recptr, segno);
8690         keep = XLogGetReplicationSlotMinimumLSN();
8691
8692         /* compute limit for wal_keep_segments first */
8693         if (wal_keep_segments > 0)
8694         {
8695                 /* avoid underflow, don't go below 1 */
8696                 if (segno <= wal_keep_segments)
8697                         segno = 1;
8698                 else
8699                         segno = segno - wal_keep_segments;
8700         }
8701
8702         /* then check whether slots limit removal further */
8703         if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
8704         {
8705                 XLogRecPtr slotSegNo;
8706
8707                 XLByteToSeg(keep, slotSegNo);
8708
8709                 if (slotSegNo <= 0)
8710                         segno = 1;
8711                 else if (slotSegNo < segno)
8712                         segno = slotSegNo;
8713         }
8714
8715         /* don't delete WAL segments newer than the calculated segment */
8716         if (segno < *logSegNo)
8717                 *logSegNo = segno;
8718 }
8719
8720 /*
8721  * Write a NEXTOID log record
8722  */
8723 void
8724 XLogPutNextOid(Oid nextOid)
8725 {
8726         XLogRecData rdata;
8727
8728         rdata.data = (char *) (&nextOid);
8729         rdata.len = sizeof(Oid);
8730         rdata.buffer = InvalidBuffer;
8731         rdata.next = NULL;
8732         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
8733
8734         /*
8735          * We need not flush the NEXTOID record immediately, because any of the
8736          * just-allocated OIDs could only reach disk as part of a tuple insert or
8737          * update that would have its own XLOG record that must follow the NEXTOID
8738          * record.      Therefore, the standard buffer LSN interlock applied to those
8739          * records will ensure no such OID reaches disk before the NEXTOID record
8740          * does.
8741          *
8742          * Note, however, that the above statement only covers state "within" the
8743          * database.  When we use a generated OID as a file or directory name, we
8744          * are in a sense violating the basic WAL rule, because that filesystem
8745          * change may reach disk before the NEXTOID WAL record does.  The impact
8746          * of this is that if a database crash occurs immediately afterward, we
8747          * might after restart re-generate the same OID and find that it conflicts
8748          * with the leftover file or directory.  But since for safety's sake we
8749          * always loop until finding a nonconflicting filename, this poses no real
8750          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
8751          */
8752 }
8753
8754 /*
8755  * Write an XLOG SWITCH record.
8756  *
8757  * Here we just blindly issue an XLogInsert request for the record.
8758  * All the magic happens inside XLogInsert.
8759  *
8760  * The return value is either the end+1 address of the switch record,
8761  * or the end+1 address of the prior segment if we did not need to
8762  * write a switch record because we are already at segment start.
8763  */
8764 XLogRecPtr
8765 RequestXLogSwitch(void)
8766 {
8767         XLogRecPtr      RecPtr;
8768         XLogRecData rdata;
8769
8770         /* XLOG SWITCH, alone among xlog record types, has no data */
8771         rdata.buffer = InvalidBuffer;
8772         rdata.data = NULL;
8773         rdata.len = 0;
8774         rdata.next = NULL;
8775
8776         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
8777
8778         return RecPtr;
8779 }
8780
8781 /*
8782  * Write a RESTORE POINT record
8783  */
8784 XLogRecPtr
8785 XLogRestorePoint(const char *rpName)
8786 {
8787         XLogRecPtr      RecPtr;
8788         XLogRecData rdata;
8789         xl_restore_point xlrec;
8790
8791         xlrec.rp_time = GetCurrentTimestamp();
8792         strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
8793
8794         rdata.buffer = InvalidBuffer;
8795         rdata.data = (char *) &xlrec;
8796         rdata.len = sizeof(xl_restore_point);
8797         rdata.next = NULL;
8798
8799         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
8800
8801         ereport(LOG,
8802                         (errmsg("restore point \"%s\" created at %X/%X",
8803                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
8804
8805         return RecPtr;
8806 }
8807
8808 /*
8809  * Write a backup block if needed when we are setting a hint. Note that
8810  * this may be called for a variety of page types, not just heaps.
8811  *
8812  * Callable while holding just share lock on the buffer content.
8813  *
8814  * We can't use the plain backup block mechanism since that relies on the
8815  * Buffer being exclusively locked. Since some modifications (setting LSN, hint
8816  * bits) are allowed in a sharelocked buffer that can lead to wal checksum
8817  * failures. So instead we copy the page and insert the copied data as normal
8818  * record data.
8819  *
8820  * We only need to do something if page has not yet been full page written in
8821  * this checkpoint round. The LSN of the inserted wal record is returned if we
8822  * had to write, InvalidXLogRecPtr otherwise.
8823  *
8824  * It is possible that multiple concurrent backends could attempt to write WAL
8825  * records. In that case, multiple copies of the same block would be recorded
8826  * in separate WAL records by different backends, though that is still OK from
8827  * a correctness perspective.
8828  */
8829 XLogRecPtr
8830 XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
8831 {
8832         XLogRecPtr      recptr = InvalidXLogRecPtr;
8833         XLogRecPtr      lsn;
8834         XLogRecData rdata[2];
8835         BkpBlock        bkpb;
8836
8837         /*
8838          * Ensure no checkpoint can change our view of RedoRecPtr.
8839          */
8840         Assert(MyPgXact->delayChkpt);
8841
8842         /*
8843          * Update RedoRecPtr so XLogCheckBuffer can make the right decision
8844          */
8845         GetRedoRecPtr();
8846
8847         /*
8848          * Setup phony rdata element for use within XLogCheckBuffer only. We reuse
8849          * and reset rdata for any actual WAL record insert.
8850          */
8851         rdata[0].buffer = buffer;
8852         rdata[0].buffer_std = buffer_std;
8853
8854         /*
8855          * Check buffer while not holding an exclusive lock.
8856          */
8857         if (XLogCheckBuffer(rdata, false, &lsn, &bkpb))
8858         {
8859                 char            copied_buffer[BLCKSZ];
8860                 char       *origdata = (char *) BufferGetBlock(buffer);
8861
8862                 /*
8863                  * Copy buffer so we don't have to worry about concurrent hint bit or
8864                  * lsn updates. We assume pd_lower/upper cannot be changed without an
8865                  * exclusive lock, so the contents bkp are not racy.
8866                  *
8867                  * With buffer_std set to false, XLogCheckBuffer() sets hole_length and
8868                  * hole_offset to 0; so the following code is safe for either case.
8869                  */
8870                 memcpy(copied_buffer, origdata, bkpb.hole_offset);
8871                 memcpy(copied_buffer + bkpb.hole_offset,
8872                            origdata + bkpb.hole_offset + bkpb.hole_length,
8873                            BLCKSZ - bkpb.hole_offset - bkpb.hole_length);
8874
8875                 /*
8876                  * Header for backup block.
8877                  */
8878                 rdata[0].data = (char *) &bkpb;
8879                 rdata[0].len = sizeof(BkpBlock);
8880                 rdata[0].buffer = InvalidBuffer;
8881                 rdata[0].next = &(rdata[1]);
8882
8883                 /*
8884                  * Save copy of the buffer.
8885                  */
8886                 rdata[1].data = copied_buffer;
8887                 rdata[1].len = BLCKSZ - bkpb.hole_length;
8888                 rdata[1].buffer = InvalidBuffer;
8889                 rdata[1].next = NULL;
8890
8891                 recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI, rdata);
8892         }
8893
8894         return recptr;
8895 }
8896
8897 /*
8898  * Check if any of the GUC parameters that are critical for hot standby
8899  * have changed, and update the value in pg_control file if necessary.
8900  */
8901 static void
8902 XLogReportParameters(void)
8903 {
8904         if (wal_level != ControlFile->wal_level ||
8905                 wal_log_hints != ControlFile->wal_log_hints ||
8906                 MaxConnections != ControlFile->MaxConnections ||
8907                 max_worker_processes != ControlFile->max_worker_processes ||
8908                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
8909                 max_locks_per_xact != ControlFile->max_locks_per_xact)
8910         {
8911                 /*
8912                  * The change in number of backend slots doesn't need to be WAL-logged
8913                  * if archiving is not enabled, as you can't start archive recovery
8914                  * with wal_level=minimal anyway. We don't really care about the
8915                  * values in pg_control either if wal_level=minimal, but seems better
8916                  * to keep them up-to-date to avoid confusion.
8917                  */
8918                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
8919                 {
8920                         XLogRecData rdata;
8921                         xl_parameter_change xlrec;
8922                         XLogRecPtr      recptr;
8923
8924                         xlrec.MaxConnections = MaxConnections;
8925                         xlrec.max_worker_processes = max_worker_processes;
8926                         xlrec.max_prepared_xacts = max_prepared_xacts;
8927                         xlrec.max_locks_per_xact = max_locks_per_xact;
8928                         xlrec.wal_level = wal_level;
8929                         xlrec.wal_log_hints = wal_log_hints;
8930
8931                         rdata.buffer = InvalidBuffer;
8932                         rdata.data = (char *) &xlrec;
8933                         rdata.len = sizeof(xlrec);
8934                         rdata.next = NULL;
8935
8936                         recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
8937                         XLogFlush(recptr);
8938                 }
8939
8940                 ControlFile->MaxConnections = MaxConnections;
8941                 ControlFile->max_worker_processes = max_worker_processes;
8942                 ControlFile->max_prepared_xacts = max_prepared_xacts;
8943                 ControlFile->max_locks_per_xact = max_locks_per_xact;
8944                 ControlFile->wal_level = wal_level;
8945                 ControlFile->wal_log_hints = wal_log_hints;
8946                 UpdateControlFile();
8947         }
8948 }
8949
8950 /*
8951  * Update full_page_writes in shared memory, and write an
8952  * XLOG_FPW_CHANGE record if necessary.
8953  *
8954  * Note: this function assumes there is no other process running
8955  * concurrently that could update it.
8956  */
8957 void
8958 UpdateFullPageWrites(void)
8959 {
8960         XLogCtlInsert *Insert = &XLogCtl->Insert;
8961
8962         /*
8963          * Do nothing if full_page_writes has not been changed.
8964          *
8965          * It's safe to check the shared full_page_writes without the lock,
8966          * because we assume that there is no concurrently running process which
8967          * can update it.
8968          */
8969         if (fullPageWrites == Insert->fullPageWrites)
8970                 return;
8971
8972         START_CRIT_SECTION();
8973
8974         /*
8975          * It's always safe to take full page images, even when not strictly
8976          * required, but not the other round. So if we're setting full_page_writes
8977          * to true, first set it true and then write the WAL record. If we're
8978          * setting it to false, first write the WAL record and then set the global
8979          * flag.
8980          */
8981         if (fullPageWrites)
8982         {
8983                 WALInsertLockAcquireExclusive();
8984                 Insert->fullPageWrites = true;
8985                 WALInsertLockRelease();
8986         }
8987
8988         /*
8989          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
8990          * full_page_writes during archive recovery, if required.
8991          */
8992         if (XLogStandbyInfoActive() && !RecoveryInProgress())
8993         {
8994                 XLogRecData rdata;
8995
8996                 rdata.data = (char *) (&fullPageWrites);
8997                 rdata.len = sizeof(bool);
8998                 rdata.buffer = InvalidBuffer;
8999                 rdata.next = NULL;
9000
9001                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
9002         }
9003
9004         if (!fullPageWrites)
9005         {
9006                 WALInsertLockAcquireExclusive();
9007                 Insert->fullPageWrites = false;
9008                 WALInsertLockRelease();
9009         }
9010         END_CRIT_SECTION();
9011 }
9012
9013 /*
9014  * Check that it's OK to switch to new timeline during recovery.
9015  *
9016  * 'lsn' is the address of the shutdown checkpoint record we're about to
9017  * replay. (Currently, timeline can only change at a shutdown checkpoint).
9018  */
9019 static void
9020 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9021 {
9022         /* Check that the record agrees on what the current (old) timeline is */
9023         if (prevTLI != ThisTimeLineID)
9024                 ereport(PANIC,
9025                                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9026                                                 prevTLI, ThisTimeLineID)));
9027
9028         /*
9029          * The new timeline better be in the list of timelines we expect to see,
9030          * according to the timeline history. It should also not decrease.
9031          */
9032         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9033                 ereport(PANIC,
9034                  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9035                                  newTLI, ThisTimeLineID)));
9036
9037         /*
9038          * If we have not yet reached min recovery point, and we're about to
9039          * switch to a timeline greater than the timeline of the min recovery
9040          * point: trouble. After switching to the new timeline, we could not
9041          * possibly visit the min recovery point on the correct timeline anymore.
9042          * This can happen if there is a newer timeline in the archive that
9043          * branched before the timeline the min recovery point is on, and you
9044          * attempt to do PITR to the new timeline.
9045          */
9046         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9047                 lsn < minRecoveryPoint &&
9048                 newTLI > minRecoveryPointTLI)
9049                 ereport(PANIC,
9050                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9051                                                 newTLI,
9052                                                 (uint32) (minRecoveryPoint >> 32),
9053                                                 (uint32) minRecoveryPoint,
9054                                                 minRecoveryPointTLI)));
9055
9056         /* Looks good */
9057 }
9058
9059 /*
9060  * XLOG resource manager's routines
9061  *
9062  * Definitions of info values are in include/catalog/pg_control.h, though
9063  * not all record types are related to control file updates.
9064  */
9065 void
9066 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
9067 {
9068         uint8           info = record->xl_info & ~XLR_INFO_MASK;
9069
9070         /* Backup blocks are not used by XLOG rmgr */
9071         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
9072
9073         if (info == XLOG_NEXTOID)
9074         {
9075                 Oid                     nextOid;
9076
9077                 /*
9078                  * We used to try to take the maximum of ShmemVariableCache->nextOid
9079                  * and the recorded nextOid, but that fails if the OID counter wraps
9080                  * around.      Since no OID allocation should be happening during replay
9081                  * anyway, better to just believe the record exactly.  We still take
9082                  * OidGenLock while setting the variable, just in case.
9083                  */
9084                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9085                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9086                 ShmemVariableCache->nextOid = nextOid;
9087                 ShmemVariableCache->oidCount = 0;
9088                 LWLockRelease(OidGenLock);
9089         }
9090         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9091         {
9092                 CheckPoint      checkPoint;
9093
9094                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9095                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
9096                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9097                 ShmemVariableCache->nextXid = checkPoint.nextXid;
9098                 LWLockRelease(XidGenLock);
9099                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9100                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9101                 ShmemVariableCache->oidCount = 0;
9102                 LWLockRelease(OidGenLock);
9103                 MultiXactSetNextMXact(checkPoint.nextMulti,
9104                                                           checkPoint.nextMultiOffset);
9105                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9106                 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
9107
9108                 /*
9109                  * If we see a shutdown checkpoint while waiting for an end-of-backup
9110                  * record, the backup was canceled and the end-of-backup record will
9111                  * never arrive.
9112                  */
9113                 if (ArchiveRecoveryRequested &&
9114                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9115                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9116                         ereport(PANIC,
9117                         (errmsg("online backup was canceled, recovery cannot continue")));
9118
9119                 /*
9120                  * If we see a shutdown checkpoint, we know that nothing was running
9121                  * on the master at this point. So fake-up an empty running-xacts
9122                  * record and use that here and now. Recover additional standby state
9123                  * for prepared transactions.
9124                  */
9125                 if (standbyState >= STANDBY_INITIALIZED)
9126                 {
9127                         TransactionId *xids;
9128                         int                     nxids;
9129                         TransactionId oldestActiveXID;
9130                         TransactionId latestCompletedXid;
9131                         RunningTransactionsData running;
9132
9133                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9134
9135                         /*
9136                          * Construct a RunningTransactions snapshot representing a shut
9137                          * down server, with only prepared transactions still alive. We're
9138                          * never overflowed at this point because all subxids are listed
9139                          * with their parent prepared transactions.
9140                          */
9141                         running.xcnt = nxids;
9142                         running.subxcnt = 0;
9143                         running.subxid_overflow = false;
9144                         running.nextXid = checkPoint.nextXid;
9145                         running.oldestRunningXid = oldestActiveXID;
9146                         latestCompletedXid = checkPoint.nextXid;
9147                         TransactionIdRetreat(latestCompletedXid);
9148                         Assert(TransactionIdIsNormal(latestCompletedXid));
9149                         running.latestCompletedXid = latestCompletedXid;
9150                         running.xids = xids;
9151
9152                         ProcArrayApplyRecoveryInfo(&running);
9153
9154                         StandbyRecoverPreparedTransactions(true);
9155                 }
9156
9157                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9158                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9159                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9160
9161                 /* Update shared-memory copy of checkpoint XID/epoch */
9162                 {
9163                         /* use volatile pointer to prevent code rearrangement */
9164                         volatile XLogCtlData *xlogctl = XLogCtl;
9165
9166                         SpinLockAcquire(&xlogctl->info_lck);
9167                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9168                         xlogctl->ckptXid = checkPoint.nextXid;
9169                         SpinLockRelease(&xlogctl->info_lck);
9170                 }
9171
9172                 /*
9173                  * We should've already switched to the new TLI before replaying this
9174                  * record.
9175                  */
9176                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9177                         ereport(PANIC,
9178                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9179                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9180
9181                 RecoveryRestartPoint(&checkPoint);
9182         }
9183         else if (info == XLOG_CHECKPOINT_ONLINE)
9184         {
9185                 CheckPoint      checkPoint;
9186
9187                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9188                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
9189                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9190                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
9191                                                                   checkPoint.nextXid))
9192                         ShmemVariableCache->nextXid = checkPoint.nextXid;
9193                 LWLockRelease(XidGenLock);
9194                 /* ... but still treat OID counter as exact */
9195                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9196                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9197                 ShmemVariableCache->oidCount = 0;
9198                 LWLockRelease(OidGenLock);
9199                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9200                                                                   checkPoint.nextMultiOffset);
9201                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9202                                                                   checkPoint.oldestXid))
9203                         SetTransactionIdLimit(checkPoint.oldestXid,
9204                                                                   checkPoint.oldestXidDB);
9205                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9206                                                            checkPoint.oldestMultiDB);
9207
9208                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9209                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9210                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9211
9212                 /* Update shared-memory copy of checkpoint XID/epoch */
9213                 {
9214                         /* use volatile pointer to prevent code rearrangement */
9215                         volatile XLogCtlData *xlogctl = XLogCtl;
9216
9217                         SpinLockAcquire(&xlogctl->info_lck);
9218                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9219                         xlogctl->ckptXid = checkPoint.nextXid;
9220                         SpinLockRelease(&xlogctl->info_lck);
9221                 }
9222
9223                 /* TLI should not change in an on-line checkpoint */
9224                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9225                         ereport(PANIC,
9226                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9227                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9228
9229                 RecoveryRestartPoint(&checkPoint);
9230         }
9231         else if (info == XLOG_END_OF_RECOVERY)
9232         {
9233                 xl_end_of_recovery xlrec;
9234
9235                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9236
9237                 /*
9238                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
9239                  * but this case is rarer and harder to test, so the benefit doesn't
9240                  * outweigh the potential extra cost of maintenance.
9241                  */
9242
9243                 /*
9244                  * We should've already switched to the new TLI before replaying this
9245                  * record.
9246                  */
9247                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
9248                         ereport(PANIC,
9249                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9250                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
9251         }
9252         else if (info == XLOG_NOOP)
9253         {
9254                 /* nothing to do here */
9255         }
9256         else if (info == XLOG_SWITCH)
9257         {
9258                 /* nothing to do here */
9259         }
9260         else if (info == XLOG_RESTORE_POINT)
9261         {
9262                 /* nothing to do here */
9263         }
9264         else if (info == XLOG_FPI)
9265         {
9266                 char       *data;
9267                 BkpBlock        bkpb;
9268
9269                 /*
9270                  * Full-page image (FPI) records contain a backup block stored "inline"
9271                  * in the normal data since the locking when writing hint records isn't
9272                  * sufficient to use the normal backup block mechanism, which assumes
9273                  * exclusive lock on the buffer supplied.
9274                  *
9275                  * Since the only change in these backup block are hint bits, there
9276                  * are no recovery conflicts generated.
9277                  *
9278                  * This also means there is no corresponding API call for this, so an
9279                  * smgr implementation has no need to implement anything. Which means
9280                  * nothing is needed in md.c etc
9281                  */
9282                 data = XLogRecGetData(record);
9283                 memcpy(&bkpb, data, sizeof(BkpBlock));
9284                 data += sizeof(BkpBlock);
9285
9286                 RestoreBackupBlockContents(lsn, bkpb, data, false, false);
9287         }
9288         else if (info == XLOG_BACKUP_END)
9289         {
9290                 XLogRecPtr      startpoint;
9291
9292                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
9293
9294                 if (ControlFile->backupStartPoint == startpoint)
9295                 {
9296                         /*
9297                          * We have reached the end of base backup, the point where
9298                          * pg_stop_backup() was done. The data on disk is now consistent.
9299                          * Reset backupStartPoint, and update minRecoveryPoint to make
9300                          * sure we don't allow starting up at an earlier point even if
9301                          * recovery is stopped and restarted soon after this.
9302                          */
9303                         elog(DEBUG1, "end of backup reached");
9304
9305                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9306
9307                         if (ControlFile->minRecoveryPoint < lsn)
9308                         {
9309                                 ControlFile->minRecoveryPoint = lsn;
9310                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9311                         }
9312                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
9313                         ControlFile->backupEndRequired = false;
9314                         UpdateControlFile();
9315
9316                         LWLockRelease(ControlFileLock);
9317                 }
9318         }
9319         else if (info == XLOG_PARAMETER_CHANGE)
9320         {
9321                 xl_parameter_change xlrec;
9322
9323                 /* Update our copy of the parameters in pg_control */
9324                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
9325
9326                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9327                 ControlFile->MaxConnections = xlrec.MaxConnections;
9328                 ControlFile->max_worker_processes = xlrec.max_worker_processes;
9329                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
9330                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
9331                 ControlFile->wal_level = xlrec.wal_level;
9332                 ControlFile->wal_log_hints = wal_log_hints;
9333
9334                 /*
9335                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
9336                  * recover back up to this point before allowing hot standby again.
9337                  * This is particularly important if wal_level was set to 'archive'
9338                  * before, and is now 'hot_standby', to ensure you don't run queries
9339                  * against the WAL preceding the wal_level change. Same applies to
9340                  * decreasing max_* settings.
9341                  */
9342                 minRecoveryPoint = ControlFile->minRecoveryPoint;
9343                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9344                 if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
9345                 {
9346                         ControlFile->minRecoveryPoint = lsn;
9347                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9348                 }
9349
9350                 UpdateControlFile();
9351                 LWLockRelease(ControlFileLock);
9352
9353                 /* Check to see if any changes to max_connections give problems */
9354                 CheckRequiredParameterValues();
9355         }
9356         else if (info == XLOG_FPW_CHANGE)
9357         {
9358                 /* use volatile pointer to prevent code rearrangement */
9359                 volatile XLogCtlData *xlogctl = XLogCtl;
9360                 bool            fpw;
9361
9362                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
9363
9364                 /*
9365                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
9366                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
9367                  * full_page_writes has been disabled during online backup.
9368                  */
9369                 if (!fpw)
9370                 {
9371                         SpinLockAcquire(&xlogctl->info_lck);
9372                         if (xlogctl->lastFpwDisableRecPtr < ReadRecPtr)
9373                                 xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
9374                         SpinLockRelease(&xlogctl->info_lck);
9375                 }
9376
9377                 /* Keep track of full_page_writes */
9378                 lastFullPageWrites = fpw;
9379         }
9380 }
9381
9382 #ifdef WAL_DEBUG
9383
9384 static void
9385 xlog_outrec(StringInfo buf, XLogRecord *record)
9386 {
9387         int                     i;
9388
9389         appendStringInfo(buf, "prev %X/%X; xid %u",
9390                                          (uint32) (record->xl_prev >> 32),
9391                                          (uint32) record->xl_prev,
9392                                          record->xl_xid);
9393
9394         appendStringInfo(buf, "; len %u",
9395                                          record->xl_len);
9396
9397         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
9398         {
9399                 if (record->xl_info & XLR_BKP_BLOCK(i))
9400                         appendStringInfo(buf, "; bkpb%d", i);
9401         }
9402
9403         appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
9404 }
9405 #endif   /* WAL_DEBUG */
9406
9407
9408 /*
9409  * Return the (possible) sync flag used for opening a file, depending on the
9410  * value of the GUC wal_sync_method.
9411  */
9412 static int
9413 get_sync_bit(int method)
9414 {
9415         int                     o_direct_flag = 0;
9416
9417         /* If fsync is disabled, never open in sync mode */
9418         if (!enableFsync)
9419                 return 0;
9420
9421         /*
9422          * Optimize writes by bypassing kernel cache with O_DIRECT when using
9423          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
9424          * disabled, otherwise the archive command or walsender process will read
9425          * the WAL soon after writing it, which is guaranteed to cause a physical
9426          * read if we bypassed the kernel cache. We also skip the
9427          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
9428          * reason.
9429          *
9430          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
9431          * written by walreceiver is normally read by the startup process soon
9432          * after its written. Also, walreceiver performs unaligned writes, which
9433          * don't work with O_DIRECT, so it is required for correctness too.
9434          */
9435         if (!XLogIsNeeded() && !AmWalReceiverProcess())
9436                 o_direct_flag = PG_O_DIRECT;
9437
9438         switch (method)
9439         {
9440                         /*
9441                          * enum values for all sync options are defined even if they are
9442                          * not supported on the current platform.  But if not, they are
9443                          * not included in the enum option array, and therefore will never
9444                          * be seen here.
9445                          */
9446                 case SYNC_METHOD_FSYNC:
9447                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9448                 case SYNC_METHOD_FDATASYNC:
9449                         return 0;
9450 #ifdef OPEN_SYNC_FLAG
9451                 case SYNC_METHOD_OPEN:
9452                         return OPEN_SYNC_FLAG | o_direct_flag;
9453 #endif
9454 #ifdef OPEN_DATASYNC_FLAG
9455                 case SYNC_METHOD_OPEN_DSYNC:
9456                         return OPEN_DATASYNC_FLAG | o_direct_flag;
9457 #endif
9458                 default:
9459                         /* can't happen (unless we are out of sync with option array) */
9460                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
9461                         return 0;                       /* silence warning */
9462         }
9463 }
9464
9465 /*
9466  * GUC support
9467  */
9468 void
9469 assign_xlog_sync_method(int new_sync_method, void *extra)
9470 {
9471         if (sync_method != new_sync_method)
9472         {
9473                 /*
9474                  * To ensure that no blocks escape unsynced, force an fsync on the
9475                  * currently open log segment (if any).  Also, if the open flag is
9476                  * changing, close the log file so it will be reopened (with new flag
9477                  * bit) at next use.
9478                  */
9479                 if (openLogFile >= 0)
9480                 {
9481                         if (pg_fsync(openLogFile) != 0)
9482                                 ereport(PANIC,
9483                                                 (errcode_for_file_access(),
9484                                                  errmsg("could not fsync log segment %s: %m",
9485                                                           XLogFileNameP(ThisTimeLineID, openLogSegNo))));
9486                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
9487                                 XLogFileClose();
9488                 }
9489         }
9490 }
9491
9492
9493 /*
9494  * Issue appropriate kind of fsync (if any) for an XLOG output file.
9495  *
9496  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
9497  * 'log' and 'seg' are for error reporting purposes.
9498  */
9499 void
9500 issue_xlog_fsync(int fd, XLogSegNo segno)
9501 {
9502         switch (sync_method)
9503         {
9504                 case SYNC_METHOD_FSYNC:
9505                         if (pg_fsync_no_writethrough(fd) != 0)
9506                                 ereport(PANIC,
9507                                                 (errcode_for_file_access(),
9508                                                  errmsg("could not fsync log file %s: %m",
9509                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9510                         break;
9511 #ifdef HAVE_FSYNC_WRITETHROUGH
9512                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9513                         if (pg_fsync_writethrough(fd) != 0)
9514                                 ereport(PANIC,
9515                                                 (errcode_for_file_access(),
9516                                           errmsg("could not fsync write-through log file %s: %m",
9517                                                          XLogFileNameP(ThisTimeLineID, segno))));
9518                         break;
9519 #endif
9520 #ifdef HAVE_FDATASYNC
9521                 case SYNC_METHOD_FDATASYNC:
9522                         if (pg_fdatasync(fd) != 0)
9523                                 ereport(PANIC,
9524                                                 (errcode_for_file_access(),
9525                                                  errmsg("could not fdatasync log file %s: %m",
9526                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9527                         break;
9528 #endif
9529                 case SYNC_METHOD_OPEN:
9530                 case SYNC_METHOD_OPEN_DSYNC:
9531                         /* write synced it already */
9532                         break;
9533                 default:
9534                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
9535                         break;
9536         }
9537 }
9538
9539 /*
9540  * Return the filename of given log segment, as a palloc'd string.
9541  */
9542 char *
9543 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
9544 {
9545         char       *result = palloc(MAXFNAMELEN);
9546
9547         XLogFileName(result, tli, segno);
9548         return result;
9549 }
9550
9551 /*
9552  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
9553  * function. It creates the necessary starting checkpoint and constructs the
9554  * backup label file.
9555  *
9556  * There are two kind of backups: exclusive and non-exclusive. An exclusive
9557  * backup is started with pg_start_backup(), and there can be only one active
9558  * at a time. The backup label file of an exclusive backup is written to
9559  * $PGDATA/backup_label, and it is removed by pg_stop_backup().
9560  *
9561  * A non-exclusive backup is used for the streaming base backups (see
9562  * src/backend/replication/basebackup.c). The difference to exclusive backups
9563  * is that the backup label file is not written to disk. Instead, its would-be
9564  * contents are returned in *labelfile, and the caller is responsible for
9565  * including it in the backup archive as 'backup_label'. There can be many
9566  * non-exclusive backups active at the same time, and they don't conflict
9567  * with an exclusive backup either.
9568  *
9569  * Returns the minimum WAL position that must be present to restore from this
9570  * backup, and the corresponding timeline ID in *starttli_p.
9571  *
9572  * Every successfully started non-exclusive backup must be stopped by calling
9573  * do_pg_stop_backup() or do_pg_abort_backup().
9574  *
9575  * It is the responsibility of the caller of this function to verify the
9576  * permissions of the calling user!
9577  */
9578 XLogRecPtr
9579 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
9580                                    char **labelfile)
9581 {
9582         bool            exclusive = (labelfile == NULL);
9583         bool            backup_started_in_recovery = false;
9584         XLogRecPtr      checkpointloc;
9585         XLogRecPtr      startpoint;
9586         TimeLineID      starttli;
9587         pg_time_t       stamp_time;
9588         char            strfbuf[128];
9589         char            xlogfilename[MAXFNAMELEN];
9590         XLogSegNo       _logSegNo;
9591         struct stat stat_buf;
9592         FILE       *fp;
9593         StringInfoData labelfbuf;
9594
9595         backup_started_in_recovery = RecoveryInProgress();
9596
9597         /*
9598          * Currently only non-exclusive backup can be taken during recovery.
9599          */
9600         if (backup_started_in_recovery && exclusive)
9601                 ereport(ERROR,
9602                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9603                                  errmsg("recovery is in progress"),
9604                                  errhint("WAL control functions cannot be executed during recovery.")));
9605
9606         /*
9607          * During recovery, we don't need to check WAL level. Because, if WAL
9608          * level is not sufficient, it's impossible to get here during recovery.
9609          */
9610         if (!backup_started_in_recovery && !XLogIsNeeded())
9611                 ereport(ERROR,
9612                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9613                           errmsg("WAL level not sufficient for making an online backup"),
9614                                  errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
9615
9616         if (strlen(backupidstr) > MAXPGPATH)
9617                 ereport(ERROR,
9618                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
9619                                  errmsg("backup label too long (max %d bytes)",
9620                                                 MAXPGPATH)));
9621
9622         /*
9623          * Mark backup active in shared memory.  We must do full-page WAL writes
9624          * during an on-line backup even if not doing so at other times, because
9625          * it's quite possible for the backup dump to obtain a "torn" (partially
9626          * written) copy of a database page if it reads the page concurrently with
9627          * our write to the same page.  This can be fixed as long as the first
9628          * write to the page in the WAL sequence is a full-page write. Hence, we
9629          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
9630          * are no dirty pages in shared memory that might get dumped while the
9631          * backup is in progress without having a corresponding WAL record.  (Once
9632          * the backup is complete, we need not force full-page writes anymore,
9633          * since we expect that any pages not modified during the backup interval
9634          * must have been correctly captured by the backup.)
9635          *
9636          * Note that forcePageWrites has no effect during an online backup from
9637          * the standby.
9638          *
9639          * We must hold all the insertion locks to change the value of
9640          * forcePageWrites, to ensure adequate interlocking against XLogInsert().
9641          */
9642         WALInsertLockAcquireExclusive();
9643         if (exclusive)
9644         {
9645                 if (XLogCtl->Insert.exclusiveBackup)
9646                 {
9647                         WALInsertLockRelease();
9648                         ereport(ERROR,
9649                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9650                                          errmsg("a backup is already in progress"),
9651                                          errhint("Run pg_stop_backup() and try again.")));
9652                 }
9653                 XLogCtl->Insert.exclusiveBackup = true;
9654         }
9655         else
9656                 XLogCtl->Insert.nonExclusiveBackups++;
9657         XLogCtl->Insert.forcePageWrites = true;
9658         WALInsertLockRelease();
9659
9660         /* Ensure we release forcePageWrites if fail below */
9661         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9662         {
9663                 bool            gotUniqueStartpoint = false;
9664
9665                 /*
9666                  * Force an XLOG file switch before the checkpoint, to ensure that the
9667                  * WAL segment the checkpoint is written to doesn't contain pages with
9668                  * old timeline IDs.  That would otherwise happen if you called
9669                  * pg_start_backup() right after restoring from a PITR archive: the
9670                  * first WAL segment containing the startup checkpoint has pages in
9671                  * the beginning with the old timeline ID.      That can cause trouble at
9672                  * recovery: we won't have a history file covering the old timeline if
9673                  * pg_xlog directory was not included in the base backup and the WAL
9674                  * archive was cleared too before starting the backup.
9675                  *
9676                  * This also ensures that we have emitted a WAL page header that has
9677                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
9678                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
9679                  * compress out removable backup blocks, it won't remove any that
9680                  * occur after this point.
9681                  *
9682                  * During recovery, we skip forcing XLOG file switch, which means that
9683                  * the backup taken during recovery is not available for the special
9684                  * recovery case described above.
9685                  */
9686                 if (!backup_started_in_recovery)
9687                         RequestXLogSwitch();
9688
9689                 do
9690                 {
9691                         bool            checkpointfpw;
9692
9693                         /*
9694                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
9695                          * page problems, this guarantees that two successive backup runs
9696                          * will have different checkpoint positions and hence different
9697                          * history file names, even if nothing happened in between.
9698                          *
9699                          * During recovery, establish a restartpoint if possible. We use
9700                          * the last restartpoint as the backup starting checkpoint. This
9701                          * means that two successive backup runs can have same checkpoint
9702                          * positions.
9703                          *
9704                          * Since the fact that we are executing do_pg_start_backup()
9705                          * during recovery means that checkpointer is running, we can use
9706                          * RequestCheckpoint() to establish a restartpoint.
9707                          *
9708                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
9709                          * passing fast = true).  Otherwise this can take awhile.
9710                          */
9711                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
9712                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
9713
9714                         /*
9715                          * Now we need to fetch the checkpoint record location, and also
9716                          * its REDO pointer.  The oldest point in WAL that would be needed
9717                          * to restore starting from the checkpoint is precisely the REDO
9718                          * pointer.
9719                          */
9720                         LWLockAcquire(ControlFileLock, LW_SHARED);
9721                         checkpointloc = ControlFile->checkPoint;
9722                         startpoint = ControlFile->checkPointCopy.redo;
9723                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
9724                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
9725                         LWLockRelease(ControlFileLock);
9726
9727                         if (backup_started_in_recovery)
9728                         {
9729                                 /* use volatile pointer to prevent code rearrangement */
9730                                 volatile XLogCtlData *xlogctl = XLogCtl;
9731                                 XLogRecPtr      recptr;
9732
9733                                 /*
9734                                  * Check to see if all WAL replayed during online backup
9735                                  * (i.e., since last restartpoint used as backup starting
9736                                  * checkpoint) contain full-page writes.
9737                                  */
9738                                 SpinLockAcquire(&xlogctl->info_lck);
9739                                 recptr = xlogctl->lastFpwDisableRecPtr;
9740                                 SpinLockRelease(&xlogctl->info_lck);
9741
9742                                 if (!checkpointfpw || startpoint <= recptr)
9743                                         ereport(ERROR,
9744                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9745                                                    errmsg("WAL generated with full_page_writes=off was replayed "
9746                                                                   "since last restartpoint"),
9747                                                    errhint("This means that the backup being taken on the standby "
9748                                                                    "is corrupt and should not be used. "
9749                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
9750                                                                    "and then try an online backup again.")));
9751
9752                                 /*
9753                                  * During recovery, since we don't use the end-of-backup WAL
9754                                  * record and don't write the backup history file, the
9755                                  * starting WAL location doesn't need to be unique. This means
9756                                  * that two base backups started at the same time might use
9757                                  * the same checkpoint as starting locations.
9758                                  */
9759                                 gotUniqueStartpoint = true;
9760                         }
9761
9762                         /*
9763                          * If two base backups are started at the same time (in WAL sender
9764                          * processes), we need to make sure that they use different
9765                          * checkpoints as starting locations, because we use the starting
9766                          * WAL location as a unique identifier for the base backup in the
9767                          * end-of-backup WAL record and when we write the backup history
9768                          * file. Perhaps it would be better generate a separate unique ID
9769                          * for each backup instead of forcing another checkpoint, but
9770                          * taking a checkpoint right after another is not that expensive
9771                          * either because only few buffers have been dirtied yet.
9772                          */
9773                         WALInsertLockAcquireExclusive();
9774                         if (XLogCtl->Insert.lastBackupStart < startpoint)
9775                         {
9776                                 XLogCtl->Insert.lastBackupStart = startpoint;
9777                                 gotUniqueStartpoint = true;
9778                         }
9779                         WALInsertLockRelease();
9780                 } while (!gotUniqueStartpoint);
9781
9782                 XLByteToSeg(startpoint, _logSegNo);
9783                 XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);
9784
9785                 /*
9786                  * Construct backup label file
9787                  */
9788                 initStringInfo(&labelfbuf);
9789
9790                 /* Use the log timezone here, not the session timezone */
9791                 stamp_time = (pg_time_t) time(NULL);
9792                 pg_strftime(strfbuf, sizeof(strfbuf),
9793                                         "%Y-%m-%d %H:%M:%S %Z",
9794                                         pg_localtime(&stamp_time, log_timezone));
9795                 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
9796                          (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
9797                 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
9798                                          (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
9799                 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
9800                                                  exclusive ? "pg_start_backup" : "streamed");
9801                 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
9802                                                  backup_started_in_recovery ? "standby" : "master");
9803                 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
9804                 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
9805
9806                 /*
9807                  * Okay, write the file, or return its contents to caller.
9808                  */
9809                 if (exclusive)
9810                 {
9811                         /*
9812                          * Check for existing backup label --- implies a backup is already
9813                          * running.  (XXX given that we checked exclusiveBackup above,
9814                          * maybe it would be OK to just unlink any such label file?)
9815                          */
9816                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
9817                         {
9818                                 if (errno != ENOENT)
9819                                         ereport(ERROR,
9820                                                         (errcode_for_file_access(),
9821                                                          errmsg("could not stat file \"%s\": %m",
9822                                                                         BACKUP_LABEL_FILE)));
9823                         }
9824                         else
9825                                 ereport(ERROR,
9826                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9827                                                  errmsg("a backup is already in progress"),
9828                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
9829                                                                  BACKUP_LABEL_FILE)));
9830
9831                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
9832
9833                         if (!fp)
9834                                 ereport(ERROR,
9835                                                 (errcode_for_file_access(),
9836                                                  errmsg("could not create file \"%s\": %m",
9837                                                                 BACKUP_LABEL_FILE)));
9838                         if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
9839                                 fflush(fp) != 0 ||
9840                                 pg_fsync(fileno(fp)) != 0 ||
9841                                 ferror(fp) ||
9842                                 FreeFile(fp))
9843                                 ereport(ERROR,
9844                                                 (errcode_for_file_access(),
9845                                                  errmsg("could not write file \"%s\": %m",
9846                                                                 BACKUP_LABEL_FILE)));
9847                         pfree(labelfbuf.data);
9848                 }
9849                 else
9850                         *labelfile = labelfbuf.data;
9851         }
9852         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9853
9854         /*
9855          * We're done.  As a convenience, return the starting WAL location.
9856          */
9857         if (starttli_p)
9858                 *starttli_p = starttli;
9859         return startpoint;
9860 }
9861
9862 /* Error cleanup callback for pg_start_backup */
9863 static void
9864 pg_start_backup_callback(int code, Datum arg)
9865 {
9866         bool            exclusive = DatumGetBool(arg);
9867
9868         /* Update backup counters and forcePageWrites on failure */
9869         WALInsertLockAcquireExclusive();
9870         if (exclusive)
9871         {
9872                 Assert(XLogCtl->Insert.exclusiveBackup);
9873                 XLogCtl->Insert.exclusiveBackup = false;
9874         }
9875         else
9876         {
9877                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9878                 XLogCtl->Insert.nonExclusiveBackups--;
9879         }
9880
9881         if (!XLogCtl->Insert.exclusiveBackup &&
9882                 XLogCtl->Insert.nonExclusiveBackups == 0)
9883         {
9884                 XLogCtl->Insert.forcePageWrites = false;
9885         }
9886         WALInsertLockRelease();
9887 }
9888
9889 /*
9890  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
9891  * function.
9892  *
9893  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
9894  * the non-exclusive backup specified by 'labelfile'.
9895  *
9896  * Returns the last WAL position that must be present to restore from this
9897  * backup, and the corresponding timeline ID in *stoptli_p.
9898  *
9899  * It is the responsibility of the caller of this function to verify the
9900  * permissions of the calling user!
9901  */
9902 XLogRecPtr
9903 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
9904 {
9905         bool            exclusive = (labelfile == NULL);
9906         bool            backup_started_in_recovery = false;
9907         XLogRecPtr      startpoint;
9908         XLogRecPtr      stoppoint;
9909         TimeLineID      stoptli;
9910         XLogRecData rdata;
9911         pg_time_t       stamp_time;
9912         char            strfbuf[128];
9913         char            histfilepath[MAXPGPATH];
9914         char            startxlogfilename[MAXFNAMELEN];
9915         char            stopxlogfilename[MAXFNAMELEN];
9916         char            lastxlogfilename[MAXFNAMELEN];
9917         char            histfilename[MAXFNAMELEN];
9918         char            backupfrom[20];
9919         XLogSegNo       _logSegNo;
9920         FILE       *lfp;
9921         FILE       *fp;
9922         char            ch;
9923         int                     seconds_before_warning;
9924         int                     waits = 0;
9925         bool            reported_waiting = false;
9926         char       *remaining;
9927         char       *ptr;
9928         uint32          hi,
9929                                 lo;
9930
9931         backup_started_in_recovery = RecoveryInProgress();
9932
9933         /*
9934          * Currently only non-exclusive backup can be taken during recovery.
9935          */
9936         if (backup_started_in_recovery && exclusive)
9937                 ereport(ERROR,
9938                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9939                                  errmsg("recovery is in progress"),
9940                                  errhint("WAL control functions cannot be executed during recovery.")));
9941
9942         /*
9943          * During recovery, we don't need to check WAL level. Because, if WAL
9944          * level is not sufficient, it's impossible to get here during recovery.
9945          */
9946         if (!backup_started_in_recovery && !XLogIsNeeded())
9947                 ereport(ERROR,
9948                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9949                           errmsg("WAL level not sufficient for making an online backup"),
9950                                  errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
9951
9952         /*
9953          * OK to update backup counters and forcePageWrites
9954          */
9955         WALInsertLockAcquireExclusive();
9956         if (exclusive)
9957                 XLogCtl->Insert.exclusiveBackup = false;
9958         else
9959         {
9960                 /*
9961                  * The user-visible pg_start/stop_backup() functions that operate on
9962                  * exclusive backups can be called at any time, but for non-exclusive
9963                  * backups, it is expected that each do_pg_start_backup() call is
9964                  * matched by exactly one do_pg_stop_backup() call.
9965                  */
9966                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9967                 XLogCtl->Insert.nonExclusiveBackups--;
9968         }
9969
9970         if (!XLogCtl->Insert.exclusiveBackup &&
9971                 XLogCtl->Insert.nonExclusiveBackups == 0)
9972         {
9973                 XLogCtl->Insert.forcePageWrites = false;
9974         }
9975         WALInsertLockRelease();
9976
9977         if (exclusive)
9978         {
9979                 /*
9980                  * Read the existing label file into memory.
9981                  */
9982                 struct stat statbuf;
9983                 int                     r;
9984
9985                 if (stat(BACKUP_LABEL_FILE, &statbuf))
9986                 {
9987                         if (errno != ENOENT)
9988                                 ereport(ERROR,
9989                                                 (errcode_for_file_access(),
9990                                                  errmsg("could not stat file \"%s\": %m",
9991                                                                 BACKUP_LABEL_FILE)));
9992                         ereport(ERROR,
9993                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9994                                          errmsg("a backup is not in progress")));
9995                 }
9996
9997                 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9998                 if (!lfp)
9999                 {
10000                         ereport(ERROR,
10001                                         (errcode_for_file_access(),
10002                                          errmsg("could not read file \"%s\": %m",
10003                                                         BACKUP_LABEL_FILE)));
10004                 }
10005                 labelfile = palloc(statbuf.st_size + 1);
10006                 r = fread(labelfile, statbuf.st_size, 1, lfp);
10007                 labelfile[statbuf.st_size] = '\0';
10008
10009                 /*
10010                  * Close and remove the backup label file
10011                  */
10012                 if (r != 1 || ferror(lfp) || FreeFile(lfp))
10013                         ereport(ERROR,
10014                                         (errcode_for_file_access(),
10015                                          errmsg("could not read file \"%s\": %m",
10016                                                         BACKUP_LABEL_FILE)));
10017                 if (unlink(BACKUP_LABEL_FILE) != 0)
10018                         ereport(ERROR,
10019                                         (errcode_for_file_access(),
10020                                          errmsg("could not remove file \"%s\": %m",
10021                                                         BACKUP_LABEL_FILE)));
10022         }
10023
10024         /*
10025          * Read and parse the START WAL LOCATION line (this code is pretty crude,
10026          * but we are not expecting any variability in the file format).
10027          */
10028         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
10029                            &hi, &lo, startxlogfilename,
10030                            &ch) != 4 || ch != '\n')
10031                 ereport(ERROR,
10032                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10033                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10034         startpoint = ((uint64) hi) << 32 | lo;
10035         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
10036
10037         /*
10038          * Parse the BACKUP FROM line. If we are taking an online backup from the
10039          * standby, we confirm that the standby has not been promoted during the
10040          * backup.
10041          */
10042         ptr = strstr(remaining, "BACKUP FROM:");
10043         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
10044                 ereport(ERROR,
10045                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10046                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10047         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
10048                 ereport(ERROR,
10049                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10050                                  errmsg("the standby was promoted during online backup"),
10051                                  errhint("This means that the backup being taken is corrupt "
10052                                                  "and should not be used. "
10053                                                  "Try taking another online backup.")));
10054
10055         /*
10056          * During recovery, we don't write an end-of-backup record. We assume that
10057          * pg_control was backed up last and its minimum recovery point can be
10058          * available as the backup end location. Since we don't have an
10059          * end-of-backup record, we use the pg_control value to check whether
10060          * we've reached the end of backup when starting recovery from this
10061          * backup. We have no way of checking if pg_control wasn't backed up last
10062          * however.
10063          *
10064          * We don't force a switch to new WAL file and wait for all the required
10065          * files to be archived. This is okay if we use the backup to start the
10066          * standby. But, if it's for an archive recovery, to ensure all the
10067          * required files are available, a user should wait for them to be
10068          * archived, or include them into the backup.
10069          *
10070          * We return the current minimum recovery point as the backup end
10071          * location. Note that it can be greater than the exact backup end
10072          * location if the minimum recovery point is updated after the backup of
10073          * pg_control. This is harmless for current uses.
10074          *
10075          * XXX currently a backup history file is for informational and debug
10076          * purposes only. It's not essential for an online backup. Furthermore,
10077          * even if it's created, it will not be archived during recovery because
10078          * an archiver is not invoked. So it doesn't seem worthwhile to write a
10079          * backup history file during recovery.
10080          */
10081         if (backup_started_in_recovery)
10082         {
10083                 /* use volatile pointer to prevent code rearrangement */
10084                 volatile XLogCtlData *xlogctl = XLogCtl;
10085                 XLogRecPtr      recptr;
10086
10087                 /*
10088                  * Check to see if all WAL replayed during online backup contain
10089                  * full-page writes.
10090                  */
10091                 SpinLockAcquire(&xlogctl->info_lck);
10092                 recptr = xlogctl->lastFpwDisableRecPtr;
10093                 SpinLockRelease(&xlogctl->info_lck);
10094
10095                 if (startpoint <= recptr)
10096                         ereport(ERROR,
10097                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10098                            errmsg("WAL generated with full_page_writes=off was replayed "
10099                                           "during online backup"),
10100                          errhint("This means that the backup being taken on the standby "
10101                                          "is corrupt and should not be used. "
10102                                  "Enable full_page_writes and run CHECKPOINT on the master, "
10103                                          "and then try an online backup again.")));
10104
10105
10106                 LWLockAcquire(ControlFileLock, LW_SHARED);
10107                 stoppoint = ControlFile->minRecoveryPoint;
10108                 stoptli = ControlFile->minRecoveryPointTLI;
10109                 LWLockRelease(ControlFileLock);
10110
10111                 if (stoptli_p)
10112                         *stoptli_p = stoptli;
10113                 return stoppoint;
10114         }
10115
10116         /*
10117          * Write the backup-end xlog record
10118          */
10119         rdata.data = (char *) (&startpoint);
10120         rdata.len = sizeof(startpoint);
10121         rdata.buffer = InvalidBuffer;
10122         rdata.next = NULL;
10123         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
10124         stoptli = ThisTimeLineID;
10125
10126         /*
10127          * Force a switch to a new xlog segment file, so that the backup is valid
10128          * as soon as archiver moves out the current segment file.
10129          */
10130         RequestXLogSwitch();
10131
10132         XLByteToPrevSeg(stoppoint, _logSegNo);
10133         XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
10134
10135         /* Use the log timezone here, not the session timezone */
10136         stamp_time = (pg_time_t) time(NULL);
10137         pg_strftime(strfbuf, sizeof(strfbuf),
10138                                 "%Y-%m-%d %H:%M:%S %Z",
10139                                 pg_localtime(&stamp_time, log_timezone));
10140
10141         /*
10142          * Write the backup history file
10143          */
10144         XLByteToSeg(startpoint, _logSegNo);
10145         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
10146                                                   (uint32) (startpoint % XLogSegSize));
10147         fp = AllocateFile(histfilepath, "w");
10148         if (!fp)
10149                 ereport(ERROR,
10150                                 (errcode_for_file_access(),
10151                                  errmsg("could not create file \"%s\": %m",
10152                                                 histfilepath)));
10153         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
10154                 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
10155         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
10156                         (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
10157         /* transfer remaining lines from label to history file */
10158         fprintf(fp, "%s", remaining);
10159         fprintf(fp, "STOP TIME: %s\n", strfbuf);
10160         if (fflush(fp) || ferror(fp) || FreeFile(fp))
10161                 ereport(ERROR,
10162                                 (errcode_for_file_access(),
10163                                  errmsg("could not write file \"%s\": %m",
10164                                                 histfilepath)));
10165
10166         /*
10167          * Clean out any no-longer-needed history files.  As a side effect, this
10168          * will post a .ready file for the newly created history file, notifying
10169          * the archiver that history file may be archived immediately.
10170          */
10171         CleanupBackupHistory();
10172
10173         /*
10174          * If archiving is enabled, wait for all the required WAL files to be
10175          * archived before returning. If archiving isn't enabled, the required WAL
10176          * needs to be transported via streaming replication (hopefully with
10177          * wal_keep_segments set high enough), or some more exotic mechanism like
10178          * polling and copying files from pg_xlog with script. We have no
10179          * knowledge of those mechanisms, so it's up to the user to ensure that he
10180          * gets all the required WAL.
10181          *
10182          * We wait until both the last WAL file filled during backup and the
10183          * history file have been archived, and assume that the alphabetic sorting
10184          * property of the WAL files ensures any earlier WAL files are safely
10185          * archived as well.
10186          *
10187          * We wait forever, since archive_command is supposed to work and we
10188          * assume the admin wanted his backup to work completely. If you don't
10189          * wish to wait, you can set statement_timeout.  Also, some notices are
10190          * issued to clue in anyone who might be doing this interactively.
10191          */
10192         if (waitforarchive && XLogArchivingActive())
10193         {
10194                 XLByteToPrevSeg(stoppoint, _logSegNo);
10195                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
10196
10197                 XLByteToSeg(startpoint, _logSegNo);
10198                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
10199                                                           (uint32) (startpoint % XLogSegSize));
10200
10201                 seconds_before_warning = 60;
10202                 waits = 0;
10203
10204                 while (XLogArchiveIsBusy(lastxlogfilename) ||
10205                            XLogArchiveIsBusy(histfilename))
10206                 {
10207                         CHECK_FOR_INTERRUPTS();
10208
10209                         if (!reported_waiting && waits > 5)
10210                         {
10211                                 ereport(NOTICE,
10212                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
10213                                 reported_waiting = true;
10214                         }
10215
10216                         pg_usleep(1000000L);
10217
10218                         if (++waits >= seconds_before_warning)
10219                         {
10220                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
10221                                 ereport(WARNING,
10222                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
10223                                                                 waits),
10224                                                  errhint("Check that your archive_command is executing properly.  "
10225                                                                  "pg_stop_backup can be canceled safely, "
10226                                                                  "but the database backup will not be usable without all the WAL segments.")));
10227                         }
10228                 }
10229
10230                 ereport(NOTICE,
10231                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
10232         }
10233         else if (waitforarchive)
10234                 ereport(NOTICE,
10235                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
10236
10237         /*
10238          * We're done.  As a convenience, return the ending WAL location.
10239          */
10240         if (stoptli_p)
10241                 *stoptli_p = stoptli;
10242         return stoppoint;
10243 }
10244
10245
10246 /*
10247  * do_pg_abort_backup: abort a running backup
10248  *
10249  * This does just the most basic steps of do_pg_stop_backup(), by taking the
10250  * system out of backup mode, thus making it a lot more safe to call from
10251  * an error handler.
10252  *
10253  * NB: This is only for aborting a non-exclusive backup that doesn't write
10254  * backup_label. A backup started with pg_stop_backup() needs to be finished
10255  * with pg_stop_backup().
10256  */
10257 void
10258 do_pg_abort_backup(void)
10259 {
10260         WALInsertLockAcquireExclusive();
10261         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10262         XLogCtl->Insert.nonExclusiveBackups--;
10263
10264         if (!XLogCtl->Insert.exclusiveBackup &&
10265                 XLogCtl->Insert.nonExclusiveBackups == 0)
10266         {
10267                 XLogCtl->Insert.forcePageWrites = false;
10268         }
10269         WALInsertLockRelease();
10270 }
10271
10272 /*
10273  * Get latest redo apply position.
10274  *
10275  * Exported to allow WALReceiver to read the pointer directly.
10276  */
10277 XLogRecPtr
10278 GetXLogReplayRecPtr(TimeLineID *replayTLI)
10279 {
10280         /* use volatile pointer to prevent code rearrangement */
10281         volatile XLogCtlData *xlogctl = XLogCtl;
10282         XLogRecPtr      recptr;
10283         TimeLineID      tli;
10284
10285         SpinLockAcquire(&xlogctl->info_lck);
10286         recptr = xlogctl->lastReplayedEndRecPtr;
10287         tli = xlogctl->lastReplayedTLI;
10288         SpinLockRelease(&xlogctl->info_lck);
10289
10290         if (replayTLI)
10291                 *replayTLI = tli;
10292         return recptr;
10293 }
10294
10295 /*
10296  * Get latest WAL insert pointer
10297  */
10298 XLogRecPtr
10299 GetXLogInsertRecPtr(void)
10300 {
10301         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
10302         uint64          current_bytepos;
10303
10304         SpinLockAcquire(&Insert->insertpos_lck);
10305         current_bytepos = Insert->CurrBytePos;
10306         SpinLockRelease(&Insert->insertpos_lck);
10307
10308         return XLogBytePosToRecPtr(current_bytepos);
10309 }
10310
10311 /*
10312  * Get latest WAL write pointer
10313  */
10314 XLogRecPtr
10315 GetXLogWriteRecPtr(void)
10316 {
10317         {
10318                 /* use volatile pointer to prevent code rearrangement */
10319                 volatile XLogCtlData *xlogctl = XLogCtl;
10320
10321                 SpinLockAcquire(&xlogctl->info_lck);
10322                 LogwrtResult = xlogctl->LogwrtResult;
10323                 SpinLockRelease(&xlogctl->info_lck);
10324         }
10325
10326         return LogwrtResult.Write;
10327 }
10328
10329 /*
10330  * Returns the redo pointer of the last checkpoint or restartpoint. This is
10331  * the oldest point in WAL that we still need, if we have to restart recovery.
10332  */
10333 void
10334 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
10335 {
10336         LWLockAcquire(ControlFileLock, LW_SHARED);
10337         *oldrecptr = ControlFile->checkPointCopy.redo;
10338         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
10339         LWLockRelease(ControlFileLock);
10340 }
10341
10342 /*
10343  * read_backup_label: check to see if a backup_label file is present
10344  *
10345  * If we see a backup_label during recovery, we assume that we are recovering
10346  * from a backup dump file, and we therefore roll forward from the checkpoint
10347  * identified by the label file, NOT what pg_control says.      This avoids the
10348  * problem that pg_control might have been archived one or more checkpoints
10349  * later than the start of the dump, and so if we rely on it as the start
10350  * point, we will fail to restore a consistent database state.
10351  *
10352  * Returns TRUE if a backup_label was found (and fills the checkpoint
10353  * location and its REDO location into *checkPointLoc and RedoStartLSN,
10354  * respectively); returns FALSE if not. If this backup_label came from a
10355  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
10356  * was created during recovery, *backupFromStandby is set to TRUE.
10357  */
10358 static bool
10359 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
10360                                   bool *backupFromStandby)
10361 {
10362         char            startxlogfilename[MAXFNAMELEN];
10363         TimeLineID      tli;
10364         FILE       *lfp;
10365         char            ch;
10366         char            backuptype[20];
10367         char            backupfrom[20];
10368         uint32          hi,
10369                                 lo;
10370
10371         *backupEndRequired = false;
10372         *backupFromStandby = false;
10373
10374         /*
10375          * See if label file is present
10376          */
10377         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10378         if (!lfp)
10379         {
10380                 if (errno != ENOENT)
10381                         ereport(FATAL,
10382                                         (errcode_for_file_access(),
10383                                          errmsg("could not read file \"%s\": %m",
10384                                                         BACKUP_LABEL_FILE)));
10385                 return false;                   /* it's not there, all is fine */
10386         }
10387
10388         /*
10389          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
10390          * is pretty crude, but we are not expecting any variability in the file
10391          * format).
10392          */
10393         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
10394                            &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
10395                 ereport(FATAL,
10396                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10397                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10398         RedoStartLSN = ((uint64) hi) << 32 | lo;
10399         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
10400                            &hi, &lo, &ch) != 3 || ch != '\n')
10401                 ereport(FATAL,
10402                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10403                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10404         *checkPointLoc = ((uint64) hi) << 32 | lo;
10405
10406         /*
10407          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
10408          * from an older backup anyway, but since the information on it is not
10409          * strictly required, don't error out if it's missing for some reason.
10410          */
10411         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
10412         {
10413                 if (strcmp(backuptype, "streamed") == 0)
10414                         *backupEndRequired = true;
10415         }
10416
10417         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
10418         {
10419                 if (strcmp(backupfrom, "standby") == 0)
10420                         *backupFromStandby = true;
10421         }
10422
10423         if (ferror(lfp) || FreeFile(lfp))
10424                 ereport(FATAL,
10425                                 (errcode_for_file_access(),
10426                                  errmsg("could not read file \"%s\": %m",
10427                                                 BACKUP_LABEL_FILE)));
10428
10429         return true;
10430 }
10431
10432 /*
10433  * Error context callback for errors occurring during rm_redo().
10434  */
10435 static void
10436 rm_redo_error_callback(void *arg)
10437 {
10438         XLogRecord *record = (XLogRecord *) arg;
10439         StringInfoData buf;
10440
10441         initStringInfo(&buf);
10442         RmgrTable[record->xl_rmid].rm_desc(&buf,
10443                                                                            record->xl_info,
10444                                                                            XLogRecGetData(record));
10445
10446         /* don't bother emitting empty description */
10447         if (buf.len > 0)
10448                 errcontext("xlog redo %s", buf.data);
10449
10450         pfree(buf.data);
10451 }
10452
10453 /*
10454  * BackupInProgress: check if online backup mode is active
10455  *
10456  * This is done by checking for existence of the "backup_label" file.
10457  */
10458 bool
10459 BackupInProgress(void)
10460 {
10461         struct stat stat_buf;
10462
10463         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
10464 }
10465
10466 /*
10467  * CancelBackup: rename the "backup_label" file to cancel backup mode
10468  *
10469  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
10470  * Note that this will render an online backup in progress useless.
10471  * To correctly finish an online backup, pg_stop_backup must be called.
10472  */
10473 void
10474 CancelBackup(void)
10475 {
10476         struct stat stat_buf;
10477
10478         /* if the file is not there, return */
10479         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
10480                 return;
10481
10482         /* remove leftover file from previously canceled backup if it exists */
10483         unlink(BACKUP_LABEL_OLD);
10484
10485         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
10486         {
10487                 ereport(LOG,
10488                                 (errmsg("online backup mode canceled"),
10489                                  errdetail("\"%s\" was renamed to \"%s\".",
10490                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10491         }
10492         else
10493         {
10494                 ereport(WARNING,
10495                                 (errcode_for_file_access(),
10496                                  errmsg("online backup mode was not canceled"),
10497                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
10498                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10499         }
10500 }
10501
10502 /*
10503  * Read the XLOG page containing RecPtr into readBuf (if not read already).
10504  * Returns number of bytes read, if the page is read successfully, or -1
10505  * in case of errors.  When errors occur, they are ereport'ed, but only
10506  * if they have not been previously reported.
10507  *
10508  * This is responsible for restoring files from archive as needed, as well
10509  * as for waiting for the requested WAL record to arrive in standby mode.
10510  *
10511  * 'emode' specifies the log level used for reporting "file not found" or
10512  * "end of WAL" situations in archive recovery, or in standby mode when a
10513  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
10514  * false in those situations, on higher log levels the ereport() won't
10515  * return.
10516  *
10517  * In standby mode, if after a successful return of XLogPageRead() the
10518  * caller finds the record it's interested in to be broken, it should
10519  * ereport the error with the level determined by
10520  * emode_for_corrupt_record(), and then set lastSourceFailed
10521  * and call XLogPageRead() again with the same arguments. This lets
10522  * XLogPageRead() to try fetching the record from another source, or to
10523  * sleep and retry.
10524  */
10525 static int
10526 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
10527                          XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
10528 {
10529         XLogPageReadPrivate *private =
10530         (XLogPageReadPrivate *) xlogreader->private_data;
10531         int                     emode = private->emode;
10532         uint32          targetPageOff;
10533         XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
10534
10535         XLByteToSeg(targetPagePtr, targetSegNo);
10536         targetPageOff = targetPagePtr % XLogSegSize;
10537
10538         /*
10539          * See if we need to switch to a new segment because the requested record
10540          * is not in the currently open one.
10541          */
10542         if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
10543         {
10544                 /*
10545                  * Request a restartpoint if we've replayed too much xlog since the
10546                  * last one.
10547                  */
10548                 if (StandbyModeRequested && bgwriterLaunched)
10549                 {
10550                         if (XLogCheckpointNeeded(readSegNo))
10551                         {
10552                                 (void) GetRedoRecPtr();
10553                                 if (XLogCheckpointNeeded(readSegNo))
10554                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
10555                         }
10556                 }
10557
10558                 close(readFile);
10559                 readFile = -1;
10560                 readSource = 0;
10561         }
10562
10563         XLByteToSeg(targetPagePtr, readSegNo);
10564
10565 retry:
10566         /* See if we need to retrieve more data */
10567         if (readFile < 0 ||
10568                 (readSource == XLOG_FROM_STREAM &&
10569                  receivedUpto < targetPagePtr + reqLen))
10570         {
10571                 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
10572                                                                                  private->randAccess,
10573                                                                                  private->fetching_ckpt,
10574                                                                                  targetRecPtr))
10575                 {
10576                         if (readFile >= 0)
10577                                 close(readFile);
10578                         readFile = -1;
10579                         readLen = 0;
10580                         readSource = 0;
10581
10582                         return -1;
10583                 }
10584         }
10585
10586         /*
10587          * At this point, we have the right segment open and if we're streaming we
10588          * know the requested record is in it.
10589          */
10590         Assert(readFile != -1);
10591
10592         /*
10593          * If the current segment is being streamed from master, calculate how
10594          * much of the current page we have received already. We know the
10595          * requested record has been received, but this is for the benefit of
10596          * future calls, to allow quick exit at the top of this function.
10597          */
10598         if (readSource == XLOG_FROM_STREAM)
10599         {
10600                 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
10601                         readLen = XLOG_BLCKSZ;
10602                 else
10603                         readLen = receivedUpto % XLogSegSize - targetPageOff;
10604         }
10605         else
10606                 readLen = XLOG_BLCKSZ;
10607
10608         /* Read the requested page */
10609         readOff = targetPageOff;
10610         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
10611         {
10612                 char            fname[MAXFNAMELEN];
10613
10614                 XLogFileName(fname, curFileTLI, readSegNo);
10615                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10616                                 (errcode_for_file_access(),
10617                                  errmsg("could not seek in log segment %s to offset %u: %m",
10618                                                 fname, readOff)));
10619                 goto next_record_is_invalid;
10620         }
10621
10622         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
10623         {
10624                 char            fname[MAXFNAMELEN];
10625
10626                 XLogFileName(fname, curFileTLI, readSegNo);
10627                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10628                                 (errcode_for_file_access(),
10629                                  errmsg("could not read from log segment %s, offset %u: %m",
10630                                                 fname, readOff)));
10631                 goto next_record_is_invalid;
10632         }
10633
10634         Assert(targetSegNo == readSegNo);
10635         Assert(targetPageOff == readOff);
10636         Assert(reqLen <= readLen);
10637
10638         *readTLI = curFileTLI;
10639         return readLen;
10640
10641 next_record_is_invalid:
10642         lastSourceFailed = true;
10643
10644         if (readFile >= 0)
10645                 close(readFile);
10646         readFile = -1;
10647         readLen = 0;
10648         readSource = 0;
10649
10650         /* In standby-mode, keep trying */
10651         if (StandbyMode)
10652                 goto retry;
10653         else
10654                 return -1;
10655 }
10656
10657 /*
10658  * Open the WAL segment containing WAL position 'RecPtr'.
10659  *
10660  * The segment can be fetched via restore_command, or via walreceiver having
10661  * streamed the record, or it can already be present in pg_xlog. Checking
10662  * pg_xlog is mainly for crash recovery, but it will be polled in standby mode
10663  * too, in case someone copies a new segment directly to pg_xlog. That is not
10664  * documented or recommended, though.
10665  *
10666  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
10667  * prepare to read WAL starting from RedoStartLSN after this.
10668  *
10669  * 'RecPtr' might not point to the beginning of the record we're interested
10670  * in, it might also point to the page or segment header. In that case,
10671  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
10672  * used to decide which timeline to stream the requested WAL from.
10673  *
10674  * If the record is not immediately available, the function returns false
10675  * if we're not in standby mode. In standby mode, waits for it to become
10676  * available.
10677  *
10678  * When the requested record becomes available, the function opens the file
10679  * containing it (if not open already), and returns true. When end of standby
10680  * mode is triggered by the user, and there is no more WAL available, returns
10681  * false.
10682  */
10683 static bool
10684 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
10685                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
10686 {
10687         static pg_time_t last_fail_time = 0;
10688         pg_time_t       now;
10689
10690         /*-------
10691          * Standby mode is implemented by a state machine:
10692          *
10693          * 1. Read from either archive or pg_xlog (XLOG_FROM_ARCHIVE), or just
10694          *    pg_xlog (XLOG_FROM_PG_XLOG)
10695          * 2. Check trigger file
10696          * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
10697          * 4. Rescan timelines
10698          * 5. Sleep 5 seconds, and loop back to 1.
10699          *
10700          * Failure to read from the current source advances the state machine to
10701          * the next state.
10702          *
10703          * 'currentSource' indicates the current state. There are no currentSource
10704          * values for "check trigger", "rescan timelines", and "sleep" states,
10705          * those actions are taken when reading from the previous source fails, as
10706          * part of advancing to the next state.
10707          *-------
10708          */
10709         if (!InArchiveRecovery)
10710                 currentSource = XLOG_FROM_PG_XLOG;
10711         else if (currentSource == 0)
10712                 currentSource = XLOG_FROM_ARCHIVE;
10713
10714         for (;;)
10715         {
10716                 int                     oldSource = currentSource;
10717
10718                 /*
10719                  * First check if we failed to read from the current source, and
10720                  * advance the state machine if so. The failure to read might've
10721                  * happened outside this function, e.g when a CRC check fails on a
10722                  * record, or within this loop.
10723                  */
10724                 if (lastSourceFailed)
10725                 {
10726                         switch (currentSource)
10727                         {
10728                                 case XLOG_FROM_ARCHIVE:
10729                                 case XLOG_FROM_PG_XLOG:
10730
10731                                         /*
10732                                          * Check to see if the trigger file exists. Note that we
10733                                          * do this only after failure, so when you create the
10734                                          * trigger file, we still finish replaying as much as we
10735                                          * can from archive and pg_xlog before failover.
10736                                          */
10737                                         if (StandbyMode && CheckForStandbyTrigger())
10738                                         {
10739                                                 ShutdownWalRcv();
10740                                                 return false;
10741                                         }
10742
10743                                         /*
10744                                          * Not in standby mode, and we've now tried the archive
10745                                          * and pg_xlog.
10746                                          */
10747                                         if (!StandbyMode)
10748                                                 return false;
10749
10750                                         /*
10751                                          * If primary_conninfo is set, launch walreceiver to try
10752                                          * to stream the missing WAL.
10753                                          *
10754                                          * If fetching_ckpt is TRUE, RecPtr points to the initial
10755                                          * checkpoint location. In that case, we use RedoStartLSN
10756                                          * as the streaming start position instead of RecPtr, so
10757                                          * that when we later jump backwards to start redo at
10758                                          * RedoStartLSN, we will have the logs streamed already.
10759                                          */
10760                                         if (PrimaryConnInfo)
10761                                         {
10762                                                 XLogRecPtr      ptr;
10763                                                 TimeLineID      tli;
10764
10765                                                 if (fetching_ckpt)
10766                                                 {
10767                                                         ptr = RedoStartLSN;
10768                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
10769                                                 }
10770                                                 else
10771                                                 {
10772                                                         ptr = tliRecPtr;
10773                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
10774
10775                                                         if (curFileTLI > 0 && tli < curFileTLI)
10776                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
10777                                                                          (uint32) (ptr >> 32), (uint32) ptr,
10778                                                                          tli, curFileTLI);
10779                                                 }
10780                                                 curFileTLI = tli;
10781                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
10782                                                                                          PrimarySlotName);
10783                                                 receivedUpto = 0;
10784                                         }
10785
10786                                         /*
10787                                          * Move to XLOG_FROM_STREAM state in either case. We'll
10788                                          * get immediate failure if we didn't launch walreceiver,
10789                                          * and move on to the next state.
10790                                          */
10791                                         currentSource = XLOG_FROM_STREAM;
10792                                         break;
10793
10794                                 case XLOG_FROM_STREAM:
10795
10796                                         /*
10797                                          * Failure while streaming. Most likely, we got here
10798                                          * because streaming replication was terminated, or
10799                                          * promotion was triggered. But we also get here if we
10800                                          * find an invalid record in the WAL streamed from master,
10801                                          * in which case something is seriously wrong. There's
10802                                          * little chance that the problem will just go away, but
10803                                          * PANIC is not good for availability either, especially
10804                                          * in hot standby mode. So, we treat that the same as
10805                                          * disconnection, and retry from archive/pg_xlog again.
10806                                          * The WAL in the archive should be identical to what was
10807                                          * streamed, so it's unlikely that it helps, but one can
10808                                          * hope...
10809                                          */
10810
10811                                         /*
10812                                          * Before we leave XLOG_FROM_STREAM state, make sure that
10813                                          * walreceiver is not active, so that it won't overwrite
10814                                          * WAL that we restore from archive.
10815                                          */
10816                                         if (WalRcvStreaming())
10817                                                 ShutdownWalRcv();
10818
10819                                         /*
10820                                          * Before we sleep, re-scan for possible new timelines if
10821                                          * we were requested to recover to the latest timeline.
10822                                          */
10823                                         if (recoveryTargetIsLatest)
10824                                         {
10825                                                 if (rescanLatestTimeLine())
10826                                                 {
10827                                                         currentSource = XLOG_FROM_ARCHIVE;
10828                                                         break;
10829                                                 }
10830                                         }
10831
10832                                         /*
10833                                          * XLOG_FROM_STREAM is the last state in our state
10834                                          * machine, so we've exhausted all the options for
10835                                          * obtaining the requested WAL. We're going to loop back
10836                                          * and retry from the archive, but if it hasn't been long
10837                                          * since last attempt, sleep 5 seconds to avoid
10838                                          * busy-waiting.
10839                                          */
10840                                         now = (pg_time_t) time(NULL);
10841                                         if ((now - last_fail_time) < 5)
10842                                         {
10843                                                 pg_usleep(1000000L * (5 - (now - last_fail_time)));
10844                                                 now = (pg_time_t) time(NULL);
10845                                         }
10846                                         last_fail_time = now;
10847                                         currentSource = XLOG_FROM_ARCHIVE;
10848                                         break;
10849
10850                                 default:
10851                                         elog(ERROR, "unexpected WAL source %d", currentSource);
10852                         }
10853                 }
10854                 else if (currentSource == XLOG_FROM_PG_XLOG)
10855                 {
10856                         /*
10857                          * We just successfully read a file in pg_xlog. We prefer files in
10858                          * the archive over ones in pg_xlog, so try the next file again
10859                          * from the archive first.
10860                          */
10861                         if (InArchiveRecovery)
10862                                 currentSource = XLOG_FROM_ARCHIVE;
10863                 }
10864
10865                 if (currentSource != oldSource)
10866                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
10867                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
10868                                  lastSourceFailed ? "failure" : "success");
10869
10870                 /*
10871                  * We've now handled possible failure. Try to read from the chosen
10872                  * source.
10873                  */
10874                 lastSourceFailed = false;
10875
10876                 switch (currentSource)
10877                 {
10878                         case XLOG_FROM_ARCHIVE:
10879                         case XLOG_FROM_PG_XLOG:
10880                                 /* Close any old file we might have open. */
10881                                 if (readFile >= 0)
10882                                 {
10883                                         close(readFile);
10884                                         readFile = -1;
10885                                 }
10886                                 /* Reset curFileTLI if random fetch. */
10887                                 if (randAccess)
10888                                         curFileTLI = 0;
10889
10890                                 /*
10891                                  * Try to restore the file from archive, or read an existing
10892                                  * file from pg_xlog.
10893                                  */
10894                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
10895                                                 currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
10896                                                                                  currentSource);
10897                                 if (readFile >= 0)
10898                                         return true;    /* success! */
10899
10900                                 /*
10901                                  * Nope, not found in archive or pg_xlog.
10902                                  */
10903                                 lastSourceFailed = true;
10904                                 break;
10905
10906                         case XLOG_FROM_STREAM:
10907                                 {
10908                                         bool            havedata;
10909
10910                                         /*
10911                                          * Check if WAL receiver is still active.
10912                                          */
10913                                         if (!WalRcvStreaming())
10914                                         {
10915                                                 lastSourceFailed = true;
10916                                                 break;
10917                                         }
10918
10919                                         /*
10920                                          * Walreceiver is active, so see if new data has arrived.
10921                                          *
10922                                          * We only advance XLogReceiptTime when we obtain fresh
10923                                          * WAL from walreceiver and observe that we had already
10924                                          * processed everything before the most recent "chunk"
10925                                          * that it flushed to disk.  In steady state where we are
10926                                          * keeping up with the incoming data, XLogReceiptTime will
10927                                          * be updated on each cycle. When we are behind,
10928                                          * XLogReceiptTime will not advance, so the grace time
10929                                          * allotted to conflicting queries will decrease.
10930                                          */
10931                                         if (RecPtr < receivedUpto)
10932                                                 havedata = true;
10933                                         else
10934                                         {
10935                                                 XLogRecPtr      latestChunkStart;
10936
10937                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
10938                                                 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
10939                                                 {
10940                                                         havedata = true;
10941                                                         if (latestChunkStart <= RecPtr)
10942                                                         {
10943                                                                 XLogReceiptTime = GetCurrentTimestamp();
10944                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
10945                                                         }
10946                                                 }
10947                                                 else
10948                                                         havedata = false;
10949                                         }
10950                                         if (havedata)
10951                                         {
10952                                                 /*
10953                                                  * Great, streamed far enough.  Open the file if it's
10954                                                  * not open already.  Also read the timeline history
10955                                                  * file if we haven't initialized timeline history
10956                                                  * yet; it should be streamed over and present in
10957                                                  * pg_xlog by now.      Use XLOG_FROM_STREAM so that
10958                                                  * source info is set correctly and XLogReceiptTime
10959                                                  * isn't changed.
10960                                                  */
10961                                                 if (readFile < 0)
10962                                                 {
10963                                                         if (!expectedTLEs)
10964                                                                 expectedTLEs = readTimeLineHistory(receiveTLI);
10965                                                         readFile = XLogFileRead(readSegNo, PANIC,
10966                                                                                                         receiveTLI,
10967                                                                                                         XLOG_FROM_STREAM, false);
10968                                                         Assert(readFile >= 0);
10969                                                 }
10970                                                 else
10971                                                 {
10972                                                         /* just make sure source info is correct... */
10973                                                         readSource = XLOG_FROM_STREAM;
10974                                                         XLogReceiptSource = XLOG_FROM_STREAM;
10975                                                         return true;
10976                                                 }
10977                                                 break;
10978                                         }
10979
10980                                         /*
10981                                          * Data not here yet. Check for trigger, then wait for
10982                                          * walreceiver to wake us up when new WAL arrives.
10983                                          */
10984                                         if (CheckForStandbyTrigger())
10985                                         {
10986                                                 /*
10987                                                  * Note that we don't "return false" immediately here.
10988                                                  * After being triggered, we still want to replay all
10989                                                  * the WAL that was already streamed. It's in pg_xlog
10990                                                  * now, so we just treat this as a failure, and the
10991                                                  * state machine will move on to replay the streamed
10992                                                  * WAL from pg_xlog, and then recheck the trigger and
10993                                                  * exit replay.
10994                                                  */
10995                                                 lastSourceFailed = true;
10996                                                 break;
10997                                         }
10998
10999                                         /*
11000                                          * Wait for more WAL to arrive. Time out after 5 seconds,
11001                                          * like when polling the archive, to react to a trigger
11002                                          * file promptly.
11003                                          */
11004                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
11005                                                           WL_LATCH_SET | WL_TIMEOUT,
11006                                                           5000L);
11007                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
11008                                         break;
11009                                 }
11010
11011                         default:
11012                                 elog(ERROR, "unexpected WAL source %d", currentSource);
11013                 }
11014
11015                 /*
11016                  * This possibly-long loop needs to handle interrupts of startup
11017                  * process.
11018                  */
11019                 HandleStartupProcInterrupts();
11020         }
11021
11022         return false;   /* not reached */
11023 }
11024
11025 /*
11026  * Determine what log level should be used to report a corrupt WAL record
11027  * in the current WAL page, previously read by XLogPageRead().
11028  *
11029  * 'emode' is the error mode that would be used to report a file-not-found
11030  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
11031  * we're retrying the exact same record that we've tried previously, only
11032  * complain the first time to keep the noise down.      However, we only do when
11033  * reading from pg_xlog, because we don't expect any invalid records in archive
11034  * or in records streamed from master. Files in the archive should be complete,
11035  * and we should never hit the end of WAL because we stop and wait for more WAL
11036  * to arrive before replaying it.
11037  *
11038  * NOTE: This function remembers the RecPtr value it was last called with,
11039  * to suppress repeated messages about the same record. Only call this when
11040  * you are about to ereport(), or you might cause a later message to be
11041  * erroneously suppressed.
11042  */
11043 static int
11044 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
11045 {
11046         static XLogRecPtr lastComplaint = 0;
11047
11048         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
11049         {
11050                 if (RecPtr == lastComplaint)
11051                         emode = DEBUG1;
11052                 else
11053                         lastComplaint = RecPtr;
11054         }
11055         return emode;
11056 }
11057
11058 /*
11059  * Check to see whether the user-specified trigger file exists and whether a
11060  * promote request has arrived.  If either condition holds, return true.
11061  */
11062 static bool
11063 CheckForStandbyTrigger(void)
11064 {
11065         struct stat stat_buf;
11066         static bool triggered = false;
11067
11068         if (triggered)
11069                 return true;
11070
11071         if (IsPromoteTriggered())
11072         {
11073                 /*
11074                  * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
11075                  * signal handler. It now leaves the file in place and lets the
11076                  * Startup process do the unlink. This allows Startup to know whether
11077                  * it should create a full checkpoint before starting up (fallback
11078                  * mode). Fast promotion takes precedence.
11079                  */
11080                 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11081                 {
11082                         unlink(PROMOTE_SIGNAL_FILE);
11083                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11084                         fast_promote = true;
11085                 }
11086                 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11087                 {
11088                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11089                         fast_promote = false;
11090                 }
11091
11092                 ereport(LOG, (errmsg("received promote request")));
11093
11094                 ResetPromoteTriggered();
11095                 triggered = true;
11096                 return true;
11097         }
11098
11099         if (TriggerFile == NULL)
11100                 return false;
11101
11102         if (stat(TriggerFile, &stat_buf) == 0)
11103         {
11104                 ereport(LOG,
11105                                 (errmsg("trigger file found: %s", TriggerFile)));
11106                 unlink(TriggerFile);
11107                 triggered = true;
11108                 fast_promote = true;
11109                 return true;
11110         }
11111         return false;
11112 }
11113
11114 /*
11115  * Check to see if a promote request has arrived. Should be
11116  * called by postmaster after receiving SIGUSR1.
11117  */
11118 bool
11119 CheckPromoteSignal(void)
11120 {
11121         struct stat stat_buf;
11122
11123         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
11124                 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11125                 return true;
11126
11127         return false;
11128 }
11129
11130 /*
11131  * Wake up startup process to replay newly arrived WAL, or to notice that
11132  * failover has been requested.
11133  */
11134 void
11135 WakeupRecovery(void)
11136 {
11137         SetLatch(&XLogCtl->recoveryWakeupLatch);
11138 }
11139
11140 /*
11141  * Update the WalWriterSleeping flag.
11142  */
11143 void
11144 SetWalWriterSleeping(bool sleeping)
11145 {
11146         /* use volatile pointer to prevent code rearrangement */
11147         volatile XLogCtlData *xlogctl = XLogCtl;
11148
11149         SpinLockAcquire(&xlogctl->info_lck);
11150         xlogctl->WalWriterSleeping = sleeping;
11151         SpinLockRelease(&xlogctl->info_lck);
11152 }