granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlog.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <time.h>
  19 #include <fcntl.h>
  20 #include <sys/stat.h>
  21 #include <sys/time.h>
  22 #include <unistd.h>
  23
  24 #include "access/clog.h"
  25 #include "access/commit_ts.h"
  26 #include "access/multixact.h"
  27 #include "access/rewriteheap.h"
  28 #include "access/subtrans.h"
  29 #include "access/timeline.h"
  30 #include "access/transam.h"
  31 #include "access/tuptoaster.h"
  32 #include "access/twophase.h"
  33 #include "access/xact.h"
  34 #include "access/xlog_internal.h"
  35 #include "access/xloginsert.h"
  36 #include "access/xlogreader.h"
  37 #include "access/xlogutils.h"
  38 #include "catalog/catversion.h"
  39 #include "catalog/pg_control.h"
  40 #include "catalog/pg_database.h"
  41 #include "miscadmin.h"
  42 #include "pgstat.h"
  43 #include "postmaster/bgwriter.h"
  44 #include "postmaster/startup.h"
  45 #include "replication/logical.h"
  46 #include "replication/slot.h"
  47 #include "replication/snapbuild.h"
  48 #include "replication/walreceiver.h"
  49 #include "replication/walsender.h"
  50 #include "storage/barrier.h"
  51 #include "storage/bufmgr.h"
  52 #include "storage/fd.h"
  53 #include "storage/ipc.h"
  54 #include "storage/large_object.h"
  55 #include "storage/latch.h"
  56 #include "storage/pmsignal.h"
  57 #include "storage/predicate.h"
  58 #include "storage/proc.h"
  59 #include "storage/procarray.h"
  60 #include "storage/reinit.h"
  61 #include "storage/smgr.h"
  62 #include "storage/spin.h"
  63 #include "utils/builtins.h"
  64 #include "utils/guc.h"
  65 #include "utils/memutils.h"
  66 #include "utils/ps_status.h"
  67 #include "utils/relmapper.h"
  68 #include "utils/snapmgr.h"
  69 #include "utils/timestamp.h"
  70 #include "pg_trace.h"
  71
  72 extern uint32 bootstrap_data_checksum_version;
  73
  74 /* File path names (all relative to $PGDATA) */
  75 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  76 #define RECOVERY_COMMAND_DONE   "recovery.done"
  77 #define PROMOTE_SIGNAL_FILE             "promote"
  78 #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
  79
  80
  81 /* User-settable parameters */
  82 int                     CheckPointSegments = 3;
  83 int                     wal_keep_segments = 0;
  84 int                     XLOGbuffers = -1;
  85 int                     XLogArchiveTimeout = 0;
  86 bool            XLogArchiveMode = false;
  87 char       *XLogArchiveCommand = NULL;
  88 bool            EnableHotStandby = false;
  89 bool            fullPageWrites = true;
  90 bool            wal_log_hints = false;
  91 bool            log_checkpoints = false;
  92 int                     sync_method = DEFAULT_SYNC_METHOD;
  93 int                     wal_level = WAL_LEVEL_MINIMAL;
  94 int                     CommitDelay = 0;        /* precommit delay in microseconds */
  95 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
  96
  97 #ifdef WAL_DEBUG
  98 bool            XLOG_DEBUG = false;
  99 #endif
 100
 101 /*
 102  * Number of WAL insertion locks to use. A higher value allows more insertions
 103  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
 104  * which needs to iterate all the locks.
 105  */
 106 #define NUM_XLOGINSERT_LOCKS  8
 107
 108 /*
 109  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
 110  * When we are done with an old XLOG segment file, we will recycle it as a
 111  * future XLOG segment as long as there aren't already XLOGfileslop future
 112  * segments; else we'll delete it.  This could be made a separate GUC
 113  * variable, but at present I think it's sufficient to hardwire it as
 114  * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
 115  * no more than 2*CheckPointSegments log segments, and we want to recycle all
 116  * of them; the +1 allows boundary cases to happen without wasting a
 117  * delete/create-segment cycle.
 118  */
 119 #define XLOGfileslop    (2*CheckPointSegments + 1)
 120
 121
 122 /*
 123  * GUC support
 124  */
  125 const struct config_enum_entry sync_method_options[] = {
  126         {"fsync", SYNC_METHOD_FSYNC, false},    /* always present; the rest depend on platform macros */
  127 #ifdef HAVE_FSYNC_WRITETHROUGH
  128         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
  129 #endif
  130 #ifdef HAVE_FDATASYNC
  131         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
  132 #endif
  133 #ifdef OPEN_SYNC_FLAG
  134         {"open_sync", SYNC_METHOD_OPEN, false},
  135 #endif
  136 #ifdef OPEN_DATASYNC_FLAG
  137         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
  138 #endif
  139         {NULL, 0, false}                /* NULL-name entry closes the list */
  140 };
 141
 142 /*
 143  * Statistics for current checkpoint are collected in this global struct.
 144  * Because only the checkpointer or a stand-alone backend can perform
 145  * checkpoints, this will be unused in normal backends.
 146  */
 147 CheckpointStatsData CheckpointStats;
 148
 149 /*
 150  * ThisTimeLineID will be same in all backends --- it identifies current
 151  * WAL timeline for the database system.
 152  */
 153 TimeLineID      ThisTimeLineID = 0;
 154
 155 /*
 156  * Are we doing recovery from XLOG?
 157  *
 158  * This is only ever true in the startup process; it should be read as meaning
 159  * "this process is replaying WAL records", rather than "the system is in
 160  * recovery mode".  It should be examined primarily by functions that need
 161  * to act differently when called from a WAL redo function (e.g., to skip WAL
 162  * logging).  To check whether the system is in recovery regardless of which
 163  * process you're running in, use RecoveryInProgress() but only after shared
 164  * memory startup and lock initialization.
 165  */
 166 bool            InRecovery = false;
 167
 168 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
 169 HotStandbyState standbyState = STANDBY_DISABLED;
 170
 171 static XLogRecPtr LastRec;
 172
 173 /* Local copy of WalRcv->receivedUpto */
 174 static XLogRecPtr receivedUpto = 0;
 175 static TimeLineID receiveTLI = 0;
 176
 177 /*
 178  * During recovery, lastFullPageWrites keeps track of full_page_writes that
 179  * the replayed WAL records indicate. It's initialized with full_page_writes
 180  * that the recovery starting checkpoint record indicates, and then updated
 181  * each time XLOG_FPW_CHANGE record is replayed.
 182  */
 183 static bool lastFullPageWrites;
 184
 185 /*
 186  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 187  * known, need to check the shared state".
 188  */
 189 static bool LocalRecoveryInProgress = true;
 190
 191 /*
 192  * Local copy of SharedHotStandbyActive variable. False actually means "not
 193  * known, need to check the shared state".
 194  */
 195 static bool LocalHotStandbyActive = false;
 196
 197 /*
 198  * Local state for XLogInsertAllowed():
 199  *              1: unconditionally allowed to insert XLOG
 200  *              0: unconditionally not allowed to insert XLOG
 201  *              -1: must check RecoveryInProgress(); disallow until it is false
 202  * Most processes start with -1 and transition to 1 after seeing that recovery
 203  * is not in progress.  But we can also force the value for special cases.
 204  * The coding in XLogInsertAllowed() depends on the first two of these states
 205  * being numerically the same as bool true and false.
 206  */
 207 static int      LocalXLogInsertAllowed = -1;
 208
 209 /*
 210  * When ArchiveRecoveryRequested is set, archive recovery was requested,
 211  * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
 212  * currently recovering using offline XLOG archives. These variables are only
 213  * valid in the startup process.
 214  *
 215  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 216  * currently performing crash recovery using only XLOG files in pg_xlog, but
 217  * will switch to using offline XLOG archives as soon as we reach the end of
 218  * WAL in pg_xlog.
 219 */
 220 bool            ArchiveRecoveryRequested = false;
 221 bool            InArchiveRecovery = false;
 222
 223 /* Was the last xlog file restored from archive, or local? */
 224 static bool restoredFromArchive = false;
 225
 226 /* options taken from recovery.conf for archive recovery */
 227 char       *recoveryRestoreCommand = NULL;
 228 static char *recoveryEndCommand = NULL;
 229 static char *archiveCleanupCommand = NULL;
 230 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
 231 static bool recoveryTargetInclusive = true;
 232 static RecoveryTargetAction recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
 233 static TransactionId recoveryTargetXid;
 234 static TimestampTz recoveryTargetTime;
 235 static char *recoveryTargetName;
 236 static int      recovery_min_apply_delay = 0;
 237 static TimestampTz recoveryDelayUntilTime;
 238
 239 /* options taken from recovery.conf for XLOG streaming */
 240 static bool StandbyModeRequested = false;
 241 static char *PrimaryConnInfo = NULL;
 242 static char *PrimarySlotName = NULL;
 243 static char *TriggerFile = NULL;
 244
 245 /* are we currently in standby mode? */
 246 bool            StandbyMode = false;
 247
 248 /* whether request for fast promotion has been made yet */
 249 static bool fast_promote = false;
 250
 251 /*
 252  * if recoveryStopsBefore/After returns true, it saves information of the stop
 253  * point here
 254  */
 255 static TransactionId recoveryStopXid;
 256 static TimestampTz recoveryStopTime;
 257 static char recoveryStopName[MAXFNAMELEN];
 258 static bool recoveryStopAfter;
 259
 260 /*
 261  * During normal operation, the only timeline we care about is ThisTimeLineID.
 262  * During recovery, however, things are more complicated.  To simplify life
 263  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 264  * scan through the WAL history (that is, it is the line that was active when
 265  * the currently-scanned WAL record was generated).  We also need these
 266  * timeline values:
 267  *
 268  * recoveryTargetTLI: the desired timeline that we want to end in.
 269  *
 270  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 271  *
 272  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
 273  * its known parents, newest first (so recoveryTargetTLI is always the
 274  * first list member).  Only these TLIs are expected to be seen in the WAL
 275  * segments we read, and indeed only these TLIs will be considered as
 276  * candidate WAL files to open at all.
 277  *
 278  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 279  * (This is not necessarily the same as ThisTimeLineID, because we could
 280  * be scanning data that was copied from an ancestor timeline when the current
 281  * file was created.)  During a sequential scan we do not allow this value
 282  * to decrease.
 283  */
 284 static TimeLineID recoveryTargetTLI;
 285 static bool recoveryTargetIsLatest = false;
 286 static List *expectedTLEs;
 287 static TimeLineID curFileTLI;
 288
 289 /*
 290  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 291  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 292  * end+1 of the last record, and is reset when we end a top-level transaction,
 293  * or start a new one; so it can be used to tell if the current transaction has
 294  * created any XLOG records.
 295  */
 296 static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
 297
 298 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
 299
 300 /*
 301  * RedoRecPtr is this backend's local copy of the REDO record pointer
 302  * (which is almost but not quite the same as a pointer to the most recent
 303  * CHECKPOINT record).  We update this from the shared-memory copy,
 304  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 305  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
 306  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 307  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 308  * InitXLOGAccess.
 309  */
 310 static XLogRecPtr RedoRecPtr;
 311
 312 /*
 313  * doPageWrites is this backend's local copy of (forcePageWrites ||
 314  * fullPageWrites).  It is used together with RedoRecPtr to decide whether
  315  * a full-page image of a page needs to be taken.
 316  */
 317 static bool doPageWrites;
 318
 319 /*
 320  * RedoStartLSN points to the checkpoint's REDO location which is specified
 321  * in a backup label file, backup history file or control file. In standby
 322  * mode, XLOG streaming usually starts from the position where an invalid
 323  * record was found. But if we fail to read even the initial checkpoint
 324  * record, we use the REDO location instead of the checkpoint location as
 325  * the start position of XLOG streaming. Otherwise we would have to jump
 326  * backwards to the REDO location after reading the checkpoint record,
 327  * because the REDO record can precede the checkpoint record.
 328  */
 329 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
 330
 331 /*----------
 332  * Shared-memory data structures for XLOG control
 333  *
 334  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 335  * the log up to (all records before that point must be written or fsynced).
 336  * LogwrtResult indicates the byte positions we have already written/fsynced.
 337  * These structs are identical but are declared separately to indicate their
 338  * slightly different functions.
 339  *
 340  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 341  * WALWriteLock.  To update it, you need to hold both locks.  The point of
 342  * this arrangement is that the value can be examined by code that already
 343  * holds WALWriteLock without needing to grab info_lck as well.  In addition
 344  * to the shared variable, each backend has a private copy of LogwrtResult,
 345  * which is updated when convenient.
 346  *
 347  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 348  * (protected by info_lck), but we don't need to cache any copies of it.
 349  *
 350  * info_lck is only held long enough to read/update the protected variables,
 351  * so it's a plain spinlock.  The other locks are held longer (potentially
 352  * over I/O operations), so we use LWLocks for them.  These locks are:
 353  *
 354  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
 355  * It is only held while initializing and changing the mapping.  If the
 356  * contents of the buffer being replaced haven't been written yet, the mapping
 357  * lock is released while the write is done, and reacquired afterwards.
 358  *
 359  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 360  * XLogFlush).
 361  *
 362  * ControlFileLock: must be held to read/update control file or create
 363  * new log file.
 364  *
 365  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 366  * only one checkpointer at a time; currently, with all checkpoints done by
 367  * the checkpointer, this is just pro forma).
 368  *
 369  *----------
 370  */
 371
 372 typedef struct XLogwrtRqst
 373 {
 374         XLogRecPtr      Write;                  /* last byte + 1 to write out */
 375         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
 376 } XLogwrtRqst;
 377
 378 typedef struct XLogwrtResult
 379 {
 380         XLogRecPtr      Write;                  /* last byte + 1 written out */
 381         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
 382 } XLogwrtResult;
 383
 384 /*
 385  * Inserting to WAL is protected by a small fixed number of WAL insertion
 386  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
  387  * matter which one. To lock out other concurrent insertions, you must hold
  388  * all of them. Each WAL insertion lock consists of a lightweight lock, plus
  389  * an indicator of how far the insertion has progressed (insertingAt).
 390  *
 391  * The insertingAt values are read when a process wants to flush WAL from
 392  * the in-memory buffers to disk, to check that all the insertions to the
 393  * region the process is about to write out have finished. You could simply
 394  * wait for all currently in-progress insertions to finish, but the
 395  * insertingAt indicator allows you to ignore insertions to later in the WAL,
 396  * so that you only wait for the insertions that are modifying the buffers
 397  * you're about to write out.
 398  *
 399  * This isn't just an optimization. If all the WAL buffers are dirty, an
 400  * inserter that's holding a WAL insert lock might need to evict an old WAL
 401  * buffer, which requires flushing the WAL. If it's possible for an inserter
 402  * to block on another inserter unnecessarily, deadlock can arise when two
 403  * inserters holding a WAL insert lock wait for each other to finish their
 404  * insertion.
 405  *
 406  * Small WAL records that don't cross a page boundary never update the value,
 407  * the WAL record is just copied to the page and the lock is released. But
 408  * to avoid the deadlock-scenario explained above, the indicator is always
 409  * updated before sleeping while holding an insertion lock.
 410  */
  411 typedef struct
  412 {
  413         LWLock          lock;                   /* the lightweight lock itself */
  414         XLogRecPtr      insertingAt;    /* how far the insertion has progressed */
  415 } WALInsertLock;
 416
 417 /*
 418  * All the WAL insertion locks are allocated as an array in shared memory. We
 419  * force the array stride to be a power of 2, which saves a few cycles in
 420  * indexing, but more importantly also ensures that individual slots don't
 421  * cross cache line boundaries. (Of course, we have to also ensure that the
 422  * array start address is suitably aligned.)
 423  */
  424 typedef union WALInsertLockPadded
  425 {
  426         WALInsertLock l;                        /* the actual lock */
  427         char            pad[PG_CACHE_LINE_SIZE];        /* makes the union at least PG_CACHE_LINE_SIZE bytes */
  428 } WALInsertLockPadded;
 429
 430 /*
 431  * Shared state data for WAL insertion.
 432  */
  433 typedef struct XLogCtlInsert
  434 {
  435         slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
  436
  437         /*
  438          * CurrBytePos is the end of reserved WAL. The next record will be
  439          * inserted at that position. PrevBytePos is the start position of the
  440          * previously inserted (or rather, reserved) record - it is copied to the
  441          * prev-link of the next record. These are stored as "usable byte
  442          * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
  443          */
  444         uint64          CurrBytePos;
  445         uint64          PrevBytePos;
  446
  447         /*
  448          * Make sure the above heavily-contended spinlock and byte positions are
  449          * on their own cache line. In particular, the RedoRecPtr and full page
  450          * write variables below should be on a different cache line. They are
  451          * read on every WAL insertion, but updated rarely, and we don't want
  452          * those reads to steal the cache line containing Curr/PrevBytePos.
  453          */
  454         char            pad[PG_CACHE_LINE_SIZE];
  455
  456         /*
  457          * fullPageWrites is the master copy used by all backends to determine
  458          * whether to write full-page to WAL, instead of using process-local one.
  459          * This is required because, when full_page_writes is changed by SIGHUP,
  460          * we must WAL-log it before it actually affects WAL-logging by backends.
  461          * Checkpointer sets at startup or after SIGHUP.
  462          *
  463          * To read these fields, you must hold an insertion lock. To modify them,
  464          * you must hold ALL the locks.
  465          */
  466         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
  467         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
  468         bool            fullPageWrites;         /* shared master copy of full_page_writes */
  469
  470         /*
  471          * exclusiveBackup is true if a backup started with pg_start_backup() is
  472          * in progress, and nonExclusiveBackups is a counter indicating the number
  473          * of streaming base backups currently in progress. forcePageWrites is set
  474          * to true when either of these is non-zero. lastBackupStart is the latest
  475          * checkpoint redo location used as a starting point for an online backup.
  476          */
  477         bool            exclusiveBackup;
  478         int                     nonExclusiveBackups;
  479         XLogRecPtr      lastBackupStart;
  480
  481         /*
  482          * WAL insertion locks.
  483          */
  484         WALInsertLockPadded *WALInsertLocks;
  485         LWLockTranche WALInsertLockTranche;
  486         int                     WALInsertLockTrancheId;
  487 } XLogCtlInsert;
 488
 489 /*
 490  * Total shared-memory state for XLOG.
 491  */
  492 typedef struct XLogCtlData
  493 {
  494         XLogCtlInsert Insert;
  495
  496         /* Protected by info_lck: */
  497         XLogwrtRqst LogwrtRqst;
  498         XLogRecPtr      RedoRecPtr;             /* a recent copy of Insert->RedoRecPtr */
  499         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
  500         TransactionId ckptXid;
  501         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
  502         XLogRecPtr      replicationSlotMinLSN;  /* oldest LSN needed by any slot */
  503
  504         XLogSegNo       lastRemovedSegNo;               /* latest removed/recycled XLOG
  505                                                                                  * segment */
  506
  507         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
  508         XLogRecPtr      unloggedLSN;
  509         slock_t         ulsn_lck;
  510
  511         /* Time of last xlog segment switch. Protected by WALWriteLock. */
  512         pg_time_t       lastSegSwitchTime;
  513
  514         /*
  515          * Protected by info_lck and WALWriteLock (you must hold either lock to
  516          * read it, but both to update)
  517          */
  518         XLogwrtResult LogwrtResult;
  519
  520         /*
  521          * Latest initialized page in the cache (last byte position + 1).
  522          *
  523          * To change the identity of a buffer (and InitializedUpTo), you need to
  524          * hold WALBufMappingLock.  To change the identity of a buffer that's
  525          * still dirty, the old page needs to be written out first, and for that
  526          * you need WALWriteLock, and you need to ensure that there are no
  527          * in-progress insertions to the page by calling
  528          * WaitXLogInsertionsToFinish().
  529          */
  530         XLogRecPtr      InitializedUpTo;
  531
  532         /*
  533          * These values do not change after startup, although the pointed-to pages
  534          * and xlblocks values certainly do.  xlblock values are protected by
  535          * WALBufMappingLock.
  536          */
  537         char       *pages;                      /* buffers for unwritten XLOG pages */
  538         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
  539         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
  540
  541         /*
  542          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
  543          * If we created a new timeline when the system was started up,
  544          * PrevTimeLineID is the old timeline's ID that we forked off from.
  545          * Otherwise it's equal to ThisTimeLineID.
  546          */
  547         TimeLineID      ThisTimeLineID;
  548         TimeLineID      PrevTimeLineID;
  549
  550         /*
  551          * archiveCleanupCommand is read from recovery.conf but needs to be in
  552          * shared memory so that the checkpointer process can access it.
  553          */
  554         char            archiveCleanupCommand[MAXPGPATH];
  555
  556         /*
  557          * SharedRecoveryInProgress indicates if we're still in crash or archive
  558          * recovery.  Protected by info_lck.
  559          */
  560         bool            SharedRecoveryInProgress;
  561
  562         /*
  563          * SharedHotStandbyActive indicates whether hot standby is currently
  564          * active.  Protected by info_lck.
  565          */
  566         bool            SharedHotStandbyActive;
  567
  568         /*
  569          * WalWriterSleeping indicates whether the WAL writer is currently in
  570          * low-power mode (and hence should be nudged if an async commit occurs).
  571          * Protected by info_lck.
  572          */
  573         bool            WalWriterSleeping;
  574
  575         /*
  576          * recoveryWakeupLatch is used to wake up the startup process to continue
  577          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
  578          * to appear.
  579          */
  580         Latch           recoveryWakeupLatch;
  581
  582         /*
  583          * During recovery, we keep a copy of the latest checkpoint record here.
  584          * Used by the background writer when it wants to create a restartpoint.
  585          *
  586          * Protected by info_lck.
  587          */
  588         XLogRecPtr      lastCheckPointRecPtr;
  589         CheckPoint      lastCheckPoint;
  590
  591         /*
  592          * lastReplayedEndRecPtr points to end+1 of the last record successfully
  593          * replayed. When we're currently replaying a record, ie. in a redo
  594          * function, replayEndRecPtr points to the end+1 of the record being
  595          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
  596          */
  597         XLogRecPtr      lastReplayedEndRecPtr;
  598         TimeLineID      lastReplayedTLI;
  599         XLogRecPtr      replayEndRecPtr;
  600         TimeLineID      replayEndTLI;
  601         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
  602         TimestampTz recoveryLastXTime;
  603         /* current effective recovery target timeline */
  604         TimeLineID      RecoveryTargetTLI;
  605
  606         /*
  607          * timestamp of when we started replaying the current chunk of WAL data,
  608          * only relevant for replication or archive recovery
  609          */
  610         TimestampTz currentChunkStartTime;
  611         /* Are we requested to pause recovery? */
  612         bool            recoveryPause;
  613
  614         /*
  615          * lastFpwDisableRecPtr points to the start of the last replayed
  616          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
  617          */
  618         XLogRecPtr      lastFpwDisableRecPtr;
  619
  620         slock_t         info_lck;               /* locks shared variables shown above */
  621 } XLogCtlData;
 622
 623 static XLogCtlData *XLogCtl = NULL;
 624
 625 /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
 626 static WALInsertLockPadded *WALInsertLocks = NULL;
 627
 628 /*
 629  * We maintain an image of pg_control in shared memory.
 630  */
 631 static ControlFileData *ControlFile = NULL;
 632
  633 /*
  634  * Calculate the amount of space left on the page after 'endptr'; this is 0
  635  * when 'endptr' sits exactly on a page boundary. Beware multiple evaluation!
  636  */
  637 #define INSERT_FREESPACE(endptr)        \
  638         (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
 639
  640 /* Advance to the next buffer index, wrapping to 0 after XLogCacheBlck. */
  641 #define NextBufIdx(idx)         \
  642                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 643
 644 /*
 645  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 646  * would hold if it was in cache, the page containing 'recptr'.
 647  */
 648 #define XLogRecPtrToBufIdx(recptr)      \
 649         (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
 650
 651 /*
 652  * These are the number of bytes in a WAL page and segment usable for WAL data.
 653  */
 654 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
 655 #define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
 656
 657 /*
 658  * Private, possibly out-of-date copy of shared LogwrtResult.
 659  * See discussion above.
 660  */
 661 static XLogwrtResult LogwrtResult = {0, 0};
 662
 663 /*
 664  * Codes indicating where we got a WAL file from during recovery, or where
 665  * to attempt to get one.
 666  */
 667 typedef enum
 668 {
 669         XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
 670         XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
 671         XLOG_FROM_PG_XLOG,                      /* existing file in pg_xlog */
 672         XLOG_FROM_STREAM                        /* streamed from master */
 673 } XLogSource;
 674
  675 /* human-readable names for XLogSources, in enum order, for debugging output */
  676 static const char *xlogSourceNames[] = {"any", "archive", "pg_xlog", "stream"};
 677
 678 /*
 679  * openLogFile is -1 or a kernel FD for an open log file segment.
 680  * When it's open, openLogOff is the current seek offset in the file.
 681  * openLogSegNo identifies the segment.  These variables are only
 682  * used to write the XLOG, and so will normally refer to the active segment.
 683  */
 684 static int      openLogFile = -1;
 685 static XLogSegNo openLogSegNo = 0;
 686 static uint32 openLogOff = 0;
 687
 688 /*
 689  * These variables are used similarly to the ones above, but for reading
 690  * the XLOG.  Note, however, that readOff generally represents the offset
 691  * of the page just read, not the seek position of the FD itself, which
 692  * will be just past that page. readLen indicates how much of the current
 693  * page has been read into readBuf, and readSource indicates where we got
 694  * the currently open file from.
 695  */
 696 static int      readFile = -1;
 697 static XLogSegNo readSegNo = 0;
 698 static uint32 readOff = 0;
 699 static uint32 readLen = 0;
 700 static XLogSource readSource = 0;               /* XLOG_FROM_* code */
 701
 702 /*
 703  * Keeps track of which source we're currently reading from. This is
 704  * different from readSource in that this is always set, even when we don't
 705  * currently have a WAL file open. If lastSourceFailed is set, our last
 706  * attempt to read from currentSource failed, and we should try another source
 707  * next.
 708  */
 709 static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
 710 static bool lastSourceFailed = false;
 711
  712 typedef struct XLogPageReadPrivate
  713 {
  714         int                     emode;          /* presumably the elog level for read failures -- TODO confirm */
  715         bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
  716         bool            randAccess;     /* presumably set for non-sequential reads -- TODO confirm */
  717 } XLogPageReadPrivate;
 718
 719 /*
 720  * These variables track when we last obtained some WAL data to process,
 721  * and where we got it from.  (XLogReceiptSource is initially the same as
 722  * readSource, but readSource gets reset to zero when we don't have data
 723  * to process right now.  It is also different from currentSource, which
 724  * also changes when we try to read from a source and fail, while
 725  * XLogReceiptSource tracks where we last successfully read some WAL.)
 726  */
 727 static TimestampTz XLogReceiptTime = 0;         /* when we last obtained WAL data */
 728 static XLogSource XLogReceiptSource = 0;                /* XLOG_FROM_* code */
 729
 730 /* State information for XLOG reading */
 731 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 732 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 733
 734 static XLogRecPtr minRecoveryPoint;             /* local copy of
 735                                                                                  * ControlFile->minRecoveryPoint */
 736 static TimeLineID minRecoveryPointTLI;  /* presumably the timeline that goes with minRecoveryPoint -- confirm */
 737 static bool updateMinRecoveryPoint = true;      /* NOTE(review): consumers are outside this excerpt -- see UpdateMinRecoveryPoint() */
 738
 739 /*
 740  * Have we reached a consistent database state?  In crash recovery, we have
 741  * to replay all the WAL, so reachedConsistency is never set.  During archive
 742  * recovery, the database is consistent once minRecoveryPoint is reached.
 743  */
 744 bool            reachedConsistency = false;     /* not static, so visible outside this file */
 745
 746 static bool InRedo = false;
 747
 748 /* Have we launched bgwriter during recovery? */
 749 static bool bgwriterLaunched = false;
 750
 751 /* For WALInsertLockAcquire/Release functions */
 752 static int      MyLockNo = 0;           /* index into WALInsertLocks[] chosen by WALInsertLockAcquire() */
 753 static bool holdingAllLocks = false;    /* set by WALInsertLockAcquireExclusive(), cleared by WALInsertLockRelease() */
 754
 755 #ifdef WAL_DEBUG
 756 static MemoryContext walDebugCxt = NULL;        /* XLogInsertRecord() switches into this while building its debug message */
 757 #endif
 758 /* Forward declarations of file-local (static) functions. */
 759 static void readRecoveryCommandFile(void);
 760 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
 761 static bool recoveryStopsBefore(XLogReaderState *record);
 762 static bool recoveryStopsAfter(XLogReaderState *record);
 763 static void recoveryPausesHere(void);
 764 static bool recoveryApplyDelay(XLogReaderState *record);
 765 static void SetLatestXTime(TimestampTz xtime);
 766 static void SetCurrentChunkStartTime(TimestampTz xtime);
 767 static void CheckRequiredParameterValues(void);
 768 static void XLogReportParameters(void);
 769 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
 770                                         TimeLineID prevTLI);
 771 static void LocalSetXLogInsertAllowed(void);
 772 static void CreateEndOfRecoveryRecord(void);
 773 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 774 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 775 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
 776
 777 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
 778 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
 779 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
 780 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 781                                            bool find_free, int *max_advance,
 782                                            bool use_lock);
 783 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 784                          int source, bool notexistOk);
 785 static int      XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
 786 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 787                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
 788                          TimeLineID *readTLI);
 789 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 790                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr);
 791 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 792 static void XLogFileClose(void);
 793 static void PreallocXlogFiles(XLogRecPtr endptr);
 794 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
 795 static void UpdateLastRemovedPtr(char *filename);
 796 static void ValidateXLOGDirectoryStructure(void);
 797 static void CleanupBackupHistory(void);
 798 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 799 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 800                    int emode, bool fetching_ckpt);
 801 static void CheckRecoveryConsistency(void);
 802 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
 803                                          XLogRecPtr RecPtr, int whichChkpti, bool report);     /* NOTE(review): 'whichChkpti' looks like a typo for 'whichChkpt'; harmless in a prototype -- confirm against the definition */
 804 static bool rescanLatestTimeLine(void);
 805 static void WriteControlFile(void);
 806 static void ReadControlFile(void);
 807 static char *str_time(pg_time_t tnow);
 808 static bool CheckForStandbyTrigger(void);
 809
 810 #ifdef WAL_DEBUG
 811 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
 812 #endif
 813 static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
 814 static void pg_start_backup_callback(int code, Datum arg);
 815 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 816                                   bool *backupEndRequired, bool *backupFromStandby);
 817 static void rm_redo_error_callback(void *arg);
 818 static int      get_sync_bit(int method);
 819 /* WAL insertion path: space reservation, copying into the buffers, byte-position mapping */
 820 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
 821                                         XLogRecData *rdata,
 822                                         XLogRecPtr StartPos, XLogRecPtr EndPos);
 823 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
 824                                                   XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
 825 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
 826                                   XLogRecPtr *PrevPtr);
 827 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
 828 static char *GetXLogBuffer(XLogRecPtr ptr);
 829 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
 830 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
 831 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
 832 /* WAL insertion lock helpers */
 833 static void WALInsertLockAcquire(void);
 834 static void WALInsertLockAcquireExclusive(void);
 835 static void WALInsertLockRelease(void);
 836 static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
 837
 838 /*
 839  * Insert an XLOG record represented by an already-constructed chain of data
 840  * chunks.  This is a low-level routine; to construct the WAL record header
 841  * and data, use the higher-level routines in xloginsert.c.
 842  *
 843  * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
 844  * WAL record applies to, that were not included in the record as full page
 845  * images.  If fpw_lsn <= RedoRecPtr and full-page writes are in effect, the
 846  * function does not perform the insertion and returns InvalidXLogRecPtr.
 847  * The caller can then recalculate which pages need a full-page image, and
 848  * retry.  If fpw_lsn is invalid, the record is always inserted.
 849  *
 850  * The first XLogRecData in the chain must be for the record header, and its
 851  * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
 852  * xl_crc fields in the header, the rest of the header must already be filled
 853  * by the caller.
 854  *
 855  * Returns XLOG pointer to end of record (beginning of next record).
 856  * This can be used as LSN for data pages affected by the logged action.
 857  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 858  * before the data page can be written out.  This implements the basic
 859  * WAL rule "write the log before the data".)
 860  */
 861 XLogRecPtr
 862 XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn)
 863 {
 864         XLogCtlInsert *Insert = &XLogCtl->Insert;
 865         pg_crc32        rdata_crc;
 866         bool            inserted;
 867         XLogRecord *rechdr = (XLogRecord *) rdata->data;
 868         bool            isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
 869                                                            rechdr->xl_info == XLOG_SWITCH);
 870         XLogRecPtr      StartPos;
 871         XLogRecPtr      EndPos;
 872
 873         /* we assume that all of the record header is in the first chunk */
 874         Assert(rdata->len >= SizeOfXLogRecord);
 875
 876         /* cross-check on whether we should be here or not */
 877         if (!XLogInsertAllowed())
 878                 elog(ERROR, "cannot make new WAL entries during recovery");
 879
 880         /*----------
 881          *
 882          * We have now done all the preparatory work we can without holding a
 883          * lock or modifying shared state. From here on, inserting the new WAL
 884          * record to the shared WAL buffer cache is a two-step process:
 885          *
 886          * 1. Reserve the right amount of space from the WAL. The current head of
 887          *        reserved space is kept in Insert->CurrBytePos, and is protected by
 888          *        insertpos_lck.
 889          *
 890          * 2. Copy the record to the reserved WAL space. This involves finding the
 891          *        correct WAL buffer containing the reserved space, and copying the
 892          *        record in place. This can be done concurrently in multiple processes.
 893          *
 894          * To keep track of which insertions are still in-progress, each concurrent
 895          * inserter acquires an insertion lock. In addition to just indicating that
 896          * an insertion is in progress, the lock tells others how far the inserter
 897          * has progressed. There is a small fixed number of insertion locks,
 898          * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
 899          * boundary, it updates the value stored in the lock to how far it has
 900          * inserted, to allow the previous buffer to be flushed.
 901          *
 902          * Holding onto an insertion lock also protects RedoRecPtr and
 903          * fullPageWrites from changing until the insertion is finished.
 904          *
 905          * Step 2 can usually be done completely in parallel. If the required WAL
 906          * page is not initialized yet, you have to grab WALBufMappingLock to
 907          * initialize it, but the WAL writer tries to do that ahead of insertions
 908          * to avoid that from happening in the critical path.
 909          *
 910          *----------
 911          */
 912         START_CRIT_SECTION();
 913         if (isLogSwitch)
 914                 WALInsertLockAcquireExclusive();        /* a log switch takes every insertion lock */
 915         else
 916                 WALInsertLockAcquire();
 917
 918         /*
 919          * Check to see if my copy of RedoRecPtr or doPageWrites is out of date.
 920          * If so, may have to go back and have the caller recompute everything.
 921          * This can only happen just after a checkpoint, so it's better to be
 922          * slow in this case and fast otherwise.
 923          *
 924          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
 925          * affect the contents of the XLOG record, so we'll update our local copy
 926          * but not force a recomputation.  (If doPageWrites was just turned off,
 927          * we could recompute the record without full pages, but we choose not
 928          * to bother.)
 929          */
 930         if (RedoRecPtr != Insert->RedoRecPtr)
 931         {
 932                 Assert(RedoRecPtr < Insert->RedoRecPtr);
 933                 RedoRecPtr = Insert->RedoRecPtr;
 934         }
 935         doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
 936
 937         if (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr && doPageWrites)
 938         {
 939                 /*
 940                  * Oops, some buffer now needs to be backed up that the caller
 941                  * didn't back up.  Start over.
 942                  */
 943                 WALInsertLockRelease();
 944                 END_CRIT_SECTION();
 945                 return InvalidXLogRecPtr;
 946         }
 947
 948         /*
 949          * Reserve space for the record in the WAL. This also sets the xl_prev
 950          * pointer.
 951          */
 952         if (isLogSwitch)
 953                 inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
 954         else
 955         {
 956                 ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
 957                                                                   &rechdr->xl_prev);
 958                 inserted = true;
 959         }
 960
 961         if (inserted)
 962         {
 963                 /*
 964                  * Now that xl_prev has been filled in, calculate CRC of the record
 965                  * header.
 966                  */
 967                 rdata_crc = rechdr->xl_crc;     /* start from the value the caller left in xl_crc */
 968                 COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
 969                 FIN_CRC32C(rdata_crc);
 970                 rechdr->xl_crc = rdata_crc;
 971
 972                 /*
 973                  * All the record data, including the header, is now ready to be
 974                  * inserted. Copy the record in the space reserved.
 975                  */
 976                 CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
 977                                                         StartPos, EndPos);
 978         }
 979         else
 980         {
 981                 /*
 982                  * This was an xlog-switch record, but the current insert location was
 983                  * already exactly at the beginning of a segment, so there was no need
 984                  * to do anything.
 985                  */
 986         }
 987
 988         /*
 989          * Done! Let others know that we're finished.
 990          */
 991         WALInsertLockRelease();
 992
 993         MarkCurrentTransactionIdLoggedIfAny();
 994
 995         END_CRIT_SECTION();
 996
 997         /*
 998          * Update shared LogwrtRqst.Write, if we crossed page boundary.
 999          */
1000         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1001         {
1002                 SpinLockAcquire(&XLogCtl->info_lck);
1003                 /* advance global request to include new block(s) */
1004                 if (XLogCtl->LogwrtRqst.Write < EndPos)
1005                         XLogCtl->LogwrtRqst.Write = EndPos;
1006                 /* update local result copy while I have the chance */
1007                 LogwrtResult = XLogCtl->LogwrtResult;
1008                 SpinLockRelease(&XLogCtl->info_lck);
1009         }
1010
1011         /*
1012          * If this was an XLOG_SWITCH record, flush the record and the empty
1013          * padding space that fills the rest of the segment, and perform
1014          * end-of-segment actions (eg, notifying archiver).
1015          */
1016         if (isLogSwitch)
1017         {
1018                 TRACE_POSTGRESQL_XLOG_SWITCH();
1019                 XLogFlush(EndPos);
1020
1021                 /*
1022                  * Even though we reserved the rest of the segment for us, which is
1023                  * reflected in EndPos, we return a pointer to just the end of the
1024                  * xlog-switch record.
1025                  */
1026                 if (inserted)
1027                 {
1028                         EndPos = StartPos + SizeOfXLogRecord;
1029                         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1030                         {
1031                                 if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
1032                                         EndPos += SizeOfXLogLongPHD;
1033                                 else
1034                                         EndPos += SizeOfXLogShortPHD;
1035                         }
1036                 }
1037         }
1038
1039 #ifdef WAL_DEBUG
1040         if (XLOG_DEBUG)
1041         {
1042                 static XLogReaderState *debug_reader = NULL;
1043                 StringInfoData buf;
1044                 StringInfoData recordBuf;
1045                 char       *errormsg = NULL;
1046                 MemoryContext oldCxt;
1047
1048                 oldCxt = MemoryContextSwitchTo(walDebugCxt);
1049
1050                 initStringInfo(&buf);
1051                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1052                                                  (uint32) (EndPos >> 32), (uint32) EndPos);
1053
1054                 /*
1055                  * We have to piece together the WAL record data from the XLogRecData
1056                  * entries, so that we can pass it to the rm_desc function as one
1057                  * contiguous chunk.
1058                  */
1059                 initStringInfo(&recordBuf);
1060                 for (; rdata != NULL; rdata = rdata->next)
1061                         appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
1062
1063                 if (!debug_reader)
1064                         debug_reader = XLogReaderAllocate(NULL, NULL);  /* allocated once, then kept in the static */
1065
1066                 if (!debug_reader ||
1067                         !DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
1068                                                           &errormsg))
1069                 {
1070                         appendStringInfo(&buf, "error decoding record: %s",
1071                                                          errormsg ? errormsg : "no error message");
1072                 }
1073                 else
1074                 {
1075                         appendStringInfoString(&buf, " - ");
1076                         xlog_outdesc(&buf, debug_reader);
1077                 }
1078                 elog(LOG, "%s", buf.data);
1079
1080                 pfree(buf.data);
1081                 pfree(recordBuf.data);
1082                 MemoryContextSwitchTo(oldCxt);
1083         }
1084 #endif
1085
1086         /*
1087          * Update our global variables
1088          */
1089         ProcLastRecPtr = StartPos;
1090         XactLastRecEnd = EndPos;
1091
1092         return EndPos;
1093 }
1094
1095 /*
1096  * Reserves the right amount of space for a record of given size from the WAL.
1097  * *StartPos is set to the beginning of the reserved section, *EndPos to
1098  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1099  * used to set the xl_prev of this record.
1100  *
1101  * This is the performance critical part of XLogInsert that must be serialized
1102  * across backends. The rest can happen mostly in parallel. Try to keep this
1103  * section as short as possible, insertpos_lck can be heavily contended on a
1104  * busy system.
1105  *
1106  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1107  * where we actually copy the record to the reserved space.
1108  */
1109 static void
1110 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1111                                                   XLogRecPtr *PrevPtr)
1112 {
1113         XLogCtlInsert *Insert = &XLogCtl->Insert;
1114         uint64          startbytepos;
1115         uint64          endbytepos;
1116         uint64          prevbytepos;
1117
1118         size = MAXALIGN(size);
1119
1120         /* All (non xlog-switch) records should contain data. */
1121         Assert(size > SizeOfXLogRecord);
1122
1123         /*
1124          * The duration the spinlock needs to be held is minimized by minimizing
1125          * the calculations that have to be done while holding the lock. The
1126          * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1127          * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1128          * page headers. The mapping between "usable" byte positions and physical
1129          * positions (XLogRecPtrs) can be done outside the locked region, and
1130          * because the usable byte position doesn't include any headers, reserving
1131          * X bytes from WAL is almost as simple as "CurrBytePos += X".
1132          */
1133         SpinLockAcquire(&Insert->insertpos_lck);
1134
1135         startbytepos = Insert->CurrBytePos;
1136         endbytepos = startbytepos + size;
1137         prevbytepos = Insert->PrevBytePos;
1138         Insert->CurrBytePos = endbytepos;
1139         Insert->PrevBytePos = startbytepos;
1140
1141         SpinLockRelease(&Insert->insertpos_lck);
1142
1143         *StartPos = XLogBytePosToRecPtr(startbytepos);
1144         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1145         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1146
1147         /*
1148          * Check that the conversions between "usable byte positions" and
1149          * XLogRecPtrs work consistently in both directions.
1150          */
1151         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1152         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1153         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1154 }
1155
1156 /*
1157  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1158  *
1159  * A log-switch record is handled slightly differently. The rest of the
1160  * segment will be reserved for this insertion, as indicated by the returned
1161  * *EndPos value. However, if we are already at the beginning of the current
1162  * segment, *StartPos and *EndPos are set to the current location without
1163  * reserving any space, and the function returns false.
1164 */
1165 static bool
1166 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1167 {
1168         XLogCtlInsert *Insert = &XLogCtl->Insert;
1169         uint64          startbytepos;
1170         uint64          endbytepos;
1171         uint64          prevbytepos;
1172         uint32          size = MAXALIGN(SizeOfXLogRecord);
1173         XLogRecPtr      ptr;
1174         uint32          segleft;
1175
1176         /*
1177          * These calculations are a bit heavy-weight to be done while holding a
1178          * spinlock, but since we're holding all the WAL insertion locks, there
1179          * are no other inserters competing for it. GetXLogInsertRecPtr() does
1180          * compete for it, but that's not called very frequently.
1181          */
1182         SpinLockAcquire(&Insert->insertpos_lck);
1183
1184         startbytepos = Insert->CurrBytePos;
1185
1186         ptr = XLogBytePosToEndRecPtr(startbytepos);
1187         if (ptr % XLOG_SEG_SIZE == 0)
1188         {
1189                 SpinLockRelease(&Insert->insertpos_lck);
1190                 *EndPos = *StartPos = ptr;
1191                 return false;
1192         }
1193
1194         endbytepos = startbytepos + size;
1195         prevbytepos = Insert->PrevBytePos;
1196
1197         *StartPos = XLogBytePosToRecPtr(startbytepos);
1198         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1199
1200         segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
1201         if (segleft != XLOG_SEG_SIZE)
1202         {
1203                 /* consume the rest of the segment */
1204                 *EndPos += segleft;
1205                 endbytepos = XLogRecPtrToBytePos(*EndPos);
1206         }
1207         Insert->CurrBytePos = endbytepos;
1208         Insert->PrevBytePos = startbytepos;
1209
1210         SpinLockRelease(&Insert->insertpos_lck);
1211
1212         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1213
1214         Assert((*EndPos) % XLOG_SEG_SIZE == 0);
1215         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1216         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1217         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1218
1219         return true;
1220 }
1221
1222 /*
1223  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
1224  * area in the WAL.
1225  */
1226 static void
1227 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1228                                         XLogRecPtr StartPos, XLogRecPtr EndPos)
1229 {
1230         char       *currpos;
1231         int                     freespace;
1232         int                     written;
1233         XLogRecPtr      CurrPos;
1234         XLogPageHeader pagehdr;
1235
1236         /*
1237          * Get a pointer to the right place in the right WAL buffer to start
1238          * inserting to.
1239          */
1240         CurrPos = StartPos;
1241         currpos = GetXLogBuffer(CurrPos);
1242         freespace = INSERT_FREESPACE(CurrPos);
1243
1244         /*
1245          * there should be enough space for at least the first field (xl_tot_len)
1246          * on this page.
1247          */
1248         Assert(freespace >= sizeof(uint32));
1249
1250         /* Copy record data */
1251         written = 0;
1252         while (rdata != NULL)
1253         {
1254                 char       *rdata_data = rdata->data;
1255                 int                     rdata_len = rdata->len;
1256
1257                 while (rdata_len > freespace)
1258                 {
1259                         /*
1260                          * Write what fits on this page, and continue on the next page.
1261                          */
1262                         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1263                         memcpy(currpos, rdata_data, freespace);
1264                         rdata_data += freespace;
1265                         rdata_len -= freespace;
1266                         written += freespace;
1267                         CurrPos += freespace;
1268
1269                         /*
1270                          * Get pointer to beginning of next page, and set the xlp_rem_len
1271                          * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1272                          *
1273                          * It's safe to set the contrecord flag and xlp_rem_len without a
1274                          * lock on the page. All the other flags were already set when the
1275                          * page was initialized, in AdvanceXLInsertBuffer, and we're the
1276                          * only backend that needs to set the contrecord flag.
1277                          */
1278                         currpos = GetXLogBuffer(CurrPos);
1279                         pagehdr = (XLogPageHeader) currpos;
1280                         pagehdr->xlp_rem_len = write_len - written;
1281                         pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1282
1283                         /* skip over the page header */
1284                         if (CurrPos % XLogSegSize == 0)
1285                         {
1286                                 CurrPos += SizeOfXLogLongPHD;
1287                                 currpos += SizeOfXLogLongPHD;
1288                         }
1289                         else
1290                         {
1291                                 CurrPos += SizeOfXLogShortPHD;
1292                                 currpos += SizeOfXLogShortPHD;
1293                         }
1294                         freespace = INSERT_FREESPACE(CurrPos);
1295                 }
1296
1297                 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1298                 memcpy(currpos, rdata_data, rdata_len);
1299                 currpos += rdata_len;
1300                 CurrPos += rdata_len;
1301                 freespace -= rdata_len;
1302                 written += rdata_len;
1303
1304                 rdata = rdata->next;
1305         }
1306         Assert(written == write_len);
1307
1308         /*
1309          * If this was an xlog-switch, it's not enough to write the switch record,
1310          * we also have to consume all the remaining space in the WAL segment. We
1311          * have already reserved it for us, but we still need to make sure it's
1312          * allocated and zeroed in the WAL buffers so that when the caller (or
1313          * someone else) does XLogWrite(), it can really write out all the zeros.
1314          */
1315         if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
1316         {
1317                 /* An xlog-switch record doesn't contain any data besides the header */
1318                 Assert(write_len == SizeOfXLogRecord);
1319
1320                 /*
1321                  * We do this one page at a time, to make sure we don't deadlock
1322                  * against ourselves if wal_buffers < XLOG_SEG_SIZE.
1323                  */
1324                 Assert(EndPos % XLogSegSize == 0);
1325
1326                 /* Use up all the remaining space on the first page */
1327                 CurrPos += freespace;
1328
1329                 while (CurrPos < EndPos)
1330                 {
1331                         /* initialize the next page (if not initialized already) */
1332                         WALInsertLockUpdateInsertingAt(CurrPos);
1333                         AdvanceXLInsertBuffer(CurrPos, false);
1334                         CurrPos += XLOG_BLCKSZ;
1335                 }
1336         }
1337         else
1338         {
1339                 /* Align the end position, so that the next record starts aligned */
1340                 CurrPos = MAXALIGN64(CurrPos);
1341         }
1342
1343         if (CurrPos != EndPos)
1344                 elog(PANIC, "space reserved for WAL record does not match what was written");
1345 }
1346
1347 /*
1348  * Acquire a WAL insertion lock, for inserting to WAL.
1349  */
1350 static void
1351 WALInsertLockAcquire(void)
1352 {
1353         bool            immed;
1354
1355         /*
1356          * It doesn't matter which of the WAL insertion locks we acquire, so try
1357          * the one we used last time.  If the system isn't particularly busy, it's
1358          * a good bet that it's still available, and it's good to have some
1359          * affinity to a particular lock so that you don't unnecessarily bounce
1360          * cache lines between processes when there's no contention.
1361          *
1362          * If this is the first time through in this backend, pick a lock
1363          * (semi-)randomly.  This allows the locks to be used evenly if you have a
1364          * lot of very short connections.
1365          */
1366         static int      lockToTry = -1;
1367
1368         if (lockToTry == -1)
1369                 lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1370         MyLockNo = lockToTry;
1371
1372         /*
1373          * The insertingAt value is initially set to 0, as we don't know our
1374          * insert location yet.
1375          */
1376         immed = LWLockAcquireWithVar(&WALInsertLocks[MyLockNo].l.lock,
1377                                                                  &WALInsertLocks[MyLockNo].l.insertingAt,
1378                                                                  0);
1379         if (!immed)
1380         {
1381                 /*
1382                  * If we couldn't get the lock immediately, try another lock next
1383                  * time.  On a system with more insertion locks than concurrent
1384                  * inserters, this causes all the inserters to eventually migrate to a
1385                  * lock that no-one else is using.  On a system with more inserters
1386                  * than locks, it still helps to distribute the inserters evenly
1387                  * across the locks.
1388                  */
1389                 lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1390         }
1391 }
1392
1393 /*
1394  * Acquire all WAL insertion locks, to prevent other backends from inserting
1395  * to WAL.
1396  */
1397 static void
1398 WALInsertLockAcquireExclusive(void)
1399 {
1400         int                     i;
1401
1402         /*
1403          * When holding all the locks, we only update the last lock's insertingAt
1404          * indicator.  The others are set to 0xFFFFFFFFFFFFFFFF, which is higher
1405          * than any real XLogRecPtr value, to make sure that no-one blocks waiting
1406          * on those.
1407          */
1408         for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1409         {
1410                 LWLockAcquireWithVar(&WALInsertLocks[i].l.lock,
1411                                                          &WALInsertLocks[i].l.insertingAt,
1412                                                          UINT64CONST(0xFFFFFFFFFFFFFFFF));
1413         }
1414         LWLockAcquireWithVar(&WALInsertLocks[i].l.lock,
1415                                                  &WALInsertLocks[i].l.insertingAt,
1416                                                  0);
1417
1418         holdingAllLocks = true;
1419 }
1420
1421 /*
1422  * Release our insertion lock (or locks, if we're holding them all).
1423  */
1424 static void
1425 WALInsertLockRelease(void)
1426 {
1427         if (holdingAllLocks)
1428         {
1429                 int                     i;
1430
1431                 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1432                         LWLockRelease(&WALInsertLocks[i].l.lock);
1433
1434                 holdingAllLocks = false;
1435         }
1436         else
1437         {
1438                 LWLockRelease(&WALInsertLocks[MyLockNo].l.lock);
1439         }
1440 }
1441
1442 /*
1443  * Update our insertingAt value, to let others know that we've finished
1444  * inserting up to that point.
1445  */
1446 static void
1447 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1448 {
1449         if (holdingAllLocks)
1450         {
1451                 /*
1452                  * We use the last lock to mark our actual position, see comments in
1453                  * WALInsertLockAcquireExclusive.
1454                  */
1455                 LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1456                                          &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1457                                                 insertingAt);
1458         }
1459         else
1460                 LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1461                                                 &WALInsertLocks[MyLockNo].l.insertingAt,
1462                                                 insertingAt);
1463 }
1464
1465 /*
1466  * Wait for any WAL insertions < upto to finish.
1467  *
1468  * Returns the location of the oldest insertion that is still in-progress.
1469  * Any WAL prior to that point has been fully copied into WAL buffers, and
1470  * can be flushed out to disk. Because this waits for any insertions older
1471  * than 'upto' to finish, the return value is always >= 'upto'.
1472  *
1473  * Note: When you are about to write out WAL, you must call this function
1474  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1475  * need to wait for an insertion to finish (or at least advance to next
1476  * uninitialized page), and the inserter might need to evict an old WAL buffer
1477  * to make room for a new one, which in turn requires WALWriteLock.
1478  */
1479 static XLogRecPtr
1480 WaitXLogInsertionsToFinish(XLogRecPtr upto)
1481 {
1482         uint64          bytepos;
1483         XLogRecPtr      reservedUpto;
1484         XLogRecPtr      finishedUpto;
1485         XLogCtlInsert *Insert = &XLogCtl->Insert;
1486         int                     i;
1487
1488         if (MyProc == NULL)
1489                 elog(PANIC, "cannot wait without a PGPROC structure");
1490
1491         /* Read the current insert position */
1492         SpinLockAcquire(&Insert->insertpos_lck);
1493         bytepos = Insert->CurrBytePos;
1494         SpinLockRelease(&Insert->insertpos_lck);
1495         reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1496
1497         /*
1498          * No-one should request to flush a piece of WAL that hasn't even been
1499          * reserved yet. However, it can happen if there is a block with a bogus
1500          * LSN on disk, for example. XLogFlush checks for that situation and
1501          * complains, but only after the flush. Here we just assume that to mean
1502          * that all WAL that has been reserved needs to be finished. In this
1503          * corner-case, the return value can be smaller than 'upto' argument.
1504          */
1505         if (upto > reservedUpto)
1506         {
1507                 elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
1508                          (uint32) (upto >> 32), (uint32) upto,
1509                          (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
1510                 upto = reservedUpto;
1511         }
1512
1513         /*
1514          * Loop through all the locks, sleeping on any in-progress insert older
1515          * than 'upto'.
1516          *
1517          * finishedUpto is our return value, indicating the point upto which all
1518          * the WAL insertions have been finished. Initialize it to the head of
1519          * reserved WAL, and as we iterate through the insertion locks, back it
1520          * out for any insertion that's still in progress.
1521          */
1522         finishedUpto = reservedUpto;
1523         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1524         {
1525                 XLogRecPtr      insertingat = InvalidXLogRecPtr;
1526
1527                 do
1528                 {
1529                         /*
1530                          * See if this insertion is in progress. LWLockWait will wait for
1531                          * the lock to be released, or for the 'value' to be set by a
1532                          * LWLockUpdateVar call.  When a lock is initially acquired, its
1533                          * value is 0 (InvalidXLogRecPtr), which means that we don't know
1534                          * where it's inserting yet.  We will have to wait for it.  If
1535                          * it's a small insertion, the record will most likely fit on the
1536                          * same page and the inserter will release the lock without ever
1537                          * calling LWLockUpdateVar.  But if it has to sleep, it will
1538                          * advertise the insertion point with LWLockUpdateVar before
1539                          * sleeping.
1540                          */
1541                         if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
1542                                                                  &WALInsertLocks[i].l.insertingAt,
1543                                                                  insertingat, &insertingat))
1544                         {
1545                                 /* the lock was free, so no insertion in progress */
1546                                 insertingat = InvalidXLogRecPtr;
1547                                 break;
1548                         }
1549
1550                         /*
1551                          * This insertion is still in progress. Have to wait, unless the
1552                          * inserter has proceeded past 'upto'.
1553                          */
1554                 } while (insertingat < upto);
1555
1556                 if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
1557                         finishedUpto = insertingat;
1558         }
1559         return finishedUpto;
1560 }
1561
1562 /*
1563  * Get a pointer to the right location in the WAL buffer containing the
1564  * given XLogRecPtr.
1565  *
1566  * If the page is not initialized yet, it is initialized. That might require
1567  * evicting an old dirty buffer from the buffer cache, which means I/O.
1568  *
1569  * The caller must ensure that the page containing the requested location
1570  * isn't evicted yet, and won't be evicted. The way to ensure that is to
1571  * hold onto a WAL insertion lock with the insertingAt position set to
1572  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1573  * to evict an old page from the buffer. (This means that once you call
1574  * GetXLogBuffer() with a given 'ptr', you must not access anything before
1575  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1576  * later, because older buffers might be recycled already)
1577  */
1578 static char *
1579 GetXLogBuffer(XLogRecPtr ptr)
1580 {
1581         int                     idx;
1582         XLogRecPtr      endptr;
1583         static uint64 cachedPage = 0;
1584         static char *cachedPos = NULL;
1585         XLogRecPtr      expectedEndPtr;
1586
1587         /*
1588          * Fast path for the common case that we need to access again the same
1589          * page as last time.
1590          */
1591         if (ptr / XLOG_BLCKSZ == cachedPage)
1592         {
1593                 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1594                 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1595                 return cachedPos + ptr % XLOG_BLCKSZ;
1596         }
1597
1598         /*
1599          * The XLog buffer cache is organized so that a page is always loaded to a
1600          * particular buffer.  That way we can easily calculate the buffer a given
1601          * page must be loaded into, from the XLogRecPtr alone.
1602          */
1603         idx = XLogRecPtrToBufIdx(ptr);
1604
1605         /*
1606          * See what page is loaded in the buffer at the moment. It could be the
1607          * page we're looking for, or something older. It can't be anything newer
1608          * - that would imply the page we're looking for has already been written
1609          * out to disk and evicted, and the caller is responsible for making sure
1610          * that doesn't happen.
1611          *
1612          * However, we don't hold a lock while we read the value. If someone has
1613          * just initialized the page, it's possible that we get a "torn read" of
1614          * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
1615          * that case we will see a bogus value. That's ok, we'll grab the mapping
1616          * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
1617          * the page we're looking for. But it means that when we do this unlocked
1618          * read, we might see a value that appears to be ahead of the page we're
1619          * looking for. Don't PANIC on that, until we've verified the value while
1620          * holding the lock.
1621          */
1622         expectedEndPtr = ptr;
1623         expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
1624
1625         endptr = XLogCtl->xlblocks[idx];
1626         if (expectedEndPtr != endptr)
1627         {
1628                 /*
1629                  * Let others know that we're finished inserting the record up to the
1630                  * page boundary.
1631                  */
1632                 WALInsertLockUpdateInsertingAt(expectedEndPtr - XLOG_BLCKSZ);
1633
1634                 AdvanceXLInsertBuffer(ptr, false);
1635                 endptr = XLogCtl->xlblocks[idx];
1636
1637                 if (expectedEndPtr != endptr)
1638                         elog(PANIC, "could not find WAL buffer for %X/%X",
1639                                  (uint32) (ptr >> 32), (uint32) ptr);
1640         }
1641         else
1642         {
1643                 /*
1644                  * Make sure the initialization of the page is visible to us, and
1645                  * won't arrive later to overwrite the WAL data we write on the page.
1646                  */
1647                 pg_memory_barrier();
1648         }
1649
1650         /*
1651          * Found the buffer holding this page. Return a pointer to the right
1652          * offset within the page.
1653          */
1654         cachedPage = ptr / XLOG_BLCKSZ;
1655         cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1656
1657         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1658         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1659
1660         return cachedPos + ptr % XLOG_BLCKSZ;
1661 }
1662
1663 /*
1664  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1665  * is the position starting from the beginning of WAL, excluding all WAL
1666  * page headers.
1667  */
1668 static XLogRecPtr
1669 XLogBytePosToRecPtr(uint64 bytepos)
1670 {
1671         uint64          fullsegs;
1672         uint64          fullpages;
1673         uint64          bytesleft;
1674         uint32          seg_offset;
1675         XLogRecPtr      result;
1676
1677         fullsegs = bytepos / UsableBytesInSegment;
1678         bytesleft = bytepos % UsableBytesInSegment;
1679
1680         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1681         {
1682                 /* fits on first page of segment */
1683                 seg_offset = bytesleft + SizeOfXLogLongPHD;
1684         }
1685         else
1686         {
1687                 /* account for the first page on segment with long header */
1688                 seg_offset = XLOG_BLCKSZ;
1689                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1690
1691                 fullpages = bytesleft / UsableBytesInPage;
1692                 bytesleft = bytesleft % UsableBytesInPage;
1693
1694                 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1695         }
1696
1697         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
1698
1699         return result;
1700 }
1701
1702 /*
1703  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1704  * returns a pointer to the beginning of the page (ie. before page header),
1705  * not to where the first xlog record on that page would go to. This is used
1706  * when converting a pointer to the end of a record.
1707  */
1708 static XLogRecPtr
1709 XLogBytePosToEndRecPtr(uint64 bytepos)
1710 {
1711         uint64          fullsegs;
1712         uint64          fullpages;
1713         uint64          bytesleft;
1714         uint32          seg_offset;
1715         XLogRecPtr      result;
1716
1717         fullsegs = bytepos / UsableBytesInSegment;
1718         bytesleft = bytepos % UsableBytesInSegment;
1719
1720         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1721         {
1722                 /* fits on first page of segment */
1723                 if (bytesleft == 0)
1724                         seg_offset = 0;
1725                 else
1726                         seg_offset = bytesleft + SizeOfXLogLongPHD;
1727         }
1728         else
1729         {
1730                 /* account for the first page on segment with long header */
1731                 seg_offset = XLOG_BLCKSZ;
1732                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1733
1734                 fullpages = bytesleft / UsableBytesInPage;
1735                 bytesleft = bytesleft % UsableBytesInPage;
1736
1737                 if (bytesleft == 0)
1738                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
1739                 else
1740                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1741         }
1742
1743         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
1744
1745         return result;
1746 }
1747
1748 /*
1749  * Convert an XLogRecPtr to a "usable byte position".
1750  */
1751 static uint64
1752 XLogRecPtrToBytePos(XLogRecPtr ptr)
1753 {
1754         uint64          fullsegs;
1755         uint32          fullpages;
1756         uint32          offset;
1757         uint64          result;
1758
1759         XLByteToSeg(ptr, fullsegs);
1760
1761         fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
1762         offset = ptr % XLOG_BLCKSZ;
1763
1764         if (fullpages == 0)
1765         {
1766                 result = fullsegs * UsableBytesInSegment;
1767                 if (offset > 0)
1768                 {
1769                         Assert(offset >= SizeOfXLogLongPHD);
1770                         result += offset - SizeOfXLogLongPHD;
1771                 }
1772         }
1773         else
1774         {
1775                 result = fullsegs * UsableBytesInSegment +
1776                         (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
1777                         (fullpages - 1) * UsableBytesInPage;            /* full pages */
1778                 if (offset > 0)
1779                 {
1780                         Assert(offset >= SizeOfXLogShortPHD);
1781                         result += offset - SizeOfXLogShortPHD;
1782                 }
1783         }
1784
1785         return result;
1786 }
1787
1788 /*
1789  * Initialize XLOG buffers, writing out old buffers if they still contain
1790  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
1791  * true, initialize as many pages as we can without having to write out
1792  * unwritten data. Any new pages are initialized to zeros, with pages headers
1793  * initialized properly.
1794  */
1795 static void
1796 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
1797 {
1798         XLogCtlInsert *Insert = &XLogCtl->Insert;
1799         int                     nextidx;
1800         XLogRecPtr      OldPageRqstPtr;
1801         XLogwrtRqst WriteRqst;
1802         XLogRecPtr      NewPageEndPtr = InvalidXLogRecPtr;
1803         XLogRecPtr      NewPageBeginPtr;
1804         XLogPageHeader NewPage;
1805         int                     npages = 0;
1806
1807         LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
1808
1809         /*
1810          * Now that we have the lock, check if someone initialized the page
1811          * already.
1812          */
1813         while (upto >= XLogCtl->InitializedUpTo || opportunistic)
1814         {
1815                 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
1816
1817                 /*
1818                  * Get ending-offset of the buffer page we need to replace (this may
1819                  * be zero if the buffer hasn't been used yet).  Fall through if it's
1820                  * already written out.
1821                  */
1822                 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
1823                 if (LogwrtResult.Write < OldPageRqstPtr)
1824                 {
1825                         /*
1826                          * Nope, got work to do. If we just want to pre-initialize as much
1827                          * as we can without flushing, give up now.
1828                          */
1829                         if (opportunistic)
1830                                 break;
1831
1832                         /* Before waiting, get info_lck and update LogwrtResult */
1833                         SpinLockAcquire(&XLogCtl->info_lck);
1834                         if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
1835                                 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
1836                         LogwrtResult = XLogCtl->LogwrtResult;
1837                         SpinLockRelease(&XLogCtl->info_lck);
1838
1839                         /*
1840                          * Now that we have an up-to-date LogwrtResult value, see if we
1841                          * still need to write it or if someone else already did.
1842                          */
1843                         if (LogwrtResult.Write < OldPageRqstPtr)
1844                         {
1845                                 /*
1846                                  * Must acquire write lock. Release WALBufMappingLock first,
1847                                  * to make sure that all insertions that we need to wait for
1848                                  * can finish (up to this same position). Otherwise we risk
1849                                  * deadlock.
1850                                  */
1851                                 LWLockRelease(WALBufMappingLock);
1852
1853                                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
1854
1855                                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1856
1857                                 LogwrtResult = XLogCtl->LogwrtResult;
1858                                 if (LogwrtResult.Write >= OldPageRqstPtr)
1859                                 {
1860                                         /* OK, someone wrote it already */
1861                                         LWLockRelease(WALWriteLock);
1862                                 }
1863                                 else
1864                                 {
1865                                         /* Have to write it ourselves */
1866                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
1867                                         WriteRqst.Write = OldPageRqstPtr;
1868                                         WriteRqst.Flush = 0;
1869                                         XLogWrite(WriteRqst, false);
1870                                         LWLockRelease(WALWriteLock);
1871                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1872                                 }
1873                                 /* Re-acquire WALBufMappingLock and retry */
1874                                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
1875                                 continue;
1876                         }
1877                 }
1878
1879                 /*
1880                  * Now the next buffer slot is free and we can set it up to be the
1881                  * next output page.
1882                  */
1883                 NewPageBeginPtr = XLogCtl->InitializedUpTo;
1884                 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
1885
1886                 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
1887
1888                 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
1889
1890                 /*
1891                  * Be sure to re-zero the buffer so that bytes beyond what we've
1892                  * written will look like zeroes and not valid XLOG records...
1893                  */
1894                 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1895
1896                 /*
1897                  * Fill the new page's header
1898                  */
1899                 NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
1900
1901                 /* NewPage->xlp_info = 0; */    /* done by memset */
1902                 NewPage   ->xlp_tli = ThisTimeLineID;
1903                 NewPage   ->xlp_pageaddr = NewPageBeginPtr;
1904
1905                 /* NewPage->xlp_rem_len = 0; */ /* done by memset */
1906
1907                 /*
1908                  * If online backup is not in progress, mark the header to indicate
1909                  * that* WAL records beginning in this page have removable backup
1910                  * blocks.  This allows the WAL archiver to know whether it is safe to
1911                  * compress archived WAL data by transforming full-block records into
1912                  * the non-full-block format.  It is sufficient to record this at the
1913                  * page level because we force a page switch (in fact a segment
1914                  * switch) when starting a backup, so the flag will be off before any
1915                  * records can be written during the backup.  At the end of a backup,
1916                  * the last page will be marked as all unsafe when perhaps only part
1917                  * is unsafe, but at worst the archiver would miss the opportunity to
1918                  * compress a few records.
1919                  */
1920                 if (!Insert->forcePageWrites)
1921                         NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;
1922
1923                 /*
1924                  * If first page of an XLOG segment file, make it a long header.
1925                  */
1926                 if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
1927                 {
1928                         XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1929
1930                         NewLongPage->xlp_sysid = ControlFile->system_identifier;
1931                         NewLongPage->xlp_seg_size = XLogSegSize;
1932                         NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
1933                         NewPage   ->xlp_info |= XLP_LONG_HEADER;
1934                 }
1935
1936                 /*
1937                  * Make sure the initialization of the page becomes visible to others
1938                  * before the xlblocks update. GetXLogBuffer() reads xlblocks without
1939                  * holding a lock.
1940                  */
1941                 pg_write_barrier();
1942
1943                 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
1944
1945                 XLogCtl->InitializedUpTo = NewPageEndPtr;
1946
1947                 npages++;
1948         }
1949         LWLockRelease(WALBufMappingLock);
1950
1951 #ifdef WAL_DEBUG
1952         if (npages > 0)
1953         {
1954                 elog(DEBUG1, "initialized %d pages, upto %X/%X",
1955                          npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
1956         }
1957 #endif
1958 }
1959
1960 /*
1961  * Check whether we've consumed enough xlog space that a checkpoint is needed.
1962  *
1963  * new_segno indicates a log file that has just been filled up (or read
1964  * during recovery). We measure the distance from RedoRecPtr to new_segno
1965  * and see if that exceeds CheckPointSegments.
1966  *
1967  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
1968  */
1969 static bool
1970 XLogCheckpointNeeded(XLogSegNo new_segno)
1971 {
1972         XLogSegNo       old_segno;
1973
1974         XLByteToSeg(RedoRecPtr, old_segno);
1975
1976         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
1977                 return true;
1978         return false;
1979 }
1980
1981 /*
1982  * Write and/or fsync the log at least as far as WriteRqst indicates.
1983  *
1984  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
1985  * may stop at any convenient boundary (such as a cache or logfile boundary).
1986  * This option allows us to avoid uselessly issuing multiple writes when a
1987  * single one would do.
1988  *
1989  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
1990  * must be called before grabbing the lock, to make sure the data is ready to
1991  * write.
1992  */
1993 static void
1994 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
1995 {
1996         bool            ispartialpage;
1997         bool            last_iteration;
1998         bool            finishing_seg;
1999         bool            use_existent;
2000         int                     curridx;
2001         int                     npages;
2002         int                     startidx;
2003         uint32          startoffset;
2004
2005         /* We should always be inside a critical section here */
2006         Assert(CritSectionCount > 0);
2007
2008         /*
2009          * Update local LogwrtResult (caller probably did this already, but...)
2010          */
2011         LogwrtResult = XLogCtl->LogwrtResult;
2012
2013         /*
2014          * Since successive pages in the xlog cache are consecutively allocated,
2015          * we can usually gather multiple pages together and issue just one
2016          * write() call.  npages is the number of pages we have determined can be
2017          * written together; startidx is the cache block index of the first one,
2018          * and startoffset is the file offset at which it should go. The latter
2019          * two variables are only valid when npages > 0, but we must initialize
2020          * all of them to keep the compiler quiet.
2021          */
2022         npages = 0;
2023         startidx = 0;
2024         startoffset = 0;
2025
2026         /*
2027          * Within the loop, curridx is the cache block index of the page to
2028          * consider writing.  Begin at the buffer containing the next unwritten
2029          * page, or last partially written page.
2030          */
2031         curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2032
2033         while (LogwrtResult.Write < WriteRqst.Write)
2034         {
2035                 /*
2036                  * Make sure we're not ahead of the insert process.  This could happen
2037                  * if we're passed a bogus WriteRqst.Write that is past the end of the
2038                  * last page that's been initialized by AdvanceXLInsertBuffer.
2039                  */
2040                 XLogRecPtr      EndPtr = XLogCtl->xlblocks[curridx];
2041
2042                 if (LogwrtResult.Write >= EndPtr)
2043                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2044                                  (uint32) (LogwrtResult.Write >> 32),
2045                                  (uint32) LogwrtResult.Write,
2046                                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
2047
2048                 /* Advance LogwrtResult.Write to end of current buffer page */
2049                 LogwrtResult.Write = EndPtr;
2050                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2051
2052                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2053                 {
2054                         /*
2055                          * Switch to new logfile segment.  We cannot have any pending
2056                          * pages here (since we dump what we have at segment end).
2057                          */
2058                         Assert(npages == 0);
2059                         if (openLogFile >= 0)
2060                                 XLogFileClose();
2061                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2062
2063                         /* create/use new log file */
2064                         use_existent = true;
2065                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2066                         openLogOff = 0;
2067                 }
2068
2069                 /* Make sure we have the current logfile open */
2070                 if (openLogFile < 0)
2071                 {
2072                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2073                         openLogFile = XLogFileOpen(openLogSegNo);
2074                         openLogOff = 0;
2075                 }
2076
2077                 /* Add current page to the set of pending pages-to-dump */
2078                 if (npages == 0)
2079                 {
2080                         /* first of group */
2081                         startidx = curridx;
2082                         startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
2083                 }
2084                 npages++;
2085
2086                 /*
2087                  * Dump the set if this will be the last loop iteration, or if we are
2088                  * at the last page of the cache area (since the next page won't be
2089                  * contiguous in memory), or if we are at the end of the logfile
2090                  * segment.
2091                  */
2092                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2093
2094                 finishing_seg = !ispartialpage &&
2095                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
2096
2097                 if (last_iteration ||
2098                         curridx == XLogCtl->XLogCacheBlck ||
2099                         finishing_seg)
2100                 {
2101                         char       *from;
2102                         Size            nbytes;
2103                         Size            nleft;
2104                         int                     written;
2105
2106                         /* Need to seek in the file? */
2107                         if (openLogOff != startoffset)
2108                         {
2109                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
2110                                         ereport(PANIC,
2111                                                         (errcode_for_file_access(),
2112                                          errmsg("could not seek in log file %s to offset %u: %m",
2113                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2114                                                         startoffset)));
2115                                 openLogOff = startoffset;
2116                         }
2117
2118                         /* OK to write the page(s) */
2119                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2120                         nbytes = npages * (Size) XLOG_BLCKSZ;
2121                         nleft = nbytes;
2122                         do
2123                         {
2124                                 errno = 0;
2125                                 written = write(openLogFile, from, nleft);
2126                                 if (written <= 0)
2127                                 {
2128                                         if (errno == EINTR)
2129                                                 continue;
2130                                         ereport(PANIC,
2131                                                         (errcode_for_file_access(),
2132                                                          errmsg("could not write to log file %s "
2133                                                                         "at offset %u, length %zu: %m",
2134                                                                  XLogFileNameP(ThisTimeLineID, openLogSegNo),
2135                                                                         openLogOff, nbytes)));
2136                                 }
2137                                 nleft -= written;
2138                                 from += written;
2139                         } while (nleft > 0);
2140
2141                         /* Update state for write */
2142                         openLogOff += nbytes;
2143                         npages = 0;
2144
2145                         /*
2146                          * If we just wrote the whole last page of a logfile segment,
2147                          * fsync the segment immediately.  This avoids having to go back
2148                          * and re-open prior segments when an fsync request comes along
2149                          * later. Doing it here ensures that one and only one backend will
2150                          * perform this fsync.
2151                          *
2152                          * This is also the right place to notify the Archiver that the
2153                          * segment is ready to copy to archival storage, and to update the
2154                          * timer for archive_timeout, and to signal for a checkpoint if
2155                          * too many logfile segments have been used since the last
2156                          * checkpoint.
2157                          */
2158                         if (finishing_seg)
2159                         {
2160                                 issue_xlog_fsync(openLogFile, openLogSegNo);
2161
2162                                 /* signal that we need to wakeup walsenders later */
2163                                 WalSndWakeupRequest();
2164
2165                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
2166
2167                                 if (XLogArchivingActive())
2168                                         XLogArchiveNotifySeg(openLogSegNo);
2169
2170                                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2171
2172                                 /*
2173                                  * Request a checkpoint if we've consumed too much xlog since
2174                                  * the last one.  For speed, we first check using the local
2175                                  * copy of RedoRecPtr, which might be out of date; if it looks
2176                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
2177                                  * recheck.
2178                                  */
2179                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2180                                 {
2181                                         (void) GetRedoRecPtr();
2182                                         if (XLogCheckpointNeeded(openLogSegNo))
2183                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2184                                 }
2185                         }
2186                 }
2187
2188                 if (ispartialpage)
2189                 {
2190                         /* Only asked to write a partial page */
2191                         LogwrtResult.Write = WriteRqst.Write;
2192                         break;
2193                 }
2194                 curridx = NextBufIdx(curridx);
2195
2196                 /* If flexible, break out of loop as soon as we wrote something */
2197                 if (flexible && npages == 0)
2198                         break;
2199         }
2200
2201         Assert(npages == 0);
2202
2203         /*
2204          * If asked to flush, do so
2205          */
2206         if (LogwrtResult.Flush < WriteRqst.Flush &&
2207                 LogwrtResult.Flush < LogwrtResult.Write)
2208
2209         {
2210                 /*
2211                  * Could get here without iterating above loop, in which case we might
2212                  * have no open file or the wrong one.  However, we do not need to
2213                  * fsync more than one file.
2214                  */
2215                 if (sync_method != SYNC_METHOD_OPEN &&
2216                         sync_method != SYNC_METHOD_OPEN_DSYNC)
2217                 {
2218                         if (openLogFile >= 0 &&
2219                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2220                                 XLogFileClose();
2221                         if (openLogFile < 0)
2222                         {
2223                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2224                                 openLogFile = XLogFileOpen(openLogSegNo);
2225                                 openLogOff = 0;
2226                         }
2227
2228                         issue_xlog_fsync(openLogFile, openLogSegNo);
2229                 }
2230
2231                 /* signal that we need to wakeup walsenders later */
2232                 WalSndWakeupRequest();
2233
2234                 LogwrtResult.Flush = LogwrtResult.Write;
2235         }
2236
2237         /*
2238          * Update shared-memory status
2239          *
2240          * We make sure that the shared 'request' values do not fall behind the
2241          * 'result' values.  This is not absolutely essential, but it saves some
2242          * code in a couple of places.
2243          */
2244         {
2245                 SpinLockAcquire(&XLogCtl->info_lck);
2246                 XLogCtl->LogwrtResult = LogwrtResult;
2247                 if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
2248                         XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
2249                 if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
2250                         XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
2251                 SpinLockRelease(&XLogCtl->info_lck);
2252         }
2253 }
2254
2255 /*
2256  * Record the LSN for an asynchronous transaction commit/abort
2257  * and nudge the WALWriter if there is work for it to do.
2258  * (This should not be called for synchronous commits.)
2259  */
2260 void
2261 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2262 {
2263         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
2264         bool            sleeping;
2265
2266         SpinLockAcquire(&XLogCtl->info_lck);
2267         LogwrtResult = XLogCtl->LogwrtResult;
2268         sleeping = XLogCtl->WalWriterSleeping;
2269         if (XLogCtl->asyncXactLSN < asyncXactLSN)
2270                 XLogCtl->asyncXactLSN = asyncXactLSN;
2271         SpinLockRelease(&XLogCtl->info_lck);
2272
2273         /*
2274          * If the WALWriter is sleeping, we should kick it to make it come out of
2275          * low-power mode.  Otherwise, determine whether there's a full page of
2276          * WAL available to write.
2277          */
2278         if (!sleeping)
2279         {
2280                 /* back off to last completed page boundary */
2281                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2282
2283                 /* if we have already flushed that far, we're done */
2284                 if (WriteRqstPtr <= LogwrtResult.Flush)
2285                         return;
2286         }
2287
2288         /*
2289          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2290          * to come out of low-power mode so that this async commit will reach disk
2291          * within the expected amount of time.
2292          */
2293         if (ProcGlobal->walwriterLatch)
2294                 SetLatch(ProcGlobal->walwriterLatch);
2295 }
2296
2297 /*
2298  * Record the LSN up to which we can remove WAL because it's not required by
2299  * any replication slot.
2300  */
2301 void
2302 XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2303 {
2304         SpinLockAcquire(&XLogCtl->info_lck);
2305         XLogCtl->replicationSlotMinLSN = lsn;
2306         SpinLockRelease(&XLogCtl->info_lck);
2307 }
2308
2309
2310 /*
2311  * Return the oldest LSN we must retain to satisfy the needs of some
2312  * replication slot.
2313  */
2314 static XLogRecPtr
2315 XLogGetReplicationSlotMinimumLSN(void)
2316 {
2317         XLogRecPtr      retval;
2318
2319         SpinLockAcquire(&XLogCtl->info_lck);
2320         retval = XLogCtl->replicationSlotMinLSN;
2321         SpinLockRelease(&XLogCtl->info_lck);
2322
2323         return retval;
2324 }
2325
2326 /*
2327  * Advance minRecoveryPoint in control file.
2328  *
2329  * If we crash during recovery, we must reach this point again before the
2330  * database is consistent.
2331  *
2332  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2333  * is only updated if it's not already greater than or equal to 'lsn'.
2334  */
2335 static void
2336 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2337 {
2338         /* Quick check using our local copy of the variable */
2339         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2340                 return;
2341
2342         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2343
2344         /* update local copy */
2345         minRecoveryPoint = ControlFile->minRecoveryPoint;
2346         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2347
2348         /*
2349          * An invalid minRecoveryPoint means that we need to recover all the WAL,
2350          * i.e., we're doing crash recovery.  We never modify the control file's
2351          * value in that case, so we can short-circuit future checks here too.
2352          */
2353         if (minRecoveryPoint == 0)
2354                 updateMinRecoveryPoint = false;
2355         else if (force || minRecoveryPoint < lsn)
2356         {
2357                 XLogRecPtr      newMinRecoveryPoint;
2358                 TimeLineID      newMinRecoveryPointTLI;
2359
2360                 /*
2361                  * To avoid having to update the control file too often, we update it
2362                  * all the way to the last record being replayed, even though 'lsn'
2363                  * would suffice for correctness.  This also allows the 'force' case
2364                  * to not need a valid 'lsn' value.
2365                  *
2366                  * Another important reason for doing it this way is that the passed
2367                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2368                  * the caller got it from a corrupted heap page.  Accepting such a
2369                  * value as the min recovery point would prevent us from coming up at
2370                  * all.  Instead, we just log a warning and continue with recovery.
2371                  * (See also the comments about corrupt LSNs in XLogFlush.)
2372                  */
2373                 SpinLockAcquire(&XLogCtl->info_lck);
2374                 newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
2375                 newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
2376                 SpinLockRelease(&XLogCtl->info_lck);
2377
2378                 if (!force && newMinRecoveryPoint < lsn)
2379                         elog(WARNING,
2380                            "xlog min recovery request %X/%X is past current point %X/%X",
2381                                  (uint32) (lsn >> 32), (uint32) lsn,
2382                                  (uint32) (newMinRecoveryPoint >> 32),
2383                                  (uint32) newMinRecoveryPoint);
2384
2385                 /* update control file */
2386                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2387                 {
2388                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2389                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2390                         UpdateControlFile();
2391                         minRecoveryPoint = newMinRecoveryPoint;
2392                         minRecoveryPointTLI = newMinRecoveryPointTLI;
2393
2394                         ereport(DEBUG2,
2395                                 (errmsg("updated min recovery point to %X/%X on timeline %u",
2396                                                 (uint32) (minRecoveryPoint >> 32),
2397                                                 (uint32) minRecoveryPoint,
2398                                                 newMinRecoveryPointTLI)));
2399                 }
2400         }
2401         LWLockRelease(ControlFileLock);
2402 }
2403
2404 /*
2405  * Ensure that all XLOG data through the given position is flushed to disk.
2406  *
2407  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2408  * already held, and we try to avoid acquiring it if possible.
2409  */
2410 void
2411 XLogFlush(XLogRecPtr record)
2412 {
2413         XLogRecPtr      WriteRqstPtr;
2414         XLogwrtRqst WriteRqst;
2415
2416         /*
2417          * During REDO, we are reading not writing WAL.  Therefore, instead of
2418          * trying to flush the WAL, we should update minRecoveryPoint instead. We
2419          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2420          * to act this way too, and because when it tries to write the
2421          * end-of-recovery checkpoint, it should indeed flush.
2422          */
2423         if (!XLogInsertAllowed())
2424         {
2425                 UpdateMinRecoveryPoint(record, false);
2426                 return;
2427         }
2428
2429         /* Quick exit if already known flushed */
2430         if (record <= LogwrtResult.Flush)
2431                 return;
2432
2433 #ifdef WAL_DEBUG
2434         if (XLOG_DEBUG)
2435                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2436                          (uint32) (record >> 32), (uint32) record,
2437                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2438                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2439 #endif
2440
2441         START_CRIT_SECTION();
2442
2443         /*
2444          * Since fsync is usually a horribly expensive operation, we try to
2445          * piggyback as much data as we can on each fsync: if we see any more data
2446          * entered into the xlog buffer, we'll write and fsync that too, so that
2447          * the final value of LogwrtResult.Flush is as large as possible. This
2448          * gives us some chance of avoiding another fsync immediately after.
2449          */
2450
2451         /* initialize to given target; may increase below */
2452         WriteRqstPtr = record;
2453
2454         /*
2455          * Now wait until we get the write lock, or someone else does the flush
2456          * for us.
2457          */
2458         for (;;)
2459         {
2460                 XLogRecPtr      insertpos;
2461
2462                 /* read LogwrtResult and update local state */
2463                 SpinLockAcquire(&XLogCtl->info_lck);
2464                 if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2465                         WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2466                 LogwrtResult = XLogCtl->LogwrtResult;
2467                 SpinLockRelease(&XLogCtl->info_lck);
2468
2469                 /* done already? */
2470                 if (record <= LogwrtResult.Flush)
2471                         break;
2472
2473                 /*
2474                  * Before actually performing the write, wait for all in-flight
2475                  * insertions to the pages we're about to write to finish.
2476                  */
2477                 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2478
2479                 /*
2480                  * Try to get the write lock. If we can't get it immediately, wait
2481                  * until it's released, and recheck if we still need to do the flush
2482                  * or if the backend that held the lock did it for us already. This
2483                  * helps to maintain a good rate of group committing when the system
2484                  * is bottlenecked by the speed of fsyncing.
2485                  */
2486                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2487                 {
2488                         /*
2489                          * The lock is now free, but we didn't acquire it yet. Before we
2490                          * do, loop back to check if someone else flushed the record for
2491                          * us already.
2492                          */
2493                         continue;
2494                 }
2495
2496                 /* Got the lock; recheck whether request is satisfied */
2497                 LogwrtResult = XLogCtl->LogwrtResult;
2498                 if (record <= LogwrtResult.Flush)
2499                 {
2500                         LWLockRelease(WALWriteLock);
2501                         break;
2502                 }
2503
2504                 /*
2505                  * Sleep before flush! By adding a delay here, we may give further
2506                  * backends the opportunity to join the backlog of group commit
2507                  * followers; this can significantly improve transaction throughput,
2508                  * at the risk of increasing transaction latency.
2509                  *
2510                  * We do not sleep if enableFsync is not turned on, nor if there are
2511                  * fewer than CommitSiblings other backends with active transactions.
2512                  */
2513                 if (CommitDelay > 0 && enableFsync &&
2514                         MinimumActiveBackends(CommitSiblings))
2515                 {
2516                         pg_usleep(CommitDelay);
2517
2518                         /*
2519                          * Re-check how far we can now flush the WAL. It's generally not
2520                          * safe to call WaitXLogInsetionsToFinish while holding
2521                          * WALWriteLock, because an in-progress insertion might need to
2522                          * also grab WALWriteLock to make progress. But we know that all
2523                          * the insertions up to insertpos have already finished, because
2524                          * that's what the earlier WaitXLogInsertionsToFinish() returned.
2525                          * We're only calling it again to allow insertpos to be moved
2526                          * further forward, not to actually wait for anyone.
2527                          */
2528                         insertpos = WaitXLogInsertionsToFinish(insertpos);
2529                 }
2530
2531                 /* try to write/flush later additions to XLOG as well */
2532                 WriteRqst.Write = insertpos;
2533                 WriteRqst.Flush = insertpos;
2534
2535                 XLogWrite(WriteRqst, false);
2536
2537                 LWLockRelease(WALWriteLock);
2538                 /* done */
2539                 break;
2540         }
2541
2542         END_CRIT_SECTION();
2543
2544         /* wake up walsenders now that we've released heavily contended locks */
2545         WalSndWakeupProcessRequests();
2546
2547         /*
2548          * If we still haven't flushed to the request point then we have a
2549          * problem; most likely, the requested flush point is past end of XLOG.
2550          * This has been seen to occur when a disk page has a corrupted LSN.
2551          *
2552          * Formerly we treated this as a PANIC condition, but that hurts the
2553          * system's robustness rather than helping it: we do not want to take down
2554          * the whole system due to corruption on one data page.  In particular, if
2555          * the bad page is encountered again during recovery then we would be
2556          * unable to restart the database at all!  (This scenario actually
2557          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
2558          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2559          * the only time we can reach here during recovery is while flushing the
2560          * end-of-recovery checkpoint record, and we don't expect that to have a
2561          * bad LSN.
2562          *
2563          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2564          * since xact.c calls this routine inside a critical section.  However,
2565          * calls from bufmgr.c are not within critical sections and so we will not
2566          * force a restart for a bad LSN on a data page.
2567          */
2568         if (LogwrtResult.Flush < record)
2569                 elog(ERROR,
2570                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2571                          (uint32) (record >> 32), (uint32) record,
2572                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2573 }
2574
2575 /*
2576  * Flush xlog, but without specifying exactly where to flush to.
2577  *
2578  * We normally flush only completed blocks; but if there is nothing to do on
2579  * that basis, we check for unflushed async commits in the current incomplete
2580  * block, and flush through the latest one of those.  Thus, if async commits
2581  * are not being used, we will flush complete blocks only.  We can guarantee
2582  * that async commits reach disk after at most three cycles; normally only
2583  * one or two.  (When flushing complete blocks, we allow XLogWrite to write
2584  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
2585  * difference only with very high load or long wal_writer_delay, but imposes
2586  * one extra cycle for the worst case for async commits.)
2587  *
2588  * This routine is invoked periodically by the background walwriter process.
2589  *
2590  * Returns TRUE if we flushed anything.
2591  */
2592 bool
2593 XLogBackgroundFlush(void)
2594 {
2595         XLogRecPtr      WriteRqstPtr;
2596         bool            flexible = true;
2597         bool            wrote_something = false;
2598
2599         /* XLOG doesn't need flushing during recovery */
2600         if (RecoveryInProgress())
2601                 return false;
2602
2603         /* read LogwrtResult and update local state */
2604         SpinLockAcquire(&XLogCtl->info_lck);
2605         LogwrtResult = XLogCtl->LogwrtResult;
2606         WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2607         SpinLockRelease(&XLogCtl->info_lck);
2608
2609         /* back off to last completed page boundary */
2610         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2611
2612         /* if we have already flushed that far, consider async commit records */
2613         if (WriteRqstPtr <= LogwrtResult.Flush)
2614         {
2615                 SpinLockAcquire(&XLogCtl->info_lck);
2616                 WriteRqstPtr = XLogCtl->asyncXactLSN;
2617                 SpinLockRelease(&XLogCtl->info_lck);
2618                 flexible = false;               /* ensure it all gets written */
2619         }
2620
2621         /*
2622          * If already known flushed, we're done. Just need to check if we are
2623          * holding an open file handle to a logfile that's no longer in use,
2624          * preventing the file from being deleted.
2625          */
2626         if (WriteRqstPtr <= LogwrtResult.Flush)
2627         {
2628                 if (openLogFile >= 0)
2629                 {
2630                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2631                         {
2632                                 XLogFileClose();
2633                         }
2634                 }
2635                 return false;
2636         }
2637
2638 #ifdef WAL_DEBUG
2639         if (XLOG_DEBUG)
2640                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
2641                          (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
2642                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2643                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2644 #endif
2645
2646         START_CRIT_SECTION();
2647
2648         /* now wait for any in-progress insertions to finish and get write lock */
2649         WaitXLogInsertionsToFinish(WriteRqstPtr);
2650         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2651         LogwrtResult = XLogCtl->LogwrtResult;
2652         if (WriteRqstPtr > LogwrtResult.Flush)
2653         {
2654                 XLogwrtRqst WriteRqst;
2655
2656                 WriteRqst.Write = WriteRqstPtr;
2657                 WriteRqst.Flush = WriteRqstPtr;
2658                 XLogWrite(WriteRqst, flexible);
2659                 wrote_something = true;
2660         }
2661         LWLockRelease(WALWriteLock);
2662
2663         END_CRIT_SECTION();
2664
2665         /* wake up walsenders now that we've released heavily contended locks */
2666         WalSndWakeupProcessRequests();
2667
2668         /*
2669          * Great, done. To take some work off the critical path, try to initialize
2670          * as many of the no-longer-needed WAL buffers for future use as we can.
2671          */
2672         AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
2673
2674         return wrote_something;
2675 }
2676
2677 /*
2678  * Test whether XLOG data has been flushed up to (at least) the given position.
2679  *
2680  * Returns true if a flush is still needed.  (It may be that someone else
2681  * is already in process of flushing that far, however.)
2682  */
2683 bool
2684 XLogNeedsFlush(XLogRecPtr record)
2685 {
2686         /*
2687          * During recovery, we don't flush WAL but update minRecoveryPoint
2688          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
2689          * would need to be updated.
2690          */
2691         if (RecoveryInProgress())
2692         {
2693                 /* Quick exit if already known updated */
2694                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
2695                         return false;
2696
2697                 /*
2698                  * Update local copy of minRecoveryPoint. But if the lock is busy,
2699                  * just return a conservative guess.
2700                  */
2701                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
2702                         return true;
2703                 minRecoveryPoint = ControlFile->minRecoveryPoint;
2704                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2705                 LWLockRelease(ControlFileLock);
2706
2707                 /*
2708                  * An invalid minRecoveryPoint means that we need to recover all the
2709                  * WAL, i.e., we're doing crash recovery.  We never modify the control
2710                  * file's value in that case, so we can short-circuit future checks
2711                  * here too.
2712                  */
2713                 if (minRecoveryPoint == 0)
2714                         updateMinRecoveryPoint = false;
2715
2716                 /* check again */
2717                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
2718                         return false;
2719                 else
2720                         return true;
2721         }
2722
2723         /* Quick exit if already known flushed */
2724         if (record <= LogwrtResult.Flush)
2725                 return false;
2726
2727         /* read LogwrtResult and update local state */
2728         SpinLockAcquire(&XLogCtl->info_lck);
2729         LogwrtResult = XLogCtl->LogwrtResult;
2730         SpinLockRelease(&XLogCtl->info_lck);
2731
2732         /* check again */
2733         if (record <= LogwrtResult.Flush)
2734                 return false;
2735
2736         return true;
2737 }
2738
2739 /*
2740  * Create a new XLOG file segment, or open a pre-existing one.
2741  *
2742  * logsegno: identifies the segment to be created/opened.
2743  *
2744  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
2745  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
2746  * file was used.
2747  *
2748  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2749  * place.  This should be TRUE except during bootstrap log creation.  The
2750  * caller must *not* hold the lock at call.
2751  *
2752  * Returns FD of opened file.
2753  *
2754  * Note: errors here are ERROR not PANIC because we might or might not be
2755  * inside a critical section (eg, during checkpoint there is no reason to
2756  * take down the system on failure).  They will promote to PANIC if we are
2757  * in a critical section.
2758  */
2759 int
2760 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
2761 {
2762         char            path[MAXPGPATH];
2763         char            tmppath[MAXPGPATH];
2764         char            zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF];
2765         char       *zbuffer;
2766         XLogSegNo       installed_segno;
2767         int                     max_advance;
2768         int                     fd;
2769         int                     nbytes;
2770
2771         XLogFilePath(path, ThisTimeLineID, logsegno);
2772
2773         /*
2774          * Try to use existent file (checkpoint maker may have created it already)
2775          */
2776         if (*use_existent)
2777         {
2778                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2779                                                    S_IRUSR | S_IWUSR);
2780                 if (fd < 0)
2781                 {
2782                         if (errno != ENOENT)
2783                                 ereport(ERROR,
2784                                                 (errcode_for_file_access(),
2785                                                  errmsg("could not open file \"%s\": %m", path)));
2786                 }
2787                 else
2788                         return fd;
2789         }
2790
2791         /*
2792          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
2793          * another process is doing the same thing.  If so, we will end up
2794          * pre-creating an extra log segment.  That seems OK, and better than
2795          * holding the lock throughout this lengthy process.
2796          */
2797         elog(DEBUG2, "creating and filling new WAL file");
2798
2799         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2800
2801         unlink(tmppath);
2802
2803         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2804         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2805                                            S_IRUSR | S_IWUSR);
2806         if (fd < 0)
2807                 ereport(ERROR,
2808                                 (errcode_for_file_access(),
2809                                  errmsg("could not create file \"%s\": %m", tmppath)));
2810
2811         /*
2812          * Zero-fill the file.  We have to do this the hard way to ensure that all
2813          * the file space has really been allocated --- on platforms that allow
2814          * "holes" in files, just seeking to the end doesn't allocate intermediate
2815          * space.  This way, we know that we have all the space and (after the
2816          * fsync below) that all the indirect blocks are down on disk.  Therefore,
2817          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
2818          * log file.
2819          *
2820          * Note: ensure the buffer is reasonably well-aligned; this may save a few
2821          * cycles transferring data to the kernel.
2822          */
2823         zbuffer = (char *) MAXALIGN(zbuffer_raw);
2824         memset(zbuffer, 0, XLOG_BLCKSZ);
2825         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2826         {
2827                 errno = 0;
2828                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
2829                 {
2830                         int                     save_errno = errno;
2831
2832                         /*
2833                          * If we fail to make the file, delete it to release disk space
2834                          */
2835                         unlink(tmppath);
2836
2837                         close(fd);
2838
2839                         /* if write didn't set errno, assume problem is no disk space */
2840                         errno = save_errno ? save_errno : ENOSPC;
2841
2842                         ereport(ERROR,
2843                                         (errcode_for_file_access(),
2844                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2845                 }
2846         }
2847
2848         if (pg_fsync(fd) != 0)
2849         {
2850                 close(fd);
2851                 ereport(ERROR,
2852                                 (errcode_for_file_access(),
2853                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2854         }
2855
2856         if (close(fd))
2857                 ereport(ERROR,
2858                                 (errcode_for_file_access(),
2859                                  errmsg("could not close file \"%s\": %m", tmppath)));
2860
2861         /*
2862          * Now move the segment into place with its final name.
2863          *
2864          * If caller didn't want to use a pre-existing file, get rid of any
2865          * pre-existing file.  Otherwise, cope with possibility that someone else
2866          * has created the file while we were filling ours: if so, use ours to
2867          * pre-create a future log segment.
2868          */
2869         installed_segno = logsegno;
2870         max_advance = XLOGfileslop;
2871         if (!InstallXLogFileSegment(&installed_segno, tmppath,
2872                                                                 *use_existent, &max_advance,
2873                                                                 use_lock))
2874         {
2875                 /*
2876                  * No need for any more future segments, or InstallXLogFileSegment()
2877                  * failed to rename the file into place. If the rename failed, opening
2878                  * the file below will fail.
2879                  */
2880                 unlink(tmppath);
2881         }
2882
2883         /* Set flag to tell caller there was no existent file */
2884         *use_existent = false;
2885
2886         /* Now open original target segment (might not be file I just made) */
2887         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2888                                            S_IRUSR | S_IWUSR);
2889         if (fd < 0)
2890                 ereport(ERROR,
2891                                 (errcode_for_file_access(),
2892                                  errmsg("could not open file \"%s\": %m", path)));
2893
2894         elog(DEBUG2, "done creating and filling new WAL file");
2895
2896         return fd;
2897 }
2898
2899 /*
2900  * Create a new XLOG file segment by copying a pre-existing one.
2901  *
2902  * destsegno: identify segment to be created.
2903  *
2904  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
2905  *              a different timeline)
2906  *
2907  * Currently this is only used during recovery, and so there are no locking
2908  * considerations.  But we should be just as tense as XLogFileInit to avoid
2909  * emplacing a bogus file.
2910  */
2911 static void
2912 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
2913 {
2914         char            path[MAXPGPATH];
2915         char            tmppath[MAXPGPATH];
2916         char            buffer[XLOG_BLCKSZ];
2917         int                     srcfd;
2918         int                     fd;
2919         int                     nbytes;
2920
2921         /*
2922          * Open the source file
2923          */
2924         XLogFilePath(path, srcTLI, srcsegno);
2925         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
2926         if (srcfd < 0)
2927                 ereport(ERROR,
2928                                 (errcode_for_file_access(),
2929                                  errmsg("could not open file \"%s\": %m", path)));
2930
2931         /*
2932          * Copy into a temp file name.
2933          */
2934         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2935
2936         unlink(tmppath);
2937
2938         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2939         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2940                                                    S_IRUSR | S_IWUSR);
2941         if (fd < 0)
2942                 ereport(ERROR,
2943                                 (errcode_for_file_access(),
2944                                  errmsg("could not create file \"%s\": %m", tmppath)));
2945
2946         /*
2947          * Do the data copying.
2948          */
2949         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
2950         {
2951                 errno = 0;
2952                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2953                 {
2954                         if (errno != 0)
2955                                 ereport(ERROR,
2956                                                 (errcode_for_file_access(),
2957                                                  errmsg("could not read file \"%s\": %m", path)));
2958                         else
2959                                 ereport(ERROR,
2960                                                 (errmsg("not enough data in file \"%s\"", path)));
2961                 }
2962                 errno = 0;
2963                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2964                 {
2965                         int                     save_errno = errno;
2966
2967                         /*
2968                          * If we fail to make the file, delete it to release disk space
2969                          */
2970                         unlink(tmppath);
2971                         /* if write didn't set errno, assume problem is no disk space */
2972                         errno = save_errno ? save_errno : ENOSPC;
2973
2974                         ereport(ERROR,
2975                                         (errcode_for_file_access(),
2976                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2977                 }
2978         }
2979
2980         if (pg_fsync(fd) != 0)
2981                 ereport(ERROR,
2982                                 (errcode_for_file_access(),
2983                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2984
2985         if (CloseTransientFile(fd))
2986                 ereport(ERROR,
2987                                 (errcode_for_file_access(),
2988                                  errmsg("could not close file \"%s\": %m", tmppath)));
2989
2990         CloseTransientFile(srcfd);
2991
2992         /*
2993          * Now move the segment into place with its final name.
2994          */
2995         if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
2996                 elog(ERROR, "InstallXLogFileSegment should not have failed");
2997 }
2998
2999 /*
3000  * Install a new XLOG segment file as a current or future log segment.
3001  *
3002  * This is used both to install a newly-created segment (which has a temp
3003  * filename while it's being created) and to recycle an old segment.
3004  *
3005  * *segno: identify segment to install as (or first possible target).
3006  * When find_free is TRUE, this is modified on return to indicate the
3007  * actual installation location or last segment searched.
3008  *
3009  * tmppath: initial name of file to install.  It will be renamed into place.
3010  *
3011  * find_free: if TRUE, install the new segment at the first empty segno
3012  * number at or after the passed numbers.  If FALSE, install the new segment
3013  * exactly where specified, deleting any existing segment file there.
3014  *
3015  * *max_advance: maximum number of segno slots to advance past the starting
3016  * point.  Fail if no free slot is found in this range.  On return, reduced
3017  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
3018  * when find_free is FALSE.)
3019  *
3020  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3021  * place.  This should be TRUE except during bootstrap log creation.  The
3022  * caller must *not* hold the lock at call.
3023  *
3024  * Returns TRUE if the file was installed successfully.  FALSE indicates that
3025  * max_advance limit was exceeded, or an error occurred while renaming the
3026  * file into place.
3027  */
3028 static bool
3029 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3030                                            bool find_free, int *max_advance,
3031                                            bool use_lock)
3032 {
3033         char            path[MAXPGPATH];
3034         struct stat stat_buf;
3035
3036         XLogFilePath(path, ThisTimeLineID, *segno);
3037
3038         /*
3039          * We want to be sure that only one process does this at a time.
3040          */
3041         if (use_lock)
3042                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3043
3044         if (!find_free)
3045         {
3046                 /* Force installation: get rid of any pre-existing segment file */
3047                 unlink(path);
3048         }
3049         else
3050         {
3051                 /* Find a free slot to put it in */
3052                 while (stat(path, &stat_buf) == 0)
3053                 {
3054                         if (*max_advance <= 0)
3055                         {
3056                                 /* Failed to find a free slot within specified range */
3057                                 if (use_lock)
3058                                         LWLockRelease(ControlFileLock);
3059                                 return false;
3060                         }
3061                         (*segno)++;
3062                         (*max_advance)--;
3063                         XLogFilePath(path, ThisTimeLineID, *segno);
3064                 }
3065         }
3066
3067         /*
3068          * Prefer link() to rename() here just to be really sure that we don't
3069          * overwrite an existing logfile.  However, there shouldn't be one, so
3070          * rename() is an acceptable substitute except for the truly paranoid.
3071          */
3072 #if HAVE_WORKING_LINK
3073         if (link(tmppath, path) < 0)
3074         {
3075                 if (use_lock)
3076                         LWLockRelease(ControlFileLock);
3077                 ereport(LOG,
3078                                 (errcode_for_file_access(),
3079                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
3080                                                 tmppath, path)));
3081                 return false;
3082         }
3083         unlink(tmppath);
3084 #else
3085         if (rename(tmppath, path) < 0)
3086         {
3087                 if (use_lock)
3088                         LWLockRelease(ControlFileLock);
3089                 ereport(LOG,
3090                                 (errcode_for_file_access(),
3091                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
3092                                                 tmppath, path)));
3093                 return false;
3094         }
3095 #endif
3096
3097         if (use_lock)
3098                 LWLockRelease(ControlFileLock);
3099
3100         return true;
3101 }
3102
3103 /*
3104  * Open a pre-existing logfile segment for writing.
3105  */
3106 int
3107 XLogFileOpen(XLogSegNo segno)
3108 {
3109         char            path[MAXPGPATH];
3110         int                     fd;
3111
3112         XLogFilePath(path, ThisTimeLineID, segno);
3113
3114         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3115                                            S_IRUSR | S_IWUSR);
3116         if (fd < 0)
3117                 ereport(PANIC,
3118                                 (errcode_for_file_access(),
3119                         errmsg("could not open transaction log file \"%s\": %m", path)));
3120
3121         return fd;
3122 }
3123
3124 /*
3125  * Open a logfile segment for reading (during recovery).
3126  *
3127  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3128  * Otherwise, it's assumed to be already available in pg_xlog.
3129  */
3130 static int
3131 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3132                          int source, bool notfoundOk)
3133 {
3134         char            xlogfname[MAXFNAMELEN];
3135         char            activitymsg[MAXFNAMELEN + 16];
3136         char            path[MAXPGPATH];
3137         int                     fd;
3138
3139         XLogFileName(xlogfname, tli, segno);
3140
3141         switch (source)
3142         {
3143                 case XLOG_FROM_ARCHIVE:
3144                         /* Report recovery progress in PS display */
3145                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3146                                          xlogfname);
3147                         set_ps_display(activitymsg, false);
3148
3149                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3150                                                                                                           "RECOVERYXLOG",
3151                                                                                                           XLogSegSize,
3152                                                                                                           InRedo);
3153                         if (!restoredFromArchive)
3154                                 return -1;
3155                         break;
3156
3157                 case XLOG_FROM_PG_XLOG:
3158                 case XLOG_FROM_STREAM:
3159                         XLogFilePath(path, tli, segno);
3160                         restoredFromArchive = false;
3161                         break;
3162
3163                 default:
3164                         elog(ERROR, "invalid XLogFileRead source %d", source);
3165         }
3166
3167         /*
3168          * If the segment was fetched from archival storage, replace the existing
3169          * xlog segment (if any) with the archival version.
3170          */
3171         if (source == XLOG_FROM_ARCHIVE)
3172         {
3173                 KeepFileRestoredFromArchive(path, xlogfname);
3174
3175                 /*
3176                  * Set path to point at the new file in pg_xlog.
3177                  */
3178                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3179         }
3180
3181         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
3182         if (fd >= 0)
3183         {
3184                 /* Success! */
3185                 curFileTLI = tli;
3186
3187                 /* Report recovery progress in PS display */
3188                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3189                                  xlogfname);
3190                 set_ps_display(activitymsg, false);
3191
3192                 /* Track source of data in assorted state variables */
3193                 readSource = source;
3194                 XLogReceiptSource = source;
3195                 /* In FROM_STREAM case, caller tracks receipt time, not me */
3196                 if (source != XLOG_FROM_STREAM)
3197                         XLogReceiptTime = GetCurrentTimestamp();
3198
3199                 return fd;
3200         }
3201         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3202                 ereport(PANIC,
3203                                 (errcode_for_file_access(),
3204                                  errmsg("could not open file \"%s\": %m", path)));
3205         return -1;
3206 }
3207
3208 /*
3209  * Open a logfile segment for reading (during recovery).
3210  *
3211  * This version searches for the segment with any TLI listed in expectedTLEs.
3212  */
3213 static int
3214 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3215 {
3216         char            path[MAXPGPATH];
3217         ListCell   *cell;
3218         int                     fd;
3219         List       *tles;
3220
3221         /*
3222          * Loop looking for a suitable timeline ID: we might need to read any of
3223          * the timelines listed in expectedTLEs.
3224          *
3225          * We expect curFileTLI on entry to be the TLI of the preceding file in
3226          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
3227          * to go backwards; this prevents us from picking up the wrong file when a
3228          * parent timeline extends to higher segment numbers than the child we
3229          * want to read.
3230          *
3231          * If we haven't read the timeline history file yet, read it now, so that
3232          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
3233          * however, unless we actually find a valid segment.  That way if there is
3234          * neither a timeline history file nor a WAL segment in the archive, and
3235          * streaming replication is set up, we'll read the timeline history file
3236          * streamed from the master when we start streaming, instead of recovering
3237          * with a dummy history generated here.
3238          */
3239         if (expectedTLEs)
3240                 tles = expectedTLEs;
3241         else
3242                 tles = readTimeLineHistory(recoveryTargetTLI);
3243
3244         foreach(cell, tles)
3245         {
3246                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3247
3248                 if (tli < curFileTLI)
3249                         break;                          /* don't bother looking at too-old TLIs */
3250
3251                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3252                 {
3253                         fd = XLogFileRead(segno, emode, tli,
3254                                                           XLOG_FROM_ARCHIVE, true);
3255                         if (fd != -1)
3256                         {
3257                                 elog(DEBUG1, "got WAL segment from archive");
3258                                 if (!expectedTLEs)
3259                                         expectedTLEs = tles;
3260                                 return fd;
3261                         }
3262                 }
3263
3264                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
3265                 {
3266                         fd = XLogFileRead(segno, emode, tli,
3267                                                           XLOG_FROM_PG_XLOG, true);
3268                         if (fd != -1)
3269                         {
3270                                 if (!expectedTLEs)
3271                                         expectedTLEs = tles;
3272                                 return fd;
3273                         }
3274                 }
3275         }
3276
3277         /* Couldn't find it.  For simplicity, complain about front timeline */
3278         XLogFilePath(path, recoveryTargetTLI, segno);
3279         errno = ENOENT;
3280         ereport(emode,
3281                         (errcode_for_file_access(),
3282                          errmsg("could not open file \"%s\": %m", path)));
3283         return -1;
3284 }
3285
3286 /*
3287  * Close the current logfile segment for writing.
3288  */
3289 static void
3290 XLogFileClose(void)
3291 {
3292         Assert(openLogFile >= 0);
3293
3294         /*
3295          * WAL segment files will not be re-read in normal operation, so we advise
3296          * the OS to release any cached pages.  But do not do so if WAL archiving
3297          * or streaming is active, because archiver and walsender process could
3298          * use the cache to read the WAL segment.
3299          */
3300 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3301         if (!XLogIsNeeded())
3302                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3303 #endif
3304
3305         if (close(openLogFile))
3306                 ereport(PANIC,
3307                                 (errcode_for_file_access(),
3308                                  errmsg("could not close log file %s: %m",
3309                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3310         openLogFile = -1;
3311 }
3312
3313 /*
3314  * Preallocate log files beyond the specified log endpoint.
3315  *
3316  * XXX this is currently extremely conservative, since it forces only one
3317  * future log segment to exist, and even that only if we are 75% done with
3318  * the current one.  This is only appropriate for very low-WAL-volume systems.
3319  * High-volume systems will be OK once they've built up a sufficient set of
3320  * recycled log segments, but the startup transient is likely to include
3321  * a lot of segment creations by foreground processes, which is not so good.
3322  */
3323 static void
3324 PreallocXlogFiles(XLogRecPtr endptr)
3325 {
3326         XLogSegNo       _logSegNo;
3327         int                     lf;
3328         bool            use_existent;
3329
3330         XLByteToPrevSeg(endptr, _logSegNo);
3331         if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
3332         {
3333                 _logSegNo++;
3334                 use_existent = true;
3335                 lf = XLogFileInit(_logSegNo, &use_existent, true);
3336                 close(lf);
3337                 if (!use_existent)
3338                         CheckpointStats.ckpt_segs_added++;
3339         }
3340 }
3341
3342 /*
3343  * Throws an error if the given log segment has already been removed or
3344  * recycled. The caller should only pass a segment that it knows to have
3345  * existed while the server has been running, as this function always
3346  * succeeds if no WAL segments have been removed since startup.
3347  * 'tli' is only used in the error message.
3348  */
3349 void
3350 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3351 {
3352         XLogSegNo       lastRemovedSegNo;
3353
3354         SpinLockAcquire(&XLogCtl->info_lck);
3355         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3356         SpinLockRelease(&XLogCtl->info_lck);
3357
3358         if (segno <= lastRemovedSegNo)
3359         {
3360                 char            filename[MAXFNAMELEN];
3361
3362                 XLogFileName(filename, tli, segno);
3363                 ereport(ERROR,
3364                                 (errcode_for_file_access(),
3365                                  errmsg("requested WAL segment %s has already been removed",
3366                                                 filename)));
3367         }
3368 }
3369
3370 /*
3371  * Return the last WAL segment removed, or 0 if no segment has been removed
3372  * since startup.
3373  *
3374  * NB: the result can be out of date arbitrarily fast, the caller has to deal
3375  * with that.
3376  */
3377 XLogSegNo
3378 XLogGetLastRemovedSegno(void)
3379 {
3380         XLogSegNo       lastRemovedSegNo;
3381
3382         SpinLockAcquire(&XLogCtl->info_lck);
3383         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3384         SpinLockRelease(&XLogCtl->info_lck);
3385
3386         return lastRemovedSegNo;
3387 }
3388
3389 /*
3390  * Update the last removed segno pointer in shared memory, to reflect
3391  * that the given XLOG file has been removed.
3392  */
3393 static void
3394 UpdateLastRemovedPtr(char *filename)
3395 {
3396         uint32          tli;
3397         XLogSegNo       segno;
3398
3399         XLogFromFileName(filename, &tli, &segno);
3400
3401         SpinLockAcquire(&XLogCtl->info_lck);
3402         if (segno > XLogCtl->lastRemovedSegNo)
3403                 XLogCtl->lastRemovedSegNo = segno;
3404         SpinLockRelease(&XLogCtl->info_lck);
3405 }
3406
3407 /*
3408  * Recycle or remove all log files older or equal to passed segno
3409  *
3410  * endptr is current (or recent) end of xlog; this is used to determine
3411  * whether we want to recycle rather than delete no-longer-wanted log files.
3412  */
3413 static void
3414 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
3415 {
3416         XLogSegNo       endlogSegNo;
3417         int                     max_advance;
3418         DIR                *xldir;
3419         struct dirent *xlde;
3420         char            lastoff[MAXFNAMELEN];
3421         char            path[MAXPGPATH];
3422
3423 #ifdef WIN32
3424         char            newpath[MAXPGPATH];
3425 #endif
3426         struct stat statbuf;
3427
3428         /*
3429          * Initialize info about where to try to recycle to.  We allow recycling
3430          * segments up to XLOGfileslop segments beyond the current XLOG location.
3431          */
3432         XLByteToPrevSeg(endptr, endlogSegNo);
3433         max_advance = XLOGfileslop;
3434
3435         xldir = AllocateDir(XLOGDIR);
3436         if (xldir == NULL)
3437                 ereport(ERROR,
3438                                 (errcode_for_file_access(),
3439                                  errmsg("could not open transaction log directory \"%s\": %m",
3440                                                 XLOGDIR)));
3441
3442         /*
3443          * Construct a filename of the last segment to be kept. The timeline ID
3444          * doesn't matter, we ignore that in the comparison. (During recovery,
3445          * ThisTimeLineID isn't set, so we can't use that.)
3446          */
3447         XLogFileName(lastoff, 0, segno);
3448
3449         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3450                  lastoff);
3451
3452         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3453         {
3454                 /*
3455                  * We ignore the timeline part of the XLOG segment identifiers in
3456                  * deciding whether a segment is still needed.  This ensures that we
3457                  * won't prematurely remove a segment from a parent timeline. We could
3458                  * probably be a little more proactive about removing segments of
3459                  * non-parent timelines, but that would be a whole lot more
3460                  * complicated.
3461                  *
3462                  * We use the alphanumeric sorting property of the filenames to decide
3463                  * which ones are earlier than the lastoff segment.
3464                  */
3465                 if (strlen(xlde->d_name) == 24 &&
3466                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3467                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3468                 {
3469                         if (XLogArchiveCheckDone(xlde->d_name))
3470                         {
3471                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3472
3473                                 /* Update the last removed location in shared memory first */
3474                                 UpdateLastRemovedPtr(xlde->d_name);
3475
3476                                 /*
3477                                  * Before deleting the file, see if it can be recycled as a
3478                                  * future log segment. Only recycle normal files, pg_standby
3479                                  * for example can create symbolic links pointing to a
3480                                  * separate archive directory.
3481                                  */
3482                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
3483                                         InstallXLogFileSegment(&endlogSegNo, path,
3484                                                                                    true, &max_advance, true))
3485                                 {
3486                                         ereport(DEBUG2,
3487                                                         (errmsg("recycled transaction log file \"%s\"",
3488                                                                         xlde->d_name)));
3489                                         CheckpointStats.ckpt_segs_recycled++;
3490                                         /* Needn't recheck that slot on future iterations */
3491                                         if (max_advance > 0)
3492                                         {
3493                                                 endlogSegNo++;
3494                                                 max_advance--;
3495                                         }
3496                                 }
3497                                 else
3498                                 {
3499                                         /* No need for any more future segments... */
3500                                         int                     rc;
3501
3502                                         ereport(DEBUG2,
3503                                                         (errmsg("removing transaction log file \"%s\"",
3504                                                                         xlde->d_name)));
3505
3506 #ifdef WIN32
3507
3508                                         /*
3509                                          * On Windows, if another process (e.g another backend)
3510                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
3511                                          * will succeed, but the file will still show up in
3512                                          * directory listing until the last handle is closed. To
3513                                          * avoid confusing the lingering deleted file for a live
3514                                          * WAL file that needs to be archived, rename it before
3515                                          * deleting it.
3516                                          *
3517                                          * If another process holds the file open without
3518                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
3519                                          * again at the next checkpoint.
3520                                          */
3521                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
3522                                         if (rename(path, newpath) != 0)
3523                                         {
3524                                                 ereport(LOG,
3525                                                                 (errcode_for_file_access(),
3526                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
3527                                                                                 path)));
3528                                                 continue;
3529                                         }
3530                                         rc = unlink(newpath);
3531 #else
3532                                         rc = unlink(path);
3533 #endif
3534                                         if (rc != 0)
3535                                         {
3536                                                 ereport(LOG,
3537                                                                 (errcode_for_file_access(),
3538                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
3539                                                                                 path)));
3540                                                 continue;
3541                                         }
3542                                         CheckpointStats.ckpt_segs_removed++;
3543                                 }
3544
3545                                 XLogArchiveCleanup(xlde->d_name);
3546                         }
3547                 }
3548         }
3549
3550         FreeDir(xldir);
3551 }
3552
3553 /*
3554  * Verify whether pg_xlog and pg_xlog/archive_status exist.
3555  * If the latter does not exist, recreate it.
3556  *
3557  * It is not the goal of this function to verify the contents of these
3558  * directories, but to help in cases where someone has performed a cluster
3559  * copy for PITR purposes but omitted pg_xlog from the copy.
3560  *
3561  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
3562  * policy decision was made not to.  It is fairly common for pg_xlog to be
3563  * a symlink, and if that was the DBA's intent then automatically making a
3564  * plain directory would result in degraded performance with no notice.
3565  */
3566 static void
3567 ValidateXLOGDirectoryStructure(void)
3568 {
3569         char            path[MAXPGPATH];
3570         struct stat stat_buf;
3571
3572         /* Check for pg_xlog; if it doesn't exist, error out */
3573         if (stat(XLOGDIR, &stat_buf) != 0 ||
3574                 !S_ISDIR(stat_buf.st_mode))
3575                 ereport(FATAL,
3576                                 (errmsg("required WAL directory \"%s\" does not exist",
3577                                                 XLOGDIR)));
3578
3579         /* Check for archive_status */
3580         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
3581         if (stat(path, &stat_buf) == 0)
3582         {
3583                 /* Check for weird cases where it exists but isn't a directory */
3584                 if (!S_ISDIR(stat_buf.st_mode))
3585                         ereport(FATAL,
3586                                         (errmsg("required WAL directory \"%s\" does not exist",
3587                                                         path)));
3588         }
3589         else
3590         {
3591                 ereport(LOG,
3592                                 (errmsg("creating missing WAL directory \"%s\"", path)));
3593                 if (mkdir(path, S_IRWXU) < 0)
3594                         ereport(FATAL,
3595                                         (errmsg("could not create missing directory \"%s\": %m",
3596                                                         path)));
3597         }
3598 }
3599
3600 /*
3601  * Remove previous backup history files.  This also retries creation of
3602  * .ready files for any backup history files for which XLogArchiveNotify
3603  * failed earlier.
3604  */
3605 static void
3606 CleanupBackupHistory(void)
3607 {
3608         DIR                *xldir;
3609         struct dirent *xlde;
3610         char            path[MAXPGPATH];
3611
3612         xldir = AllocateDir(XLOGDIR);
3613         if (xldir == NULL)
3614                 ereport(ERROR,
3615                                 (errcode_for_file_access(),
3616                                  errmsg("could not open transaction log directory \"%s\": %m",
3617                                                 XLOGDIR)));
3618
3619         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3620         {
3621                 if (strlen(xlde->d_name) > 24 &&
3622                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3623                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
3624                                    ".backup") == 0)
3625                 {
3626                         if (XLogArchiveCheckDone(xlde->d_name))
3627                         {
3628                                 ereport(DEBUG2,
3629                                 (errmsg("removing transaction log backup history file \"%s\"",
3630                                                 xlde->d_name)));
3631                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3632                                 unlink(path);
3633                                 XLogArchiveCleanup(xlde->d_name);
3634                         }
3635                 }
3636         }
3637
3638         FreeDir(xldir);
3639 }
3640
3641 /*
3642  * Attempt to read an XLOG record.
3643  *
3644  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
3645  * try to read a record just after the last one previously read.
3646  *
3647  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3648  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3649  * record is available.
3650  *
3651  * The record is copied into readRecordBuf, so that on successful return,
3652  * the returned record pointer always points there.
3653  */
3654 static XLogRecord *
3655 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
3656                    bool fetching_ckpt)
3657 {
3658         XLogRecord *record;
3659         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3660
3661         /* Pass through parameters to XLogPageRead */
3662         private->fetching_ckpt = fetching_ckpt;
3663         private->emode = emode;
3664         private->randAccess = (RecPtr != InvalidXLogRecPtr);
3665
3666         /* This is the first attempt to read this page. */
3667         lastSourceFailed = false;
3668
3669         for (;;)
3670         {
3671                 char       *errormsg;
3672
3673                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
3674                 ReadRecPtr = xlogreader->ReadRecPtr;
3675                 EndRecPtr = xlogreader->EndRecPtr;
3676                 if (record == NULL)
3677                 {
3678                         if (readFile >= 0)
3679                         {
3680                                 close(readFile);
3681                                 readFile = -1;
3682                         }
3683
3684                         /*
3685                          * We only end up here without a message when XLogPageRead()
3686                          * failed - in that case we already logged something. In
3687                          * StandbyMode that only happens if we have been triggered, so we
3688                          * shouldn't loop anymore in that case.
3689                          */
3690                         if (errormsg)
3691                                 ereport(emode_for_corrupt_record(emode,
3692                                                                                                  RecPtr ? RecPtr : EndRecPtr),
3693                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
3694                 }
3695
3696                 /*
3697                  * Check page TLI is one of the expected values.
3698                  */
3699                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3700                 {
3701                         char            fname[MAXFNAMELEN];
3702                         XLogSegNo       segno;
3703                         int32           offset;
3704
3705                         XLByteToSeg(xlogreader->latestPagePtr, segno);
3706                         offset = xlogreader->latestPagePtr % XLogSegSize;
3707                         XLogFileName(fname, xlogreader->readPageTLI, segno);
3708                         ereport(emode_for_corrupt_record(emode,
3709                                                                                          RecPtr ? RecPtr : EndRecPtr),
3710                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
3711                                         xlogreader->latestPageTLI,
3712                                         fname,
3713                                         offset)));
3714                         record = NULL;
3715                 }
3716
3717                 if (record)
3718                 {
3719                         /* Great, got a record */
3720                         return record;
3721                 }
3722                 else
3723                 {
3724                         /* No valid record available from this source */
3725                         lastSourceFailed = true;
3726
3727                         /*
3728                          * If archive recovery was requested, but we were still doing
3729                          * crash recovery, switch to archive recovery and retry using the
3730                          * offline archive. We have now replayed all the valid WAL in
3731                          * pg_xlog, so we are presumably now consistent.
3732                          *
3733                          * We require that there's at least some valid WAL present in
3734                          * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
3735                          * from the archive, even if pg_xlog is completely empty, but we'd
3736                          * have no idea how far we'd have to replay to reach consistency.
3737                          * So err on the safe side and give up.
3738                          */
3739                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3740                                 !fetching_ckpt)
3741                         {
3742                                 ereport(DEBUG1,
3743                                                 (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
3744                                 InArchiveRecovery = true;
3745                                 if (StandbyModeRequested)
3746                                         StandbyMode = true;
3747
3748                                 /* initialize minRecoveryPoint to this record */
3749                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3750                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
3751                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
3752                                 {
3753                                         ControlFile->minRecoveryPoint = EndRecPtr;
3754                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
3755                                 }
3756                                 /* update local copy */
3757                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3758                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3759
3760                                 UpdateControlFile();
3761                                 LWLockRelease(ControlFileLock);
3762
3763                                 CheckRecoveryConsistency();
3764
3765                                 /*
3766                                  * Before we retry, reset lastSourceFailed and currentSource
3767                                  * so that we will check the archive next.
3768                                  */
3769                                 lastSourceFailed = false;
3770                                 currentSource = 0;
3771
3772                                 continue;
3773                         }
3774
3775                         /* In standby mode, loop back to retry. Otherwise, give up. */
3776                         if (StandbyMode && !CheckForStandbyTrigger())
3777                                 continue;
3778                         else
3779                                 return NULL;
3780                 }
3781         }
3782 }
3783
3784 /*
3785  * Scan for new timelines that might have appeared in the archive since we
3786  * started recovery.
3787  *
3788  * If there are any, the function changes recovery target TLI to the latest
3789  * one and returns 'true'.
3790  */
3791 static bool
3792 rescanLatestTimeLine(void)
3793 {
3794         List       *newExpectedTLEs;
3795         bool            found;
3796         ListCell   *cell;
3797         TimeLineID      newtarget;
3798         TimeLineID      oldtarget = recoveryTargetTLI;
3799         TimeLineHistoryEntry *currentTle = NULL;
3800
3801         newtarget = findNewestTimeLine(recoveryTargetTLI);
3802         if (newtarget == recoveryTargetTLI)
3803         {
3804                 /* No new timelines found */
3805                 return false;
3806         }
3807
3808         /*
3809          * Determine the list of expected TLIs for the new TLI
3810          */
3811
3812         newExpectedTLEs = readTimeLineHistory(newtarget);
3813
3814         /*
3815          * If the current timeline is not part of the history of the new timeline,
3816          * we cannot proceed to it.
3817          */
3818         found = false;
3819         foreach(cell, newExpectedTLEs)
3820         {
3821                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
3822
3823                 if (currentTle->tli == recoveryTargetTLI)
3824                 {
3825                         found = true;
3826                         break;
3827                 }
3828         }
3829         if (!found)
3830         {
3831                 ereport(LOG,
3832                                 (errmsg("new timeline %u is not a child of database system timeline %u",
3833                                                 newtarget,
3834                                                 ThisTimeLineID)));
3835                 return false;
3836         }
3837
3838         /*
3839          * The current timeline was found in the history file, but check that the
3840          * next timeline was forked off from it *after* the current recovery
3841          * location.
3842          */
3843         if (currentTle->end < EndRecPtr)
3844         {
3845                 ereport(LOG,
3846                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
3847                                                 newtarget,
3848                                                 ThisTimeLineID,
3849                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
3850                 return false;
3851         }
3852
3853         /* The new timeline history seems valid. Switch target */
3854         recoveryTargetTLI = newtarget;
3855         list_free_deep(expectedTLEs);
3856         expectedTLEs = newExpectedTLEs;
3857
3858         /*
3859          * As in StartupXLOG(), try to ensure we have all the history files
3860          * between the old target and new target in pg_xlog.
3861          */
3862         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
3863
3864         ereport(LOG,
3865                         (errmsg("new target timeline is %u",
3866                                         recoveryTargetTLI)));
3867
3868         return true;
3869 }
3870
3871 /*
3872  * I/O routines for pg_control
3873  *
3874  * *ControlFile is a buffer in shared memory that holds an image of the
3875  * contents of pg_control.  WriteControlFile() initializes pg_control
3876  * given a preloaded buffer, ReadControlFile() loads the buffer from
3877  * the pg_control file (during postmaster or standalone-backend startup),
3878  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
3879  *
3880  * For simplicity, WriteControlFile() initializes the fields of pg_control
3881  * that are related to checking backend/database compatibility, and
3882  * ReadControlFile() verifies they are correct.  We could split out the
3883  * I/O and compatibility-check functions, but there seems no need currently.
3884  */
3885 static void
3886 WriteControlFile(void)
3887 {
3888         int                     fd;
3889         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
3890
3891         /*
3892          * Initialize version and compatibility-check fields
3893          */
3894         ControlFile->pg_control_version = PG_CONTROL_VERSION;
3895         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
3896
3897         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
3898         ControlFile->floatFormat = FLOATFORMAT_VALUE;
3899
3900         ControlFile->blcksz = BLCKSZ;
3901         ControlFile->relseg_size = RELSEG_SIZE;
3902         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
3903         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
3904
3905         ControlFile->nameDataLen = NAMEDATALEN;
3906         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
3907
3908         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
3909         ControlFile->loblksize = LOBLKSIZE;
3910
3911 #ifdef HAVE_INT64_TIMESTAMP
3912         ControlFile->enableIntTimes = true;
3913 #else
3914         ControlFile->enableIntTimes = false;
3915 #endif
3916         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
3917         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
3918
3919         /* Contents are protected with a CRC */
3920         INIT_CRC32C(ControlFile->crc);
3921         COMP_CRC32C(ControlFile->crc,
3922                                 (char *) ControlFile,
3923                                 offsetof(ControlFileData, crc));
3924         FIN_CRC32C(ControlFile->crc);
3925
3926         /*
3927          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
3928          * excess over sizeof(ControlFileData).  This reduces the odds of
3929          * premature-EOF errors when reading pg_control.  We'll still fail when we
3930          * check the contents of the file, but hopefully with a more specific
3931          * error than "couldn't read pg_control".
3932          */
3933         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
3934                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
3935
3936         memset(buffer, 0, PG_CONTROL_SIZE);
3937         memcpy(buffer, ControlFile, sizeof(ControlFileData));
3938
3939         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3940                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3941                                            S_IRUSR | S_IWUSR);
3942         if (fd < 0)
3943                 ereport(PANIC,
3944                                 (errcode_for_file_access(),
3945                                  errmsg("could not create control file \"%s\": %m",
3946                                                 XLOG_CONTROL_FILE)));
3947
3948         errno = 0;
3949         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
3950         {
3951                 /* if write didn't set errno, assume problem is no disk space */
3952                 if (errno == 0)
3953                         errno = ENOSPC;
3954                 ereport(PANIC,
3955                                 (errcode_for_file_access(),
3956                                  errmsg("could not write to control file: %m")));
3957         }
3958
3959         if (pg_fsync(fd) != 0)
3960                 ereport(PANIC,
3961                                 (errcode_for_file_access(),
3962                                  errmsg("could not fsync control file: %m")));
3963
3964         if (close(fd))
3965                 ereport(PANIC,
3966                                 (errcode_for_file_access(),
3967                                  errmsg("could not close control file: %m")));
3968 }
3969
3970 static void
3971 ReadControlFile(void)
3972 {
3973         pg_crc32        crc;
3974         int                     fd;
3975
3976         /*
3977          * Read data...
3978          */
3979         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3980                                            O_RDWR | PG_BINARY,
3981                                            S_IRUSR | S_IWUSR);
3982         if (fd < 0)
3983                 ereport(PANIC,
3984                                 (errcode_for_file_access(),
3985                                  errmsg("could not open control file \"%s\": %m",
3986                                                 XLOG_CONTROL_FILE)));
3987
3988         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
3989                 ereport(PANIC,
3990                                 (errcode_for_file_access(),
3991                                  errmsg("could not read from control file: %m")));
3992
3993         close(fd);
3994
3995         /*
3996          * Check for expected pg_control format version.  If this is wrong, the
3997          * CRC check will likely fail because we'll be checking the wrong number
3998          * of bytes.  Complaining about wrong version will probably be more
3999          * enlightening than complaining about wrong CRC.
4000          */
4001
4002         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4003                 ereport(FATAL,
4004                                 (errmsg("database files are incompatible with server"),
4005                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4006                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4007                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4008                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4009                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4010
4011         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4012                 ereport(FATAL,
4013                                 (errmsg("database files are incompatible with server"),
4014                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4015                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4016                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4017                                  errhint("It looks like you need to initdb.")));
4018
4019         /* Now check the CRC. */
4020         INIT_CRC32C(crc);
4021         COMP_CRC32C(crc,
4022                                 (char *) ControlFile,
4023                                 offsetof(ControlFileData, crc));
4024         FIN_CRC32C(crc);
4025
4026         if (!EQ_CRC32C(crc, ControlFile->crc))
4027                 ereport(FATAL,
4028                                 (errmsg("incorrect checksum in control file")));
4029
4030         /*
4031          * Do compatibility checking immediately.  If the database isn't
4032          * compatible with the backend executable, we want to abort before we can
4033          * possibly do any damage.
4034          */
4035         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4036                 ereport(FATAL,
4037                                 (errmsg("database files are incompatible with server"),
4038                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4039                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4040                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4041                                  errhint("It looks like you need to initdb.")));
4042         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4043                 ereport(FATAL,
4044                                 (errmsg("database files are incompatible with server"),
4045                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4046                                          " but the server was compiled with MAXALIGN %d.",
4047                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4048                                  errhint("It looks like you need to initdb.")));
4049         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4050                 ereport(FATAL,
4051                                 (errmsg("database files are incompatible with server"),
4052                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4053                                  errhint("It looks like you need to initdb.")));
4054         if (ControlFile->blcksz != BLCKSZ)
4055                 ereport(FATAL,
4056                                 (errmsg("database files are incompatible with server"),
4057                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4058                                            " but the server was compiled with BLCKSZ %d.",
4059                                            ControlFile->blcksz, BLCKSZ),
4060                                  errhint("It looks like you need to recompile or initdb.")));
4061         if (ControlFile->relseg_size != RELSEG_SIZE)
4062                 ereport(FATAL,
4063                                 (errmsg("database files are incompatible with server"),
4064                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4065                                   " but the server was compiled with RELSEG_SIZE %d.",
4066                                   ControlFile->relseg_size, RELSEG_SIZE),
4067                                  errhint("It looks like you need to recompile or initdb.")));
4068         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4069                 ereport(FATAL,
4070                                 (errmsg("database files are incompatible with server"),
4071                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4072                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4073                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4074                                  errhint("It looks like you need to recompile or initdb.")));
4075         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4076                 ereport(FATAL,
4077                                 (errmsg("database files are incompatible with server"),
4078                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4079                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4080                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4081                                  errhint("It looks like you need to recompile or initdb.")));
4082         if (ControlFile->nameDataLen != NAMEDATALEN)
4083                 ereport(FATAL,
4084                                 (errmsg("database files are incompatible with server"),
4085                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4086                                   " but the server was compiled with NAMEDATALEN %d.",
4087                                   ControlFile->nameDataLen, NAMEDATALEN),
4088                                  errhint("It looks like you need to recompile or initdb.")));
4089         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4090                 ereport(FATAL,
4091                                 (errmsg("database files are incompatible with server"),
4092                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4093                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4094                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4095                                  errhint("It looks like you need to recompile or initdb.")));
4096         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4097                 ereport(FATAL,
4098                                 (errmsg("database files are incompatible with server"),
4099                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4100                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4101                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4102                                  errhint("It looks like you need to recompile or initdb.")));
4103         if (ControlFile->loblksize != LOBLKSIZE)
4104                 ereport(FATAL,
4105                                 (errmsg("database files are incompatible with server"),
4106                   errdetail("The database cluster was initialized with LOBLKSIZE %d,"
4107                                         " but the server was compiled with LOBLKSIZE %d.",
4108                                         ControlFile->loblksize, (int) LOBLKSIZE),
4109                                  errhint("It looks like you need to recompile or initdb.")));
4110
4111 #ifdef HAVE_INT64_TIMESTAMP
4112         if (ControlFile->enableIntTimes != true)
4113                 ereport(FATAL,
4114                                 (errmsg("database files are incompatible with server"),
4115                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4116                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4117                                  errhint("It looks like you need to recompile or initdb.")));
4118 #else
4119         if (ControlFile->enableIntTimes != false)
4120                 ereport(FATAL,
4121                                 (errmsg("database files are incompatible with server"),
4122                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4123                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4124                                  errhint("It looks like you need to recompile or initdb.")));
4125 #endif
4126
4127 #ifdef USE_FLOAT4_BYVAL
4128         if (ControlFile->float4ByVal != true)
4129                 ereport(FATAL,
4130                                 (errmsg("database files are incompatible with server"),
4131                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4132                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4133                                  errhint("It looks like you need to recompile or initdb.")));
4134 #else
4135         if (ControlFile->float4ByVal != false)
4136                 ereport(FATAL,
4137                                 (errmsg("database files are incompatible with server"),
4138                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4139                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4140                                  errhint("It looks like you need to recompile or initdb.")));
4141 #endif
4142
4143 #ifdef USE_FLOAT8_BYVAL
4144         if (ControlFile->float8ByVal != true)
4145                 ereport(FATAL,
4146                                 (errmsg("database files are incompatible with server"),
4147                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4148                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4149                                  errhint("It looks like you need to recompile or initdb.")));
4150 #else
4151         if (ControlFile->float8ByVal != false)
4152                 ereport(FATAL,
4153                                 (errmsg("database files are incompatible with server"),
4154                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4155                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4156                                  errhint("It looks like you need to recompile or initdb.")));
4157 #endif
4158
4159         /* Make the initdb settings visible as GUC variables, too */
4160         SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4161                                         PGC_INTERNAL, PGC_S_OVERRIDE);
4162 }
4163
4164 void
4165 UpdateControlFile(void)
4166 {
4167         int                     fd;
4168
4169         INIT_CRC32C(ControlFile->crc);
4170         COMP_CRC32C(ControlFile->crc,
4171                                 (char *) ControlFile,
4172                                 offsetof(ControlFileData, crc));
4173         FIN_CRC32C(ControlFile->crc);
4174
4175         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4176                                            O_RDWR | PG_BINARY,
4177                                            S_IRUSR | S_IWUSR);
4178         if (fd < 0)
4179                 ereport(PANIC,
4180                                 (errcode_for_file_access(),
4181                                  errmsg("could not open control file \"%s\": %m",
4182                                                 XLOG_CONTROL_FILE)));
4183
4184         errno = 0;
4185         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4186         {
4187                 /* if write didn't set errno, assume problem is no disk space */
4188                 if (errno == 0)
4189                         errno = ENOSPC;
4190                 ereport(PANIC,
4191                                 (errcode_for_file_access(),
4192                                  errmsg("could not write to control file: %m")));
4193         }
4194
4195         if (pg_fsync(fd) != 0)
4196                 ereport(PANIC,
4197                                 (errcode_for_file_access(),
4198                                  errmsg("could not fsync control file: %m")));
4199
4200         if (close(fd))
4201                 ereport(PANIC,
4202                                 (errcode_for_file_access(),
4203                                  errmsg("could not close control file: %m")));
4204 }
4205
4206 /*
4207  * Returns the unique system identifier from control file.
4208  */
4209 uint64
4210 GetSystemIdentifier(void)
4211 {
4212         Assert(ControlFile != NULL);
4213         return ControlFile->system_identifier;
4214 }
4215
4216 /*
4217  * Are checksums enabled for data pages?
4218  */
4219 bool
4220 DataChecksumsEnabled(void)
4221 {
4222         Assert(ControlFile != NULL);
4223         return (ControlFile->data_checksum_version > 0);
4224 }
4225
4226 /*
4227  * Returns a fake LSN for unlogged relations.
4228  *
4229  * Each call generates an LSN that is greater than any previous value
4230  * returned. The current counter value is saved and restored across clean
4231  * shutdowns, but like unlogged relations, does not survive a crash. This can
4232  * be used in lieu of real LSN values returned by XLogInsert, if you need an
4233  * LSN-like increasing sequence of numbers without writing any WAL.
4234  */
4235 XLogRecPtr
4236 GetFakeLSNForUnloggedRel(void)
4237 {
4238         XLogRecPtr      nextUnloggedLSN;
4239
4240         /* increment the unloggedLSN counter, need SpinLock */
4241         SpinLockAcquire(&XLogCtl->ulsn_lck);
4242         nextUnloggedLSN = XLogCtl->unloggedLSN++;
4243         SpinLockRelease(&XLogCtl->ulsn_lck);
4244
4245         return nextUnloggedLSN;
4246 }
4247
4248 /*
4249  * Auto-tune the number of XLOG buffers.
4250  *
4251  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4252  * a maximum of one XLOG segment (there is little reason to think that more
4253  * is helpful, at least so long as we force an fsync when switching log files)
4254  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4255  * 9.1, when auto-tuning was added).
4256  *
4257  * This should not be called until NBuffers has received its final value.
4258  */
4259 static int
4260 XLOGChooseNumBuffers(void)
4261 {
4262         int                     xbuffers;
4263
4264         xbuffers = NBuffers / 32;
4265         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
4266                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
4267         if (xbuffers < 8)
4268                 xbuffers = 8;
4269         return xbuffers;
4270 }
4271
4272 /*
4273  * GUC check_hook for wal_buffers
4274  */
4275 bool
4276 check_wal_buffers(int *newval, void **extra, GucSource source)
4277 {
4278         /*
4279          * -1 indicates a request for auto-tune.
4280          */
4281         if (*newval == -1)
4282         {
4283                 /*
4284                  * If we haven't yet changed the boot_val default of -1, just let it
4285                  * be.  We'll fix it when XLOGShmemSize is called.
4286                  */
4287                 if (XLOGbuffers == -1)
4288                         return true;
4289
4290                 /* Otherwise, substitute the auto-tune value */
4291                 *newval = XLOGChooseNumBuffers();
4292         }
4293
4294         /*
4295          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4296          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4297          * the case, we just silently treat such values as a request for the
4298          * minimum.  (We could throw an error instead, but that doesn't seem very
4299          * helpful.)
4300          */
4301         if (*newval < 4)
4302                 *newval = 4;
4303
4304         return true;
4305 }
4306
4307 /*
4308  * Initialization of shared memory for XLOG
4309  */
4310 Size
4311 XLOGShmemSize(void)
4312 {
4313         Size            size;
4314
4315         /*
4316          * If the value of wal_buffers is -1, use the preferred auto-tune value.
4317          * This isn't an amazingly clean place to do this, but we must wait till
4318          * NBuffers has received its final value, and must do it before using the
4319          * value of XLOGbuffers to do anything important.
4320          */
4321         if (XLOGbuffers == -1)
4322         {
4323                 char            buf[32];
4324
4325                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
4326                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
4327         }
4328         Assert(XLOGbuffers > 0);
4329
4330         /* XLogCtl */
4331         size = sizeof(XLogCtlData);
4332
4333         /* WAL insertion locks, plus alignment */
4334         size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
4335         /* xlblocks array */
4336         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4337         /* extra alignment padding for XLOG I/O buffers */
4338         size = add_size(size, XLOG_BLCKSZ);
4339         /* and the buffers themselves */
4340         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4341
4342         /*
4343          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4344          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4345          * routine again below to compute the actual allocation size.
4346          */
4347
4348         return size;
4349 }
4350
4351 void
4352 XLOGShmemInit(void)
4353 {
4354         bool            foundCFile,
4355                                 foundXLog;
4356         char       *allocptr;
4357         int                     i;
4358
4359 #ifdef WAL_DEBUG
4360         /*
4361          * Create a memory context for WAL debugging that's exempt from the
4362          * normal "no pallocs in critical section" rule. Yes, that can lead to a
4363          * PANIC if an allocation fails, but wal_debug is not for production use
4364          * anyway.
4365          */
4366         if (walDebugCxt == NULL)
4367         {
4368                 walDebugCxt = AllocSetContextCreate(TopMemoryContext,
4369                                                                                         "WAL Debug",
4370                                                                                         ALLOCSET_DEFAULT_MINSIZE,
4371                                                                                         ALLOCSET_DEFAULT_INITSIZE,
4372                                                                                         ALLOCSET_DEFAULT_MAXSIZE);
4373                 MemoryContextAllowInCriticalSection(walDebugCxt, true);
4374         }
4375 #endif
4376
4377         ControlFile = (ControlFileData *)
4378                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4379         XLogCtl = (XLogCtlData *)
4380                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4381
4382         if (foundCFile || foundXLog)
4383         {
4384                 /* both should be present or neither */
4385                 Assert(foundCFile && foundXLog);
4386
4387                 /* Initialize local copy of WALInsertLocks and register the tranche */
4388                 WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
4389                 LWLockRegisterTranche(XLogCtl->Insert.WALInsertLockTrancheId,
4390                                                           &XLogCtl->Insert.WALInsertLockTranche);
4391                 return;
4392         }
4393         memset(XLogCtl, 0, sizeof(XLogCtlData));
4394
4395         /*
4396          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4397          * multiple of the alignment for same, so no extra alignment padding is
4398          * needed here.
4399          */
4400         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
4401         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
4402         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4403         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
4404
4405
4406         /* WAL insertion locks. Ensure they're aligned to the full padded size */
4407         allocptr += sizeof(WALInsertLockPadded) -
4408                 ((uintptr_t) allocptr) %sizeof(WALInsertLockPadded);
4409         WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
4410                 (WALInsertLockPadded *) allocptr;
4411         allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
4412
4413         XLogCtl->Insert.WALInsertLockTrancheId = LWLockNewTrancheId();
4414
4415         XLogCtl->Insert.WALInsertLockTranche.name = "WALInsertLocks";
4416         XLogCtl->Insert.WALInsertLockTranche.array_base = WALInsertLocks;
4417         XLogCtl->Insert.WALInsertLockTranche.array_stride = sizeof(WALInsertLockPadded);
4418
4419         LWLockRegisterTranche(XLogCtl->Insert.WALInsertLockTrancheId, &XLogCtl->Insert.WALInsertLockTranche);
4420         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
4421         {
4422                 LWLockInitialize(&WALInsertLocks[i].l.lock,
4423                                                  XLogCtl->Insert.WALInsertLockTrancheId);
4424                 WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
4425         }
4426
4427         /*
4428          * Align the start of the page buffers to a full xlog block size boundary.
4429          * This simplifies some calculations in XLOG insertion. It is also
4430          * required for O_DIRECT.
4431          */
4432         allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
4433         XLogCtl->pages = allocptr;
4434         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
4435
4436         /*
4437          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
4438          * in additional info.)
4439          */
4440         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4441         XLogCtl->SharedRecoveryInProgress = true;
4442         XLogCtl->SharedHotStandbyActive = false;
4443         XLogCtl->WalWriterSleeping = false;
4444
4445         SpinLockInit(&XLogCtl->Insert.insertpos_lck);
4446         SpinLockInit(&XLogCtl->info_lck);
4447         SpinLockInit(&XLogCtl->ulsn_lck);
4448         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
4449
4450         /*
4451          * If we are not in bootstrap mode, pg_control should already exist. Read
4452          * and validate it immediately (see comments in ReadControlFile() for the
4453          * reasons why).
4454          */
4455         if (!IsBootstrapProcessingMode())
4456                 ReadControlFile();
4457 }
4458
4459 /*
4460  * This func must be called ONCE on system install.  It creates pg_control
4461  * and the initial XLOG segment.
4462  */
4463 void
4464 BootStrapXLOG(void)
4465 {
4466         CheckPoint      checkPoint;
4467         char       *buffer;
4468         XLogPageHeader page;
4469         XLogLongPageHeader longpage;
4470         XLogRecord *record;
4471         char       *recptr;
4472         bool            use_existent;
4473         uint64          sysidentifier;
4474         struct timeval tv;
4475         pg_crc32        crc;
4476
4477         /*
4478          * Select a hopefully-unique system identifier code for this installation.
4479          * We use the result of gettimeofday(), including the fractional seconds
4480          * field, as being about as unique as we can easily get.  (Think not to
4481          * use random(), since it hasn't been seeded and there's no portable way
4482          * to seed it other than the system clock value...)  The upper half of the
4483          * uint64 value is just the tv_sec part, while the lower half contains the
4484          * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
4485          * PID for a little extra uniqueness.  A person knowing this encoding can
4486          * determine the initialization time of the installation, which could
4487          * perhaps be useful sometimes.
4488          */
4489         gettimeofday(&tv, NULL);
4490         sysidentifier = ((uint64) tv.tv_sec) << 32;
4491         sysidentifier |= ((uint64) tv.tv_usec) << 12;
4492         sysidentifier |= getpid() & 0xFFF;
4493
4494         /* First timeline ID is always 1 */
4495         ThisTimeLineID = 1;
4496
4497         /* page buffer must be aligned suitably for O_DIRECT */
4498         buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
4499         page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
4500         memset(page, 0, XLOG_BLCKSZ);
4501
4502         /*
4503          * Set up information for the initial checkpoint record
4504          *
4505          * The initial checkpoint record is written to the beginning of the WAL
4506          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
4507          * used, so that we can use 0/0 to mean "before any valid WAL segment".
4508          */
4509         checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
4510         checkPoint.ThisTimeLineID = ThisTimeLineID;
4511         checkPoint.PrevTimeLineID = ThisTimeLineID;
4512         checkPoint.fullPageWrites = fullPageWrites;
4513         checkPoint.nextXidEpoch = 0;
4514         checkPoint.nextXid = FirstNormalTransactionId;
4515         checkPoint.nextOid = FirstBootstrapObjectId;
4516         checkPoint.nextMulti = FirstMultiXactId;
4517         checkPoint.nextMultiOffset = 0;
4518         checkPoint.oldestXid = FirstNormalTransactionId;
4519         checkPoint.oldestXidDB = TemplateDbOid;
4520         checkPoint.oldestMulti = FirstMultiXactId;
4521         checkPoint.oldestMultiDB = TemplateDbOid;
4522         checkPoint.oldestCommitTs = InvalidTransactionId;
4523         checkPoint.newestCommitTs = InvalidTransactionId;
4524         checkPoint.time = (pg_time_t) time(NULL);
4525         checkPoint.oldestActiveXid = InvalidTransactionId;
4526
4527         ShmemVariableCache->nextXid = checkPoint.nextXid;
4528         ShmemVariableCache->nextOid = checkPoint.nextOid;
4529         ShmemVariableCache->oidCount = 0;
4530         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4531         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
4532         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
4533         SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
4534
4535         /* Set up the XLOG page header */
4536         page->xlp_magic = XLOG_PAGE_MAGIC;
4537         page->xlp_info = XLP_LONG_HEADER;
4538         page->xlp_tli = ThisTimeLineID;
4539         page->xlp_pageaddr = XLogSegSize;
4540         longpage = (XLogLongPageHeader) page;
4541         longpage->xlp_sysid = sysidentifier;
4542         longpage->xlp_seg_size = XLogSegSize;
4543         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4544
4545         /* Insert the initial checkpoint record */
4546         recptr = ((char *) page + SizeOfXLogLongPHD);
4547         record = (XLogRecord *) recptr;
4548         record->xl_prev = 0;
4549         record->xl_xid = InvalidTransactionId;
4550         record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
4551         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4552         record->xl_rmid = RM_XLOG_ID;
4553         recptr += SizeOfXLogRecord;
4554         /* fill the XLogRecordDataHeaderShort struct */
4555         *(recptr++) = XLR_BLOCK_ID_DATA_SHORT;
4556         *(recptr++) = sizeof(checkPoint);
4557         memcpy(recptr, &checkPoint, sizeof(checkPoint));
4558         recptr += sizeof(checkPoint);
4559         Assert(recptr - (char *) record == record->xl_tot_len);
4560
4561         INIT_CRC32C(crc);
4562         COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
4563         COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
4564         FIN_CRC32C(crc);
4565         record->xl_crc = crc;
4566
4567         /* Create first XLOG segment file */
4568         use_existent = false;
4569         openLogFile = XLogFileInit(1, &use_existent, false);
4570
4571         /* Write the first page with the initial record */
4572         errno = 0;
4573         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4574         {
4575                 /* if write didn't set errno, assume problem is no disk space */
4576                 if (errno == 0)
4577                         errno = ENOSPC;
4578                 ereport(PANIC,
4579                                 (errcode_for_file_access(),
4580                           errmsg("could not write bootstrap transaction log file: %m")));
4581         }
4582
4583         if (pg_fsync(openLogFile) != 0)
4584                 ereport(PANIC,
4585                                 (errcode_for_file_access(),
4586                           errmsg("could not fsync bootstrap transaction log file: %m")));
4587
4588         if (close(openLogFile))
4589                 ereport(PANIC,
4590                                 (errcode_for_file_access(),
4591                           errmsg("could not close bootstrap transaction log file: %m")));
4592
4593         openLogFile = -1;
4594
4595         /* Now create pg_control */
4596
4597         memset(ControlFile, 0, sizeof(ControlFileData));
4598         /* Initialize pg_control status fields */
4599         ControlFile->system_identifier = sysidentifier;
4600         ControlFile->state = DB_SHUTDOWNED;
4601         ControlFile->time = checkPoint.time;
4602         ControlFile->checkPoint = checkPoint.redo;
4603         ControlFile->checkPointCopy = checkPoint;
4604         ControlFile->unloggedLSN = 1;
4605
4606         /* Set important parameter values for use when replaying WAL */
4607         ControlFile->MaxConnections = MaxConnections;
4608         ControlFile->max_worker_processes = max_worker_processes;
4609         ControlFile->max_prepared_xacts = max_prepared_xacts;
4610         ControlFile->max_locks_per_xact = max_locks_per_xact;
4611         ControlFile->wal_level = wal_level;
4612         ControlFile->wal_log_hints = wal_log_hints;
4613         ControlFile->track_commit_timestamp = track_commit_timestamp;
4614         ControlFile->data_checksum_version = bootstrap_data_checksum_version;
4615
4616         /* some additional ControlFile fields are set in WriteControlFile() */
4617
4618         WriteControlFile();
4619
4620         /* Bootstrap the commit log, too */
4621         BootStrapCLOG();
4622         BootStrapCommitTs();
4623         BootStrapSUBTRANS();
4624         BootStrapMultiXact();
4625
4626         pfree(buffer);
4627 }
4628
4629 static char *
4630 str_time(pg_time_t tnow)
4631 {
4632         static char buf[128];
4633
4634         pg_strftime(buf, sizeof(buf),
4635                                 "%Y-%m-%d %H:%M:%S %Z",
4636                                 pg_localtime(&tnow, log_timezone));
4637
4638         return buf;
4639 }
4640
4641 /*
4642  * See if there is a recovery command file (recovery.conf), and if so
4643  * read in parameters for archive recovery and XLOG streaming.
4644  *
4645  * The file is parsed using the main configuration parser.
4646  */
4647 static void
4648 readRecoveryCommandFile(void)
4649 {
4650         FILE       *fd;
4651         TimeLineID      rtli = 0;
4652         bool            rtliGiven = false;
4653         ConfigVariable *item,
4654                            *head = NULL,
4655                            *tail = NULL;
4656         bool            recoveryPauseAtTargetSet = false;
4657         bool            recoveryTargetActionSet = false;
4658
4659
4660         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
4661         if (fd == NULL)
4662         {
4663                 if (errno == ENOENT)
4664                         return;                         /* not there, so no archive recovery */
4665                 ereport(FATAL,
4666                                 (errcode_for_file_access(),
4667                                  errmsg("could not open recovery command file \"%s\": %m",
4668                                                 RECOVERY_COMMAND_FILE)));
4669         }
4670
4671         /*
4672          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
4673          * no need to check the return value.
4674          */
4675         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
4676
4677         FreeFile(fd);
4678
4679         for (item = head; item; item = item->next)
4680         {
4681                 if (strcmp(item->name, "restore_command") == 0)
4682                 {
4683                         recoveryRestoreCommand = pstrdup(item->value);
4684                         ereport(DEBUG2,
4685                                         (errmsg_internal("restore_command = '%s'",
4686                                                                          recoveryRestoreCommand)));
4687                 }
4688                 else if (strcmp(item->name, "recovery_end_command") == 0)
4689                 {
4690                         recoveryEndCommand = pstrdup(item->value);
4691                         ereport(DEBUG2,
4692                                         (errmsg_internal("recovery_end_command = '%s'",
4693                                                                          recoveryEndCommand)));
4694                 }
4695                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
4696                 {
4697                         archiveCleanupCommand = pstrdup(item->value);
4698                         ereport(DEBUG2,
4699                                         (errmsg_internal("archive_cleanup_command = '%s'",
4700                                                                          archiveCleanupCommand)));
4701                 }
4702                 else if (strcmp(item->name, "pause_at_recovery_target") == 0)
4703                 {
4704                         bool recoveryPauseAtTarget;
4705
4706                         if (!parse_bool(item->value, &recoveryPauseAtTarget))
4707                                 ereport(ERROR,
4708                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4709                                                  errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
4710
4711                         ereport(DEBUG2,
4712                                         (errmsg_internal("pause_at_recovery_target = '%s'",
4713                                                                          item->value)));
4714
4715                         recoveryTargetAction = recoveryPauseAtTarget ?
4716                                                                          RECOVERY_TARGET_ACTION_PAUSE :
4717                                                                          RECOVERY_TARGET_ACTION_PROMOTE;
4718
4719                         recoveryPauseAtTargetSet = true;
4720                 }
4721                 else if (strcmp(item->name, "recovery_target_action") == 0)
4722                 {
4723                         if (strcmp(item->value, "pause") == 0)
4724                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
4725                         else if (strcmp(item->value, "promote") == 0)
4726                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PROMOTE;
4727                         else if (strcmp(item->value, "shutdown") == 0)
4728                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
4729                         else
4730                                 ereport(ERROR,
4731                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4732                                                  errmsg("invalid value for recovery parameter \"%s\"",
4733                                                                 "recovery_target_action"),
4734                                                  errhint("The allowed values are \"pause\", \"promote\" and \"shutdown\".")));
4735
4736                         ereport(DEBUG2,
4737                                         (errmsg_internal("recovery_target_action = '%s'",
4738                                                                          item->value)));
4739
4740                         recoveryTargetActionSet = true;
4741                 }
4742                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
4743                 {
4744                         rtliGiven = true;
4745                         if (strcmp(item->value, "latest") == 0)
4746                                 rtli = 0;
4747                         else
4748                         {
4749                                 errno = 0;
4750                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
4751                                 if (errno == EINVAL || errno == ERANGE)
4752                                         ereport(FATAL,
4753                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
4754                                                                         item->value)));
4755                         }
4756                         if (rtli)
4757                                 ereport(DEBUG2,
4758                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
4759                         else
4760                                 ereport(DEBUG2,
4761                                          (errmsg_internal("recovery_target_timeline = latest")));
4762                 }
4763                 else if (strcmp(item->name, "recovery_target_xid") == 0)
4764                 {
4765                         errno = 0;
4766                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
4767                         if (errno == EINVAL || errno == ERANGE)
4768                                 ereport(FATAL,
4769                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
4770                                                  item->value)));
4771                         ereport(DEBUG2,
4772                                         (errmsg_internal("recovery_target_xid = %u",
4773                                                                          recoveryTargetXid)));
4774                         recoveryTarget = RECOVERY_TARGET_XID;
4775                 }
4776                 else if (strcmp(item->name, "recovery_target_time") == 0)
4777                 {
4778                         recoveryTarget = RECOVERY_TARGET_TIME;
4779
4780                         /*
4781                          * Convert the time string given by the user to TimestampTz form.
4782                          */
4783                         recoveryTargetTime =
4784                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
4785                                                                                                 CStringGetDatum(item->value),
4786                                                                                                 ObjectIdGetDatum(InvalidOid),
4787                                                                                                                 Int32GetDatum(-1)));
4788                         ereport(DEBUG2,
4789                                         (errmsg_internal("recovery_target_time = '%s'",
4790                                                                    timestamptz_to_str(recoveryTargetTime))));
4791                 }
4792                 else if (strcmp(item->name, "recovery_target_name") == 0)
4793                 {
4794                         recoveryTarget = RECOVERY_TARGET_NAME;
4795
4796                         recoveryTargetName = pstrdup(item->value);
4797                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
4798                                 ereport(FATAL,
4799                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4800                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
4801                                                                 MAXFNAMELEN - 1)));
4802
4803                         ereport(DEBUG2,
4804                                         (errmsg_internal("recovery_target_name = '%s'",
4805                                                                          recoveryTargetName)));
4806                 }
4807                 else if (strcmp(item->name, "recovery_target") == 0)
4808                 {
4809                         if (strcmp(item->value, "immediate") == 0)
4810                                 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4811                         else
4812                                 ereport(ERROR,
4813                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4814                                                  errmsg("invalid value for recovery parameter \"recovery_target\""),
4815                                                  errhint("The only allowed value is \"immediate\".")));
4816                         ereport(DEBUG2,
4817                                         (errmsg_internal("recovery_target = '%s'",
4818                                                                          item->value)));
4819                 }
4820                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
4821                 {
4822                         /*
4823                          * does nothing if a recovery_target is not also set
4824                          */
4825                         if (!parse_bool(item->value, &recoveryTargetInclusive))
4826                                 ereport(ERROR,
4827                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4828                                                  errmsg("parameter \"%s\" requires a Boolean value",
4829                                                                 "recovery_target_inclusive")));
4830                         ereport(DEBUG2,
4831                                         (errmsg_internal("recovery_target_inclusive = %s",
4832                                                                          item->value)));
4833                 }
4834                 else if (strcmp(item->name, "standby_mode") == 0)
4835                 {
4836                         if (!parse_bool(item->value, &StandbyModeRequested))
4837                                 ereport(ERROR,
4838                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4839                                                  errmsg("parameter \"%s\" requires a Boolean value",
4840                                                                 "standby_mode")));
4841                         ereport(DEBUG2,
4842                                         (errmsg_internal("standby_mode = '%s'", item->value)));
4843                 }
4844                 else if (strcmp(item->name, "primary_conninfo") == 0)
4845                 {
4846                         PrimaryConnInfo = pstrdup(item->value);
4847                         ereport(DEBUG2,
4848                                         (errmsg_internal("primary_conninfo = '%s'",
4849                                                                          PrimaryConnInfo)));
4850                 }
4851                 else if (strcmp(item->name, "primary_slot_name") == 0)
4852                 {
4853                         ReplicationSlotValidateName(item->value, ERROR);
4854                         PrimarySlotName = pstrdup(item->value);
4855                         ereport(DEBUG2,
4856                                         (errmsg_internal("primary_slot_name = '%s'",
4857                                                                          PrimarySlotName)));
4858                 }
4859                 else if (strcmp(item->name, "trigger_file") == 0)
4860                 {
4861                         TriggerFile = pstrdup(item->value);
4862                         ereport(DEBUG2,
4863                                         (errmsg_internal("trigger_file = '%s'",
4864                                                                          TriggerFile)));
4865                 }
4866                 else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
4867                 {
4868                         const char *hintmsg;
4869
4870                         if (!parse_int(item->value, &recovery_min_apply_delay, GUC_UNIT_MS,
4871                                                    &hintmsg))
4872                                 ereport(ERROR,
4873                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4874                                                  errmsg("parameter \"%s\" requires a temporal value",
4875                                                                 "recovery_min_apply_delay"),
4876                                                  hintmsg ? errhint("%s", _(hintmsg)) : 0));
4877                         ereport(DEBUG2,
4878                                         (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
4879                 }
4880                 else
4881                         ereport(FATAL,
4882                                         (errmsg("unrecognized recovery parameter \"%s\"",
4883                                                         item->name)));
4884         }
4885
4886         /*
4887          * Check for compulsory parameters
4888          */
4889         if (StandbyModeRequested)
4890         {
4891                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
4892                         ereport(WARNING,
4893                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
4894                                                         RECOVERY_COMMAND_FILE),
4895                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
4896         }
4897         else
4898         {
4899                 if (recoveryRestoreCommand == NULL)
4900                         ereport(FATAL,
4901                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
4902                                                         RECOVERY_COMMAND_FILE)));
4903         }
4904
4905         /*
4906          * Check for mutually exclusive parameters
4907          */
4908         if (recoveryPauseAtTargetSet && recoveryTargetActionSet)
4909                 ereport(ERROR,
4910                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4911                                  errmsg("cannot set both \"%s\" and \"%s\" recovery parameters",
4912                                                 "pause_at_recovery_target",
4913                                                 "recovery_target_action"),
4914                                  errhint("The \"pause_at_recovery_target\" is deprecated.")));
4915
4916
4917         /*
4918          * Override any inconsistent requests. Note that this is a change
4919          * of behaviour in 9.5; prior to this we simply ignored a request
4920          * to pause if hot_standby = off, which was surprising behaviour.
4921          */
4922         if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
4923                 recoveryTargetActionSet &&
4924                 standbyState == STANDBY_DISABLED)
4925                         recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
4926
4927         /* Enable fetching from archive recovery area */
4928         ArchiveRecoveryRequested = true;
4929
4930         /*
4931          * If user specified recovery_target_timeline, validate it or compute the
4932          * "latest" value.  We can't do this until after we've gotten the restore
4933          * command and set InArchiveRecovery, because we need to fetch timeline
4934          * history files from the archive.
4935          */
4936         if (rtliGiven)
4937         {
4938                 if (rtli)
4939                 {
4940                         /* Timeline 1 does not have a history file, all else should */
4941                         if (rtli != 1 && !existsTimeLineHistory(rtli))
4942                                 ereport(FATAL,
4943                                                 (errmsg("recovery target timeline %u does not exist",
4944                                                                 rtli)));
4945                         recoveryTargetTLI = rtli;
4946                         recoveryTargetIsLatest = false;
4947                 }
4948                 else
4949                 {
4950                         /* We start the "latest" search from pg_control's timeline */
4951                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
4952                         recoveryTargetIsLatest = true;
4953                 }
4954         }
4955
4956         FreeConfigVariables(head);
4957 }
4958
4959 /*
4960  * Exit archive-recovery state
4961  */
4962 static void
4963 exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
4964 {
4965         char            recoveryPath[MAXPGPATH];
4966         char            xlogfname[MAXFNAMELEN];
4967
4968         /*
4969          * We are no longer in archive recovery state.
4970          */
4971         InArchiveRecovery = false;
4972
4973         /*
4974          * Update min recovery point one last time.
4975          */
4976         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
4977
4978         /*
4979          * If the ending log segment is still open, close it (to avoid problems on
4980          * Windows with trying to rename or delete an open file).
4981          */
4982         if (readFile >= 0)
4983         {
4984                 close(readFile);
4985                 readFile = -1;
4986         }
4987
4988         /*
4989          * If we are establishing a new timeline, we have to copy data from the
4990          * last WAL segment of the old timeline to create a starting WAL segment
4991          * for the new timeline.
4992          *
4993          * Notify the archiver that the last WAL segment of the old timeline is
4994          * ready to copy to archival storage if its .done file doesn't exist
4995          * (e.g., if it's the restored WAL file, it's expected to have .done file).
4996          * Otherwise, it is not archived for a while.
4997          */
4998         if (endTLI != ThisTimeLineID)
4999         {
5000                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo);
5001
5002                 /* Create .ready file only when neither .ready nor .done files exist */
5003                 if (XLogArchivingActive())
5004                 {
5005                         XLogFileName(xlogfname, endTLI, endLogSegNo);
5006                         XLogArchiveCheckDone(xlogfname);
5007                 }
5008         }
5009
5010         /*
5011          * Let's just make real sure there are not .ready or .done flags posted
5012          * for the new segment.
5013          */
5014         XLogFileName(xlogfname, ThisTimeLineID, endLogSegNo);
5015         XLogArchiveCleanup(xlogfname);
5016
5017         /*
5018          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5019          * of it.
5020          */
5021         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5022         unlink(recoveryPath);           /* ignore any error */
5023
5024         /* Get rid of any remaining recovered timeline-history file, too */
5025         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5026         unlink(recoveryPath);           /* ignore any error */
5027
5028         /*
5029          * Rename the config file out of the way, so that we don't accidentally
5030          * re-enter archive recovery mode in a subsequent crash.
5031          */
5032         unlink(RECOVERY_COMMAND_DONE);
5033         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5034                 ereport(FATAL,
5035                                 (errcode_for_file_access(),
5036                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5037                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5038
5039         ereport(LOG,
5040                         (errmsg("archive recovery complete")));
5041 }
5042
5043 /*
5044  * Extract timestamp from WAL record.
5045  *
5046  * If the record contains a timestamp, returns true, and saves the timestamp
5047  * in *recordXtime. If the record type has no timestamp, returns false.
5048  * Currently, only transaction commit/abort records and restore points contain
5049  * timestamps.
5050  */
5051 static bool
5052 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5053 {
5054         uint8           record_info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5055         uint8           rmid = XLogRecGetRmid(record);
5056
5057         if (rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5058         {
5059                 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5060                 return true;
5061         }
5062         if (rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
5063         {
5064                 *recordXtime = ((xl_xact_commit_compact *) XLogRecGetData(record))->xact_time;
5065                 return true;
5066         }
5067         if (rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
5068         {
5069                 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5070                 return true;
5071         }
5072         if (rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_PREPARED)
5073         {
5074                 *recordXtime = ((xl_xact_commit_prepared *) XLogRecGetData(record))->crec.xact_time;
5075                 return true;
5076         }
5077         if (rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
5078         {
5079                 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5080                 return true;
5081         }
5082         if (rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT_PREPARED)
5083         {
5084                 *recordXtime = ((xl_xact_abort_prepared *) XLogRecGetData(record))->arec.xact_time;
5085                 return true;
5086         }
5087         return false;
5088 }
5089
5090 /*
5091  * For point-in-time recovery, this function decides whether we want to
5092  * stop applying the XLOG before the current record.
5093  *
5094  * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
5095  * information is saved in recoveryStopXid et al for use in annotating the
5096  * new timeline's history file.
5097  */
5098 static bool
5099 recoveryStopsBefore(XLogReaderState *record)
5100 {
5101         bool            stopsHere = false;
5102         uint8           record_info;
5103         bool            isCommit;
5104         TimestampTz recordXtime = 0;
5105         TransactionId recordXid;
5106
5107         /* Check if we should stop as soon as reaching consistency */
5108         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5109         {
5110                 ereport(LOG,
5111                                 (errmsg("recovery stopping after reaching consistency")));
5112
5113                 recoveryStopAfter = false;
5114                 recoveryStopXid = InvalidTransactionId;
5115                 recoveryStopTime = 0;
5116                 recoveryStopName[0] = '\0';
5117                 return true;
5118         }
5119
5120         /* Otherwise we only consider stopping before COMMIT or ABORT records. */
5121         if (XLogRecGetRmid(record) != RM_XACT_ID)
5122                 return false;
5123         record_info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5124
5125         if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5126         {
5127                 isCommit = true;
5128                 recordXid = XLogRecGetXid(record);
5129         }
5130         else if (record_info == XLOG_XACT_COMMIT_PREPARED)
5131         {
5132                 isCommit = true;
5133                 recordXid = ((xl_xact_commit_prepared *) XLogRecGetData(record))->xid;
5134         }
5135         else if (record_info == XLOG_XACT_ABORT)
5136         {
5137                 isCommit = false;
5138                 recordXid = XLogRecGetXid(record);
5139         }
5140         else if (record_info == XLOG_XACT_ABORT_PREPARED)
5141         {
5142                 isCommit = false;
5143                 recordXid = ((xl_xact_abort_prepared *) XLogRecGetData(record))->xid;
5144         }
5145         else
5146                 return false;
5147
5148         if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
5149         {
5150                 /*
5151                  * There can be only one transaction end record with this exact
5152                  * transactionid
5153                  *
5154                  * when testing for an xid, we MUST test for equality only, since
5155                  * transactions are numbered in the order they start, not the order
5156                  * they complete. A higher numbered xid will complete before you about
5157                  * 50% of the time...
5158                  */
5159                 stopsHere = (recordXid == recoveryTargetXid);
5160         }
5161
5162         if (recoveryTarget == RECOVERY_TARGET_TIME &&
5163                 getRecordTimestamp(record, &recordXtime))
5164         {
5165                 /*
5166                  * There can be many transactions that share the same commit time, so
5167                  * we stop after the last one, if we are inclusive, or stop at the
5168                  * first one if we are exclusive
5169                  */
5170                 if (recoveryTargetInclusive)
5171                         stopsHere = (recordXtime > recoveryTargetTime);
5172                 else
5173                         stopsHere = (recordXtime >= recoveryTargetTime);
5174         }
5175
5176         if (stopsHere)
5177         {
5178                 recoveryStopAfter = false;
5179                 recoveryStopXid = recordXid;
5180                 recoveryStopTime = recordXtime;
5181                 recoveryStopName[0] = '\0';
5182
5183                 if (isCommit)
5184                 {
5185                         ereport(LOG,
5186                                         (errmsg("recovery stopping before commit of transaction %u, time %s",
5187                                                         recoveryStopXid,
5188                                                         timestamptz_to_str(recoveryStopTime))));
5189                 }
5190                 else
5191                 {
5192                         ereport(LOG,
5193                                         (errmsg("recovery stopping before abort of transaction %u, time %s",
5194                                                         recoveryStopXid,
5195                                                         timestamptz_to_str(recoveryStopTime))));
5196                 }
5197         }
5198
5199         return stopsHere;
5200 }
5201
5202 /*
5203  * Same as recoveryStopsBefore, but called after applying the record.
5204  *
5205  * We also track the timestamp of the latest applied COMMIT/ABORT
5206  * record in XLogCtl->recoveryLastXTime.
5207  */
5208 static bool
5209 recoveryStopsAfter(XLogReaderState *record)
5210 {
5211         uint8           record_info;
5212         uint8           rmid;
5213         TimestampTz recordXtime;
5214
5215         record_info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5216         rmid = XLogRecGetRmid(record);
5217
5218         /*
5219          * There can be many restore points that share the same name; we stop at
5220          * the first one.
5221          */
5222         if (recoveryTarget == RECOVERY_TARGET_NAME &&
5223                 rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5224         {
5225                 xl_restore_point *recordRestorePointData;
5226
5227                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5228
5229                 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5230                 {
5231                         recoveryStopAfter = true;
5232                         recoveryStopXid = InvalidTransactionId;
5233                         (void) getRecordTimestamp(record, &recoveryStopTime);
5234                         strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5235
5236                         ereport(LOG,
5237                                 (errmsg("recovery stopping at restore point \"%s\", time %s",
5238                                                 recoveryStopName,
5239                                                 timestamptz_to_str(recoveryStopTime))));
5240                         return true;
5241                 }
5242         }
5243
5244         if (rmid == RM_XACT_ID &&
5245                 (record_info == XLOG_XACT_COMMIT_COMPACT ||
5246                  record_info == XLOG_XACT_COMMIT ||
5247                  record_info == XLOG_XACT_COMMIT_PREPARED ||
5248                  record_info == XLOG_XACT_ABORT ||
5249                  record_info == XLOG_XACT_ABORT_PREPARED))
5250         {
5251                 TransactionId recordXid;
5252
5253                 /* Update the last applied transaction timestamp */
5254                 if (getRecordTimestamp(record, &recordXtime))
5255                         SetLatestXTime(recordXtime);
5256
5257                 /* Extract the XID of the committed/aborted transaction */
5258                 if (record_info == XLOG_XACT_COMMIT_PREPARED)
5259                         recordXid = ((xl_xact_commit_prepared *) XLogRecGetData(record))->xid;
5260                 else if (record_info == XLOG_XACT_ABORT_PREPARED)
5261                         recordXid = ((xl_xact_abort_prepared *) XLogRecGetData(record))->xid;
5262                 else
5263                         recordXid = XLogRecGetXid(record);
5264
5265                 /*
5266                  * There can be only one transaction end record with this exact
5267                  * transactionid
5268                  *
5269                  * when testing for an xid, we MUST test for equality only, since
5270                  * transactions are numbered in the order they start, not the order
5271                  * they complete. A higher numbered xid will complete before you about
5272                  * 50% of the time...
5273                  */
5274                 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
5275                         recordXid == recoveryTargetXid)
5276                 {
5277                         recoveryStopAfter = true;
5278                         recoveryStopXid = recordXid;
5279                         recoveryStopTime = recordXtime;
5280                         recoveryStopName[0] = '\0';
5281
5282                         if (record_info == XLOG_XACT_COMMIT_COMPACT ||
5283                                 record_info == XLOG_XACT_COMMIT ||
5284                                 record_info == XLOG_XACT_COMMIT_PREPARED)
5285                         {
5286                                 ereport(LOG,
5287                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5288                                                                 recoveryStopXid,
5289                                                                 timestamptz_to_str(recoveryStopTime))));
5290                         }
5291                         else if (record_info == XLOG_XACT_ABORT ||
5292                                          record_info == XLOG_XACT_ABORT_PREPARED)
5293                         {
5294                                 ereport(LOG,
5295                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5296                                                                 recoveryStopXid,
5297                                                                 timestamptz_to_str(recoveryStopTime))));
5298                         }
5299                         return true;
5300                 }
5301         }
5302
5303         /* Check if we should stop as soon as reaching consistency */
5304         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5305         {
5306                 ereport(LOG,
5307                                 (errmsg("recovery stopping after reaching consistency")));
5308
5309                 recoveryStopAfter = true;
5310                 recoveryStopXid = InvalidTransactionId;
5311                 recoveryStopTime = 0;
5312                 recoveryStopName[0] = '\0';
5313                 return true;
5314         }
5315
5316         return false;
5317 }
5318
5319 /*
5320  * Wait until shared recoveryPause flag is cleared.
5321  *
5322  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5323  * Probably not worth the trouble though.  This state shouldn't be one that
5324  * anyone cares about server power consumption in.
5325  */
5326 static void
5327 recoveryPausesHere(void)
5328 {
5329         /* Don't pause unless users can connect! */
5330         if (!LocalHotStandbyActive)
5331                 return;
5332
5333         ereport(LOG,
5334                         (errmsg("recovery has paused"),
5335                          errhint("Execute pg_xlog_replay_resume() to continue.")));
5336
5337         while (RecoveryIsPaused())
5338         {
5339                 pg_usleep(1000000L);    /* 1000 ms */
5340                 HandleStartupProcInterrupts();
5341         }
5342 }
5343
5344 bool
5345 RecoveryIsPaused(void)
5346 {
5347         bool            recoveryPause;
5348
5349         SpinLockAcquire(&XLogCtl->info_lck);
5350         recoveryPause = XLogCtl->recoveryPause;
5351         SpinLockRelease(&XLogCtl->info_lck);
5352
5353         return recoveryPause;
5354 }
5355
5356 void
5357 SetRecoveryPause(bool recoveryPause)
5358 {
5359         SpinLockAcquire(&XLogCtl->info_lck);
5360         XLogCtl->recoveryPause = recoveryPause;
5361         SpinLockRelease(&XLogCtl->info_lck);
5362 }
5363
5364 /*
5365  * When recovery_min_apply_delay is set, we wait long enough to make sure
5366  * certain record types are applied at least that interval behind the master.
5367  *
5368  * Returns true if we waited.
5369  *
5370  * Note that the delay is calculated between the WAL record log time and
5371  * the current time on standby. We would prefer to keep track of when this
5372  * standby received each WAL record, which would allow a more consistent
5373  * approach and one not affected by time synchronisation issues, but that
5374  * is significantly more effort and complexity for little actual gain in
5375  * usability.
5376  */
5377 static bool
5378 recoveryApplyDelay(XLogReaderState *record)
5379 {
5380         uint8           record_info;
5381         TimestampTz xtime;
5382         long            secs;
5383         int                     microsecs;
5384
5385         /* nothing to do if no delay configured */
5386         if (recovery_min_apply_delay == 0)
5387                 return false;
5388
5389         /*
5390          * Is it a COMMIT record?
5391          *
5392          * We deliberately choose not to delay aborts since they have no effect on
5393          * MVCC. We already allow replay of records that don't have a timestamp,
5394          * so there is already opportunity for issues caused by early conflicts on
5395          * standbys.
5396          */
5397         record_info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5398         if (!(XLogRecGetRmid(record) == RM_XACT_ID &&
5399                   (record_info == XLOG_XACT_COMMIT_COMPACT ||
5400                    record_info == XLOG_XACT_COMMIT ||
5401                    record_info == XLOG_XACT_COMMIT_PREPARED)))
5402                 return false;
5403
5404         if (!getRecordTimestamp(record, &xtime))
5405                 return false;
5406
5407         recoveryDelayUntilTime =
5408                 TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
5409
5410         /*
5411          * Exit without arming the latch if it's already past time to apply this
5412          * record
5413          */
5414         TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
5415                                                 &secs, &microsecs);
5416         if (secs <= 0 && microsecs <= 0)
5417                 return false;
5418
5419         while (true)
5420         {
5421                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
5422
5423                 /* might change the trigger file's location */
5424                 HandleStartupProcInterrupts();
5425
5426                 if (CheckForStandbyTrigger())
5427                         break;
5428
5429                 /*
5430                  * Wait for difference between GetCurrentTimestamp() and
5431                  * recoveryDelayUntilTime
5432                  */
5433                 TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
5434                                                         &secs, &microsecs);
5435
5436                 if (secs <= 0 && microsecs <= 0)
5437                         break;
5438
5439                 elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
5440                          secs, microsecs / 1000);
5441
5442                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
5443                                   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
5444                                   secs * 1000L + microsecs / 1000);
5445         }
5446         return true;
5447 }
5448
5449 /*
5450  * Save timestamp of latest processed commit/abort record.
5451  *
5452  * We keep this in XLogCtl, not a simple static variable, so that it can be
5453  * seen by processes other than the startup process.  Note in particular
5454  * that CreateRestartPoint is executed in the checkpointer.
5455  */
5456 static void
5457 SetLatestXTime(TimestampTz xtime)
5458 {
5459         SpinLockAcquire(&XLogCtl->info_lck);
5460         XLogCtl->recoveryLastXTime = xtime;
5461         SpinLockRelease(&XLogCtl->info_lck);
5462 }
5463
5464 /*
5465  * Fetch timestamp of latest processed commit/abort record.
5466  */
5467 TimestampTz
5468 GetLatestXTime(void)
5469 {
5470         TimestampTz xtime;
5471
5472         SpinLockAcquire(&XLogCtl->info_lck);
5473         xtime = XLogCtl->recoveryLastXTime;
5474         SpinLockRelease(&XLogCtl->info_lck);
5475
5476         return xtime;
5477 }
5478
5479 /*
5480  * Save timestamp of the next chunk of WAL records to apply.
5481  *
5482  * We keep this in XLogCtl, not a simple static variable, so that it can be
5483  * seen by all backends.
5484  */
5485 static void
5486 SetCurrentChunkStartTime(TimestampTz xtime)
5487 {
5488         SpinLockAcquire(&XLogCtl->info_lck);
5489         XLogCtl->currentChunkStartTime = xtime;
5490         SpinLockRelease(&XLogCtl->info_lck);
5491 }
5492
5493 /*
5494  * Fetch timestamp of latest processed commit/abort record.
5495  * Startup process maintains an accurate local copy in XLogReceiptTime
5496  */
5497 TimestampTz
5498 GetCurrentChunkReplayStartTime(void)
5499 {
5500         TimestampTz xtime;
5501
5502         SpinLockAcquire(&XLogCtl->info_lck);
5503         xtime = XLogCtl->currentChunkStartTime;
5504         SpinLockRelease(&XLogCtl->info_lck);
5505
5506         return xtime;
5507 }
5508
5509 /*
5510  * Returns time of receipt of current chunk of XLOG data, as well as
5511  * whether it was received from streaming replication or from archives.
5512  */
5513 void
5514 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
5515 {
5516         /*
5517          * This must be executed in the startup process, since we don't export the
5518          * relevant state to shared memory.
5519          */
5520         Assert(InRecovery);
5521
5522         *rtime = XLogReceiptTime;
5523         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
5524 }
5525
/*
 * RecoveryRequiresIntParameter
 *		Raise an ERROR if an integer parameter is set lower on this server
 *		(currValue) than the required minimum (minValue).
 *
 * Note that the text field supplied is a parameter name and does not require
 * translation.
 *
 * Both value arguments are captured into int locals, so each argument is
 * evaluated exactly once (even on the error path) and is guaranteed to match
 * the %d conversions in the message.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	int			rrip_curr = (currValue); \
	int			rrip_min = (minValue); \
	if (rrip_curr < rrip_min) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						(param_name), \
						rrip_curr, \
						rrip_min))); \
} while (0)
5542
5543 /*
5544  * Check to see if required parameters are set high enough on this server
5545  * for various aspects of recovery operation.
5546  */
5547 static void
5548 CheckRequiredParameterValues(void)
5549 {
5550         /*
5551          * For archive recovery, the WAL must be generated with at least 'archive'
5552          * wal_level.
5553          */
5554         if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
5555         {
5556                 ereport(WARNING,
5557                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
5558                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
5559         }
5560
5561         /*
5562          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
5563          * we must have at least as many backend slots as the primary.
5564          */
5565         if (ArchiveRecoveryRequested && EnableHotStandby)
5566         {
5567                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
5568                         ereport(ERROR,
5569                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" or higher on the master server"),
5570                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
5571
5572                 /* We ignore autovacuum_max_workers when we make this test. */
5573                 RecoveryRequiresIntParameter("max_connections",
5574                                                                          MaxConnections,
5575                                                                          ControlFile->MaxConnections);
5576                 RecoveryRequiresIntParameter("max_worker_processes",
5577                                                                          max_worker_processes,
5578                                                                          ControlFile->max_worker_processes);
5579                 RecoveryRequiresIntParameter("max_prepared_transactions",
5580                                                                          max_prepared_xacts,
5581                                                                          ControlFile->max_prepared_xacts);
5582                 RecoveryRequiresIntParameter("max_locks_per_transaction",
5583                                                                          max_locks_per_xact,
5584                                                                          ControlFile->max_locks_per_xact);
5585         }
5586 }
5587
5588 /*
5589  * This must be called ONCE during postmaster or standalone-backend startup
5590  */
5591 void
5592 StartupXLOG(void)
5593 {
5594         XLogCtlInsert *Insert;
5595         CheckPoint      checkPoint;
5596         bool            wasShutdown;
5597         bool            reachedStopPoint = false;
5598         bool            haveBackupLabel = false;
5599         XLogRecPtr      RecPtr,
5600                                 checkPointLoc,
5601                                 EndOfLog;
5602         XLogSegNo       endLogSegNo;
5603         TimeLineID      PrevTimeLineID;
5604         XLogRecord *record;
5605         TransactionId oldestActiveXID;
5606         bool            backupEndRequired = false;
5607         bool            backupFromStandby = false;
5608         DBState         dbstate_at_startup;
5609         XLogReaderState *xlogreader;
5610         XLogPageReadPrivate private;
5611         bool            fast_promoted = false;
5612
5613         /*
5614          * Read control file and check XLOG status looks valid.
5615          *
5616          * Note: in most control paths, *ControlFile is already valid and we need
5617          * not do ReadControlFile() here, but might as well do it to be sure.
5618          */
5619         ReadControlFile();
5620
5621         if (ControlFile->state < DB_SHUTDOWNED ||
5622                 ControlFile->state > DB_IN_PRODUCTION ||
5623                 !XRecOffIsValid(ControlFile->checkPoint))
5624                 ereport(FATAL,
5625                                 (errmsg("control file contains invalid data")));
5626
5627         if (ControlFile->state == DB_SHUTDOWNED)
5628         {
5629                 /* This is the expected case, so don't be chatty in standalone mode */
5630                 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
5631                                 (errmsg("database system was shut down at %s",
5632                                                 str_time(ControlFile->time))));
5633         }
5634         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
5635                 ereport(LOG,
5636                                 (errmsg("database system was shut down in recovery at %s",
5637                                                 str_time(ControlFile->time))));
5638         else if (ControlFile->state == DB_SHUTDOWNING)
5639                 ereport(LOG,
5640                                 (errmsg("database system shutdown was interrupted; last known up at %s",
5641                                                 str_time(ControlFile->time))));
5642         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
5643                 ereport(LOG,
5644                    (errmsg("database system was interrupted while in recovery at %s",
5645                                    str_time(ControlFile->time)),
5646                         errhint("This probably means that some data is corrupted and"
5647                                         " you will have to use the last backup for recovery.")));
5648         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
5649                 ereport(LOG,
5650                                 (errmsg("database system was interrupted while in recovery at log time %s",
5651                                                 str_time(ControlFile->checkPointCopy.time)),
5652                                  errhint("If this has occurred more than once some data might be corrupted"
5653                           " and you might need to choose an earlier recovery target.")));
5654         else if (ControlFile->state == DB_IN_PRODUCTION)
5655                 ereport(LOG,
5656                           (errmsg("database system was interrupted; last known up at %s",
5657                                           str_time(ControlFile->time))));
5658
5659         /* This is just to allow attaching to startup process with a debugger */
5660 #ifdef XLOG_REPLAY_DELAY
5661         if (ControlFile->state != DB_SHUTDOWNED)
5662                 pg_usleep(60000000L);
5663 #endif
5664
5665         /*
5666          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
5667          * someone has performed a copy for PITR, these directories may have been
5668          * excluded and need to be re-created.
5669          */
5670         ValidateXLOGDirectoryStructure();
5671
5672         /*
5673          * Clear out any old relcache cache files.  This is *necessary* if we do
5674          * any WAL replay, since that would probably result in the cache files
5675          * being out of sync with database reality.  In theory we could leave them
5676          * in place if the database had been cleanly shut down, but it seems
5677          * safest to just remove them always and let them be rebuilt during the
5678          * first backend startup.
5679          */
5680         RelationCacheInitFileRemove();
5681
5682         /*
5683          * Initialize on the assumption we want to recover to the latest timeline
5684          * that's active according to pg_control.
5685          */
5686         if (ControlFile->minRecoveryPointTLI >
5687                 ControlFile->checkPointCopy.ThisTimeLineID)
5688                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
5689         else
5690                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
5691
5692         /*
5693          * Check for recovery control file, and if so set up state for offline
5694          * recovery
5695          */
5696         readRecoveryCommandFile();
5697
5698         /*
5699          * Save archive_cleanup_command in shared memory so that other processes
5700          * can see it.
5701          */
5702         strlcpy(XLogCtl->archiveCleanupCommand,
5703                         archiveCleanupCommand ? archiveCleanupCommand : "",
5704                         sizeof(XLogCtl->archiveCleanupCommand));
5705
5706         if (ArchiveRecoveryRequested)
5707         {
5708                 if (StandbyModeRequested)
5709                         ereport(LOG,
5710                                         (errmsg("entering standby mode")));
5711                 else if (recoveryTarget == RECOVERY_TARGET_XID)
5712                         ereport(LOG,
5713                                         (errmsg("starting point-in-time recovery to XID %u",
5714                                                         recoveryTargetXid)));
5715                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
5716                         ereport(LOG,
5717                                         (errmsg("starting point-in-time recovery to %s",
5718                                                         timestamptz_to_str(recoveryTargetTime))));
5719                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
5720                         ereport(LOG,
5721                                         (errmsg("starting point-in-time recovery to \"%s\"",
5722                                                         recoveryTargetName)));
5723                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
5724                         ereport(LOG,
5725                                         (errmsg("starting point-in-time recovery to earliest consistent point")));
5726                 else
5727                         ereport(LOG,
5728                                         (errmsg("starting archive recovery")));
5729         }
5730
5731         /*
5732          * Take ownership of the wakeup latch if we're going to sleep during
5733          * recovery.
5734          */
5735         if (StandbyModeRequested)
5736                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
5737
5738         /* Set up XLOG reader facility */
5739         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
5740         xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
5741         if (!xlogreader)
5742                 ereport(ERROR,
5743                                 (errcode(ERRCODE_OUT_OF_MEMORY),
5744                                  errmsg("out of memory"),
5745                    errdetail("Failed while allocating an XLog reading processor.")));
5746         xlogreader->system_identifier = ControlFile->system_identifier;
5747
5748         if (read_backup_label(&checkPointLoc, &backupEndRequired,
5749                                                   &backupFromStandby))
5750         {
5751                 /*
5752                  * Archive recovery was requested, and thanks to the backup label
5753                  * file, we know how far we need to replay to reach consistency. Enter
5754                  * archive recovery directly.
5755                  */
5756                 InArchiveRecovery = true;
5757                 if (StandbyModeRequested)
5758                         StandbyMode = true;
5759
5760                 /*
5761                  * When a backup_label file is present, we want to roll forward from
5762                  * the checkpoint it identifies, rather than using pg_control.
5763                  */
5764                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
5765                 if (record != NULL)
5766                 {
5767                         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
5768                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5769                         ereport(DEBUG1,
5770                                         (errmsg("checkpoint record is at %X/%X",
5771                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5772                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
5773
5774                         /*
5775                          * Make sure that REDO location exists. This may not be the case
5776                          * if there was a crash during an online backup, which left a
5777                          * backup_label around that references a WAL segment that's
5778                          * already been archived.
5779                          */
5780                         if (checkPoint.redo < checkPointLoc)
5781                         {
5782                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
5783                                         ereport(FATAL,
5784                                                         (errmsg("could not find redo location referenced by checkpoint record"),
5785                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5786                         }
5787                 }
5788                 else
5789                 {
5790                         ereport(FATAL,
5791                                         (errmsg("could not locate required checkpoint record"),
5792                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5793                         wasShutdown = false;    /* keep compiler quiet */
5794                 }
5795                 /* set flag to delete it later */
5796                 haveBackupLabel = true;
5797         }
5798         else
5799         {
5800                 /*
5801                  * It's possible that archive recovery was requested, but we don't
5802                  * know how far we need to replay the WAL before we reach consistency.
5803                  * This can happen for example if a base backup is taken from a
5804                  * running server using an atomic filesystem snapshot, without calling
5805                  * pg_start/stop_backup. Or if you just kill a running master server
5806                  * and put it into archive recovery by creating a recovery.conf file.
5807                  *
5808                  * Our strategy in that case is to perform crash recovery first,
5809                  * replaying all the WAL present in pg_xlog, and only enter archive
5810                  * recovery after that.
5811                  *
5812                  * But usually we already know how far we need to replay the WAL (up
5813                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
5814                  * end-of-backup record), and we can enter archive recovery directly.
5815                  */
5816                 if (ArchiveRecoveryRequested &&
5817                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
5818                          ControlFile->backupEndRequired ||
5819                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
5820                          ControlFile->state == DB_SHUTDOWNED))
5821                 {
5822                         InArchiveRecovery = true;
5823                         if (StandbyModeRequested)
5824                                 StandbyMode = true;
5825                 }
5826
5827                 /*
5828                  * Get the last valid checkpoint record.  If the latest one according
5829                  * to pg_control is broken, try the next-to-last one.
5830                  */
5831                 checkPointLoc = ControlFile->checkPoint;
5832                 RedoStartLSN = ControlFile->checkPointCopy.redo;
5833                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
5834                 if (record != NULL)
5835                 {
5836                         ereport(DEBUG1,
5837                                         (errmsg("checkpoint record is at %X/%X",
5838                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5839                 }
5840                 else if (StandbyMode)
5841                 {
5842                         /*
5843                          * The last valid checkpoint record required for a streaming
5844                          * recovery exists in neither standby nor the primary.
5845                          */
5846                         ereport(PANIC,
5847                                         (errmsg("could not locate a valid checkpoint record")));
5848                 }
5849                 else
5850                 {
5851                         checkPointLoc = ControlFile->prevCheckPoint;
5852                         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
5853                         if (record != NULL)
5854                         {
5855                                 ereport(LOG,
5856                                                 (errmsg("using previous checkpoint record at %X/%X",
5857                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5858                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
5859                         }
5860                         else
5861                                 ereport(PANIC,
5862                                          (errmsg("could not locate a valid checkpoint record")));
5863                 }
5864                 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
5865                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5866         }
5867
5868         /*
5869          * If the location of the checkpoint record is not on the expected
5870          * timeline in the history of the requested timeline, we cannot proceed:
5871          * the backup is not part of the history of the requested timeline.
5872          */
5873         Assert(expectedTLEs);           /* was initialized by reading checkpoint
5874                                                                  * record */
5875         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
5876                 checkPoint.ThisTimeLineID)
5877         {
5878                 XLogRecPtr      switchpoint;
5879
5880                 /*
5881                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
5882                  * not in expectedTLEs at all.
5883                  */
5884                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
5885                 ereport(FATAL,
5886                                 (errmsg("requested timeline %u is not a child of this server's history",
5887                                                 recoveryTargetTLI),
5888                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
5889                                                    (uint32) (ControlFile->checkPoint >> 32),
5890                                                    (uint32) ControlFile->checkPoint,
5891                                                    ControlFile->checkPointCopy.ThisTimeLineID,
5892                                                    (uint32) (switchpoint >> 32),
5893                                                    (uint32) switchpoint)));
5894         }
5895
5896         /*
5897          * The min recovery point should be part of the requested timeline's
5898          * history, too.
5899          */
5900         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
5901           tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
5902                 ControlFile->minRecoveryPointTLI)
5903                 ereport(FATAL,
5904                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
5905                                                 recoveryTargetTLI,
5906                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
5907                                                 (uint32) ControlFile->minRecoveryPoint,
5908                                                 ControlFile->minRecoveryPointTLI)));
5909
5910         LastRec = RecPtr = checkPointLoc;
5911
5912         ereport(DEBUG1,
5913                         (errmsg("redo record is at %X/%X; shutdown %s",
5914                                   (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
5915                                         wasShutdown ? "TRUE" : "FALSE")));
5916         ereport(DEBUG1,
5917                         (errmsg("next transaction ID: %u/%u; next OID: %u",
5918                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
5919                                         checkPoint.nextOid)));
5920         ereport(DEBUG1,
5921                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
5922                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
5923         ereport(DEBUG1,
5924                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
5925                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
5926         ereport(DEBUG1,
5927                         (errmsg("oldest MultiXactId: %u, in database %u",
5928                                         checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
5929         ereport(DEBUG1,
5930                         (errmsg("commit timestamp Xid oldest/newest: %u/%u",
5931                                         checkPoint.oldestCommitTs,
5932                                         checkPoint.newestCommitTs)));
5933         if (!TransactionIdIsNormal(checkPoint.nextXid))
5934                 ereport(PANIC,
5935                                 (errmsg("invalid next transaction ID")));
5936
5937         /* initialize shared memory variables from the checkpoint record */
5938         ShmemVariableCache->nextXid = checkPoint.nextXid;
5939         ShmemVariableCache->nextOid = checkPoint.nextOid;
5940         ShmemVariableCache->oidCount = 0;
5941         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5942         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5943         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
5944         SetCommitTsLimit(checkPoint.oldestCommitTs,
5945                                          checkPoint.newestCommitTs);
5946         MultiXactSetSafeTruncate(checkPoint.oldestMulti);
5947         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
5948         XLogCtl->ckptXid = checkPoint.nextXid;
5949
5950         /*
5951          * Initialize replication slots, before there's a chance to remove
5952          * required resources.
5953          */
5954         StartupReplicationSlots();
5955
5956         /*
5957          * Startup logical state, needs to be setup now so we have proper data
5958          * during crash recovery.
5959          */
5960         StartupReorderBuffer();
5961
5962         /*
5963          * Startup MultiXact.  We need to do this early for two reasons: one is
5964          * that we might try to access multixacts when we do tuple freezing, and
5965          * the other is we need its state initialized because we attempt
5966          * truncation during restartpoints.
5967          */
5968         StartupMultiXact();
5969
5970         /*
5971          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
5972          * control file. On recovery, all unlogged relations are blown away, so
5973          * the unlogged LSN counter can be reset too.
5974          */
5975         if (ControlFile->state == DB_SHUTDOWNED)
5976                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
5977         else
5978                 XLogCtl->unloggedLSN = 1;
5979
5980         /*
5981          * We must replay WAL entries using the same TimeLineID they were created
5982          * under, so temporarily adopt the TLI indicated by the checkpoint (see
5983          * also xlog_redo()).
5984          */
5985         ThisTimeLineID = checkPoint.ThisTimeLineID;
5986
5987         /*
5988          * Copy any missing timeline history files between 'now' and the recovery
5989          * target timeline from archive to pg_xlog. While we don't need those
5990          * files ourselves - the history file of the recovery target timeline
5991          * covers all the previous timelines in the history too - a cascading
5992          * standby server might be interested in them. Or, if you archive the WAL
5993          * from this server to a different archive than the master, it'd be good
5994          * for all the history files to get archived there after failover, so that
5995          * you can use one of the old timelines as a PITR target. Timeline history
5996          * files are small, so it's better to copy them unnecessarily than not
5997          * copy them and regret later.
5998          */
5999         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6000
6001         lastFullPageWrites = checkPoint.fullPageWrites;
6002
6003         RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6004         doPageWrites = lastFullPageWrites;
6005
6006         if (RecPtr < checkPoint.redo)
6007                 ereport(PANIC,
6008                                 (errmsg("invalid redo in checkpoint record")));
6009
6010         /*
6011          * Check whether we need to force recovery from WAL.  If it appears to
6012          * have been a clean shutdown and we did not have a recovery.conf file,
6013          * then assume no recovery needed.
6014          */
6015         if (checkPoint.redo < RecPtr)
6016         {
6017                 if (wasShutdown)
6018                         ereport(PANIC,
6019                                         (errmsg("invalid redo record in shutdown checkpoint")));
6020                 InRecovery = true;
6021         }
6022         else if (ControlFile->state != DB_SHUTDOWNED)
6023                 InRecovery = true;
6024         else if (ArchiveRecoveryRequested)
6025         {
6026                 /* force recovery due to presence of recovery.conf */
6027                 InRecovery = true;
6028         }
6029
6030         /* REDO */
6031         if (InRecovery)
6032         {
6033                 int                     rmid;
6034
6035                 /*
6036                  * Update pg_control to show that we are recovering and to show the
6037                  * selected checkpoint as the place we are starting from. We also mark
6038                  * pg_control with any minimum recovery stop point obtained from a
6039                  * backup history file.
6040                  */
6041                 dbstate_at_startup = ControlFile->state;
6042                 if (InArchiveRecovery)
6043                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6044                 else
6045                 {
6046                         ereport(LOG,
6047                                         (errmsg("database system was not properly shut down; "
6048                                                         "automatic recovery in progress")));
6049                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6050                                 ereport(LOG,
6051                                                 (errmsg("crash recovery starts in timeline %u "
6052                                                                 "and has target timeline %u",
6053                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
6054                                                                 recoveryTargetTLI)));
6055                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6056                 }
6057                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6058                 ControlFile->checkPoint = checkPointLoc;
6059                 ControlFile->checkPointCopy = checkPoint;
6060                 if (InArchiveRecovery)
6061                 {
6062                         /* initialize minRecoveryPoint if not set yet */
6063                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
6064                         {
6065                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6066                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6067                         }
6068                 }
6069
6070                 /*
6071                  * Set backupStartPoint if we're starting recovery from a base backup.
6072                  *
6073                  * Set backupEndPoint and use minRecoveryPoint as the backup end
6074                  * location if we're starting recovery from a base backup which was
6075                  * taken from the standby. In this case, the database system status in
6076                  * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If not, which
6077                  * means that backup is corrupted, so we cancel recovery.
6078                  */
6079                 if (haveBackupLabel)
6080                 {
6081                         ControlFile->backupStartPoint = checkPoint.redo;
6082                         ControlFile->backupEndRequired = backupEndRequired;
6083
6084                         if (backupFromStandby)
6085                         {
6086                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
6087                                         ereport(FATAL,
6088                                                         (errmsg("backup_label contains data inconsistent with control file"),
6089                                                          errhint("This means that the backup is corrupted and you will "
6090                                                            "have to use another backup for recovery.")));
6091                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6092                         }
6093                 }
6094                 ControlFile->time = (pg_time_t) time(NULL);
6095                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6096                 UpdateControlFile();
6097
6098                 /* initialize our local copy of minRecoveryPoint */
6099                 minRecoveryPoint = ControlFile->minRecoveryPoint;
6100                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6101
6102                 /*
6103                  * Reset pgstat data, because it may be invalid after recovery.
6104                  */
6105                 pgstat_reset_all();
6106
6107                 /*
6108                  * If there was a backup label file, it's done its job and the info
6109                  * has now been propagated into pg_control.  We must get rid of the
6110                  * label file so that if we crash during recovery, we'll pick up at
6111                  * the latest recovery restartpoint instead of going all the way back
6112                  * to the backup start point.  It seems prudent though to just rename
6113                  * the file out of the way rather than delete it completely.
6114                  */
6115                 if (haveBackupLabel)
6116                 {
6117                         unlink(BACKUP_LABEL_OLD);
6118                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
6119                                 ereport(FATAL,
6120                                                 (errcode_for_file_access(),
6121                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
6122                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
6123                 }
6124
6125                 /* Check that the GUCs used to generate the WAL allow recovery */
6126                 CheckRequiredParameterValues();
6127
6128                 /*
6129                  * We're in recovery, so unlogged relations may be trashed and must be
6130                  * reset.  This should be done BEFORE allowing Hot Standby
6131                  * connections, so that read-only backends don't try to read whatever
6132                  * garbage is left over from before.
6133                  */
6134                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6135
6136                 /*
6137                  * Likewise, delete any saved transaction snapshot files that got left
6138                  * behind by crashed backends.
6139                  */
6140                 DeleteAllExportedSnapshotFiles();
6141
6142                 /*
6143                  * Initialize for Hot Standby, if enabled. We won't let backends in
6144                  * yet, not until we've reached the min recovery point specified in
6145                  * control file and we've established a recovery snapshot from a
6146                  * running-xacts WAL record.
6147                  */
6148                 if (ArchiveRecoveryRequested && EnableHotStandby)
6149                 {
6150                         TransactionId *xids;
6151                         int                     nxids;
6152
6153                         ereport(DEBUG1,
6154                                         (errmsg("initializing for hot standby")));
6155
6156                         InitRecoveryTransactionEnvironment();
6157
6158                         if (wasShutdown)
6159                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6160                         else
6161                                 oldestActiveXID = checkPoint.oldestActiveXid;
6162                         Assert(TransactionIdIsValid(oldestActiveXID));
6163
6164                         /* Tell procarray about the range of xids it has to deal with */
6165                         ProcArrayInitRecovery(ShmemVariableCache->nextXid);
6166
6167                         /*
6168                          * Startup commit log, commit timestamp and subtrans only.
6169                          * MultiXact has already been started up and other SLRUs are not
6170                          * maintained during recovery and need not be started yet.
6171                          */
6172                         StartupCLOG();
6173                         StartupCommitTs();
6174                         StartupSUBTRANS(oldestActiveXID);
6175
6176                         /*
6177                          * If we're beginning at a shutdown checkpoint, we know that
6178                          * nothing was running on the master at this point. So fake-up an
6179                          * empty running-xacts record and use that here and now. Recover
6180                          * additional standby state for prepared transactions.
6181                          */
6182                         if (wasShutdown)
6183                         {
6184                                 RunningTransactionsData running;
6185                                 TransactionId latestCompletedXid;
6186
6187                                 /*
6188                                  * Construct a RunningTransactions snapshot representing a
6189                                  * shut down server, with only prepared transactions still
6190                                  * alive. We're never overflowed at this point because all
6191                                  * subxids are listed with their parent prepared transactions.
6192                                  */
6193                                 running.xcnt = nxids;
6194                                 running.subxcnt = 0;
6195                                 running.subxid_overflow = false;
6196                                 running.nextXid = checkPoint.nextXid;
6197                                 running.oldestRunningXid = oldestActiveXID;
6198                                 latestCompletedXid = checkPoint.nextXid;
6199                                 TransactionIdRetreat(latestCompletedXid);
6200                                 Assert(TransactionIdIsNormal(latestCompletedXid));
6201                                 running.latestCompletedXid = latestCompletedXid;
6202                                 running.xids = xids;
6203
6204                                 ProcArrayApplyRecoveryInfo(&running);
6205
6206                                 StandbyRecoverPreparedTransactions(false);
6207                         }
6208                 }
6209
6210                 /* Initialize resource managers */
6211                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6212                 {
6213                         if (RmgrTable[rmid].rm_startup != NULL)
6214                                 RmgrTable[rmid].rm_startup();
6215                 }
6216
6217                 /*
6218                  * Initialize shared variables for tracking progress of WAL replay, as
6219                  * if we had just replayed the record before the REDO location (or the
6220                  * checkpoint record itself, if it's a shutdown checkpoint).
6221                  */
6222                 SpinLockAcquire(&XLogCtl->info_lck);
6223                 if (checkPoint.redo < RecPtr)
6224                         XLogCtl->replayEndRecPtr = checkPoint.redo;
6225                 else
6226                         XLogCtl->replayEndRecPtr = EndRecPtr;
6227                 XLogCtl->replayEndTLI = ThisTimeLineID;
6228                 XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
6229                 XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
6230                 XLogCtl->recoveryLastXTime = 0;
6231                 XLogCtl->currentChunkStartTime = 0;
6232                 XLogCtl->recoveryPause = false;
6233                 SpinLockRelease(&XLogCtl->info_lck);
6234
6235                 /* Also ensure XLogReceiptTime has a sane value */
6236                 XLogReceiptTime = GetCurrentTimestamp();
6237
6238                 /*
6239                  * Let postmaster know we've started redo now, so that it can launch
6240                  * checkpointer to perform restartpoints.  We don't bother during
6241                  * crash recovery as restartpoints can only be performed during
6242                  * archive recovery.  And we'd like to keep crash recovery simple, to
6243                  * avoid introducing bugs that could affect you when recovering after
6244                  * crash.
6245                  *
6246                  * After this point, we can no longer assume that we're the only
6247                  * process in addition to postmaster!  Also, fsync requests are
6248                  * subsequently to be handled by the checkpointer, not locally.
6249                  */
6250                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
6251                 {
6252                         PublishStartupProcessInformation();
6253                         SetForwardFsyncRequests();
6254                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6255                         bgwriterLaunched = true;
6256                 }
6257
6258                 /*
6259                  * Allow read-only connections immediately if we're consistent
6260                  * already.
6261                  */
6262                 CheckRecoveryConsistency();
6263
6264                 /*
6265                  * Find the first record that logically follows the checkpoint --- it
6266                  * might physically precede it, though.
6267                  */
6268                 if (checkPoint.redo < RecPtr)
6269                 {
6270                         /* back up to find the record */
6271                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
6272                 }
6273                 else
6274                 {
6275                         /* just have to read next record after CheckPoint */
6276                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6277                 }
6278
6279                 if (record != NULL)
6280                 {
6281                         ErrorContextCallback errcallback;
6282                         TimestampTz xtime;
6283
6284                         InRedo = true;
6285
6286                         ereport(LOG,
6287                                         (errmsg("redo starts at %X/%X",
6288                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6289
6290                         /*
6291                          * main redo apply loop
6292                          */
6293                         do
6294                         {
6295                                 bool            switchedTLI = false;
6296
6297 #ifdef WAL_DEBUG
6298                                 if (XLOG_DEBUG ||
6299                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6300                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6301                                 {
6302                                         StringInfoData buf;
6303
6304                                         initStringInfo(&buf);
6305                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6306                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
6307                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
6308                                         xlog_outrec(&buf, xlogreader);
6309                                         appendStringInfoString(&buf, " - ");
6310                                         xlog_outdesc(&buf, xlogreader);
6311                                         elog(LOG, "%s", buf.data);
6312                                         pfree(buf.data);
6313                                 }
6314 #endif
6315
6316                                 /* Handle interrupt signals of startup process */
6317                                 HandleStartupProcInterrupts();
6318
6319                                 /*
6320                                  * Pause WAL replay, if requested by a hot-standby session via
6321                                  * SetRecoveryPause().
6322                                  *
6323                                  * Note that we intentionally don't take the info_lck spinlock
6324                                  * here.  We might therefore read a slightly stale value of
6325                                  * the recoveryPause flag, but it can't be very stale (no
6326                                  * worse than the last spinlock we did acquire).  Since a
6327                                  * pause request is a pretty asynchronous thing anyway,
6328                                  * possibly responding to it one WAL record later than we
6329                                  * otherwise would is a minor issue, so it doesn't seem worth
6330                                  * adding another spinlock cycle to prevent that.
6331                                  */
6332                                 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
6333                                         recoveryPausesHere();
6334
6335                                 /*
6336                                  * Have we reached our recovery target?
6337                                  */
6338                                 if (recoveryStopsBefore(xlogreader))
6339                                 {
6340                                         reachedStopPoint = true;        /* see below */
6341                                         break;
6342                                 }
6343
6344                                 /*
6345                                  * If we've been asked to lag the master, wait on latch until
6346                                  * enough time has passed.
6347                                  */
6348                                 if (recoveryApplyDelay(xlogreader))
6349                                 {
6350                                         /*
6351                                          * We test for paused recovery again here. If user sets
6352                                          * delayed apply, it may be because they expect to pause
6353                                          * recovery in case of problems, so we must test again
6354                                          * here otherwise pausing during the delay-wait wouldn't
6355                                          * work.
6356                                          */
6357                                         if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
6358                                                 recoveryPausesHere();
6359                                 }
6360
6361                                 /* Setup error traceback support for ereport() */
6362                                 errcallback.callback = rm_redo_error_callback;
6363                                 errcallback.arg = (void *) xlogreader;
6364                                 errcallback.previous = error_context_stack;
6365                                 error_context_stack = &errcallback;
6366
6367                                 /*
6368                                  * ShmemVariableCache->nextXid must be beyond record's xid.
6369                                  *
6370                                  * We don't expect anyone else to modify nextXid, hence we
6371                                  * don't need to hold a lock while examining it.  We still
6372                                  * acquire the lock to modify it, though.
6373                                  */
6374                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
6375                                                                                                  ShmemVariableCache->nextXid))
6376                                 {
6377                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
6378                                         ShmemVariableCache->nextXid = record->xl_xid;
6379                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
6380                                         LWLockRelease(XidGenLock);
6381                                 }
6382
6383                                 /*
6384                                  * Before replaying this record, check if this record causes
6385                                  * the current timeline to change. The record is already
6386                                  * considered to be part of the new timeline, so we update
6387                                  * ThisTimeLineID before replaying it. That's important so
6388                                  * that replayEndTLI, which is recorded as the minimum
6389                                  * recovery point's TLI if recovery stops after this record,
6390                                  * is set correctly.
6391                                  */
6392                                 if (record->xl_rmid == RM_XLOG_ID)
6393                                 {
6394                                         TimeLineID      newTLI = ThisTimeLineID;
6395                                         TimeLineID      prevTLI = ThisTimeLineID;
6396                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
6397
6398                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
6399                                         {
6400                                                 CheckPoint      checkPoint;
6401
6402                                                 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6403                                                 newTLI = checkPoint.ThisTimeLineID;
6404                                                 prevTLI = checkPoint.PrevTimeLineID;
6405                                         }
6406                                         else if (info == XLOG_END_OF_RECOVERY)
6407                                         {
6408                                                 xl_end_of_recovery xlrec;
6409
6410                                                 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
6411                                                 newTLI = xlrec.ThisTimeLineID;
6412                                                 prevTLI = xlrec.PrevTimeLineID;
6413                                         }
6414
6415                                         if (newTLI != ThisTimeLineID)
6416                                         {
6417                                                 /* Check that it's OK to switch to this TLI */
6418                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
6419
6420                                                 /* Following WAL records should be run with new TLI */
6421                                                 ThisTimeLineID = newTLI;
6422                                                 switchedTLI = true;
6423                                         }
6424                                 }
6425
6426                                 /*
6427                                  * Update shared replayEndRecPtr before replaying this record,
6428                                  * so that XLogFlush will update minRecoveryPoint correctly.
6429                                  */
6430                                 SpinLockAcquire(&XLogCtl->info_lck);
6431                                 XLogCtl->replayEndRecPtr = EndRecPtr;
6432                                 XLogCtl->replayEndTLI = ThisTimeLineID;
6433                                 SpinLockRelease(&XLogCtl->info_lck);
6434
6435                                 /*
6436                                  * If we are attempting to enter Hot Standby mode, process
6437                                  * XIDs we see
6438                                  */
6439                                 if (standbyState >= STANDBY_INITIALIZED &&
6440                                         TransactionIdIsValid(record->xl_xid))
6441                                         RecordKnownAssignedTransactionIds(record->xl_xid);
6442
6443                                 /* Now apply the WAL record itself */
6444                                 RmgrTable[record->xl_rmid].rm_redo(xlogreader);
6445
6446                                 /* Pop the error context stack */
6447                                 error_context_stack = errcallback.previous;
6448
6449                                 /*
6450                                  * Update lastReplayedEndRecPtr after this record has been
6451                                  * successfully replayed.
6452                                  */
6453                                 SpinLockAcquire(&XLogCtl->info_lck);
6454                                 XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
6455                                 XLogCtl->lastReplayedTLI = ThisTimeLineID;
6456                                 SpinLockRelease(&XLogCtl->info_lck);
6457
6458                                 /* Remember this record as the last-applied one */
6459                                 LastRec = ReadRecPtr;
6460
6461                                 /* Allow read-only connections if we're consistent now */
6462                                 CheckRecoveryConsistency();
6463
6464                                 /*
6465                                  * If this record was a timeline switch, wake up any
6466                                  * walsenders to notice that we are on a new timeline.
6467                                  */
6468                                 if (switchedTLI && AllowCascadeReplication())
6469                                         WalSndWakeup();
6470
6471                                 /* Exit loop if we reached inclusive recovery target */
6472                                 if (recoveryStopsAfter(xlogreader))
6473                                 {
6474                                         reachedStopPoint = true;
6475                                         break;
6476                                 }
6477
6478                                 /* Else, try to fetch the next WAL record */
6479                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6480                         } while (record != NULL);
6481
6482                         /*
6483                          * end of main redo apply loop
6484                          */
6485
6486                         if (reachedStopPoint)
6487                         {
6488                                 if (!reachedConsistency)
6489                                         ereport(FATAL,
6490                                                 (errmsg("requested recovery stop point is before consistent recovery point")));
6491
6492                                 /*
6493                                  * This is the last point where we can restart recovery with a
6494                                  * new recovery target, if we shutdown and begin again. After
6495                                  * this, Resource Managers may choose to do permanent corrective
6496                                  * actions at end of recovery.
6497                                  */
6498                                 switch (recoveryTargetAction)
6499                                 {
6500                                         case RECOVERY_TARGET_ACTION_SHUTDOWN:
6501                                                         /*
6502                                                          * exit with special return code to request shutdown
6503                                                          * of postmaster.  Log messages issued from
6504                                                          * postmaster.
6505                                                          */
6506                                                         proc_exit(3);
6507
6508                                         case RECOVERY_TARGET_ACTION_PAUSE:
6509                                                         SetRecoveryPause(true);
6510                                                         recoveryPausesHere();
6511
6512                                                         /* drop into promote */
6513
6514                                         case RECOVERY_TARGET_ACTION_PROMOTE:
6515                                                         break;
6516                                 }
6517                         }
6518
6519                         /* Allow resource managers to do any required cleanup. */
6520                         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6521                         {
6522                                 if (RmgrTable[rmid].rm_cleanup != NULL)
6523                                         RmgrTable[rmid].rm_cleanup();
6524                         }
6525
6526                         ereport(LOG,
6527                                         (errmsg("redo done at %X/%X",
6528                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6529                         xtime = GetLatestXTime();
6530                         if (xtime)
6531                                 ereport(LOG,
6532                                          (errmsg("last completed transaction was at log time %s",
6533                                                          timestamptz_to_str(xtime))));
6534
6535                         InRedo = false;
6536                 }
6537                 else
6538                 {
6539                         /* there are no WAL records following the checkpoint */
6540                         ereport(LOG,
6541                                         (errmsg("redo is not required")));
6542                 }
6543         }
6544
6545         /*
6546          * Kill WAL receiver, if it's still running, before we continue to write
6547          * the startup checkpoint record. It will trump over the checkpoint and
6548          * subsequent records if it's still alive when we start writing WAL.
6549          */
6550         ShutdownWalRcv();
6551
6552         /*
6553          * Reset unlogged relations to the contents of their INIT fork. This is
6554          * done AFTER recovery is complete so as to include any unlogged relations
6555          * created during recovery, but BEFORE recovery is marked as having
6556          * completed successfully. Otherwise we'd not retry if any of the post
6557          * end-of-recovery steps fail.
6558          */
6559         if (InRecovery)
6560                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
6561
6562         /*
6563          * We don't need the latch anymore. It's not strictly necessary to disown
6564          * it, but let's do it for the sake of tidiness.
6565          */
6566         if (StandbyModeRequested)
6567                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
6568
6569         /*
6570          * We are now done reading the xlog from stream. Turn off streaming
6571          * recovery to force fetching the files (which would be required at end of
6572          * recovery, e.g., timeline history file) from archive or pg_xlog.
6573          */
6574         StandbyMode = false;
6575
6576         /*
6577          * Re-fetch the last valid or last applied record, so we can identify the
6578          * exact endpoint of what we consider the valid portion of WAL.
6579          */
6580         record = ReadRecord(xlogreader, LastRec, PANIC, false);
6581         EndOfLog = EndRecPtr;
6582         XLByteToPrevSeg(EndOfLog, endLogSegNo);
6583
6584         /*
6585          * Complain if we did not roll forward far enough to render the backup
6586          * dump consistent.  Note: it is indeed okay to look at the local variable
6587          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
6588          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
6589          * advanced beyond the WAL we processed.
6590          */
6591         if (InRecovery &&
6592                 (EndOfLog < minRecoveryPoint ||
6593                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
6594         {
6595                 /*
6596                  * Ran off end of WAL before reaching end-of-backup WAL record, or
6597                  * minRecoveryPoint. That's usually a bad sign, indicating that you
6598                  * tried to recover from an online backup but never called
6599                  * pg_stop_backup(), or you didn't archive all the WAL up to that
6600                  * point. However, this also happens in crash recovery, if the system
6601                  * crashes while an online backup is in progress. We must not treat
6602                  * that as an error, or the database will refuse to start up.
6603                  */
6604                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
6605                 {
6606                         if (ControlFile->backupEndRequired)
6607                                 ereport(FATAL,
6608                                                 (errmsg("WAL ends before end of online backup"),
6609                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
6610                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
6611                                 ereport(FATAL,
6612                                                 (errmsg("WAL ends before end of online backup"),
6613                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
6614                         else
6615                                 ereport(FATAL,
6616                                           (errmsg("WAL ends before consistent recovery point")));
6617                 }
6618         }
6619
6620         /*
6621          * Consider whether we need to assign a new timeline ID.
6622          *
6623          * If we are doing an archive recovery, we always assign a new ID.  This
6624          * handles a couple of issues.  If we stopped short of the end of WAL
6625          * during recovery, then we are clearly generating a new timeline and must
6626          * assign it a unique new ID.  Even if we ran to the end, modifying the
6627          * current last segment is problematic because it may result in trying to
6628          * overwrite an already-archived copy of that segment, and we encourage
6629          * DBAs to make their archive_commands reject that.  We can dodge the
6630          * problem by making the new active segment have a new timeline ID.
6631          *
6632          * In a normal crash recovery, we can just extend the timeline we were in.
6633          */
6634         PrevTimeLineID = ThisTimeLineID;
6635         if (ArchiveRecoveryRequested)
6636         {
6637                 char            reason[200];
6638
6639                 Assert(InArchiveRecovery);
6640
6641                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
6642                 ereport(LOG,
6643                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
6644
6645                 /*
6646                  * Create a comment for the history file to explain why and where
6647                  * timeline changed.
6648                  */
6649                 if (recoveryTarget == RECOVERY_TARGET_XID)
6650                         snprintf(reason, sizeof(reason),
6651                                          "%s transaction %u",
6652                                          recoveryStopAfter ? "after" : "before",
6653                                          recoveryStopXid);
6654                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6655                         snprintf(reason, sizeof(reason),
6656                                          "%s %s\n",
6657                                          recoveryStopAfter ? "after" : "before",
6658                                          timestamptz_to_str(recoveryStopTime));
6659                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6660                         snprintf(reason, sizeof(reason),
6661                                          "at restore point \"%s\"",
6662                                          recoveryStopName);
6663                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6664                         snprintf(reason, sizeof(reason), "reached consistency");
6665                 else
6666                         snprintf(reason, sizeof(reason), "no recovery target specified");
6667
6668                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
6669                                                          EndRecPtr, reason);
6670         }
6671
6672         /* Save the selected TimeLineID in shared memory, too */
6673         XLogCtl->ThisTimeLineID = ThisTimeLineID;
6674         XLogCtl->PrevTimeLineID = PrevTimeLineID;
6675
6676         /*
6677          * We are now done reading the old WAL.  Turn off archive fetching if it
6678          * was active, and make a writable copy of the last WAL segment. (Note
6679          * that we also have a copy of the last block of the old WAL in readBuf;
6680          * we will use that below.)
6681          */
6682         if (ArchiveRecoveryRequested)
6683                 exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
6684
6685         /*
6686          * Prepare to write WAL starting at EndOfLog position, and init xlog
6687          * buffer cache using the block containing the last record from the
6688          * previous incarnation.
6689          */
6690         openLogSegNo = endLogSegNo;
6691         openLogFile = XLogFileOpen(openLogSegNo);
6692         openLogOff = 0;
6693         Insert = &XLogCtl->Insert;
6694         Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
6695         Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
6696
6697         /*
6698          * Tricky point here: readBuf contains the *last* block that the LastRec
6699          * record spans, not the one it starts in.  The last block is indeed the
6700          * one we want to use.
6701          */
6702         if (EndOfLog % XLOG_BLCKSZ != 0)
6703         {
6704                 char       *page;
6705                 int                     len;
6706                 int                     firstIdx;
6707                 XLogRecPtr      pageBeginPtr;
6708
6709                 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
6710                 Assert(readOff == pageBeginPtr % XLogSegSize);
6711
6712                 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
6713
6714                 /* Copy the valid part of the last block, and zero the rest */
6715                 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
6716                 len = EndOfLog % XLOG_BLCKSZ;
6717                 memcpy(page, xlogreader->readBuf, len);
6718                 memset(page + len, 0, XLOG_BLCKSZ - len);
6719
6720                 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
6721                 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
6722         }
6723         else
6724         {
6725                 /*
6726                  * There is no partial block to copy. Just set InitializedUpTo, and
6727                  * let the first attempt to insert a log record to initialize the next
6728                  * buffer.
6729                  */
6730                 XLogCtl->InitializedUpTo = EndOfLog;
6731         }
6732
6733         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
6734
6735         XLogCtl->LogwrtResult = LogwrtResult;
6736
6737         XLogCtl->LogwrtRqst.Write = EndOfLog;
6738         XLogCtl->LogwrtRqst.Flush = EndOfLog;
6739
6740         /* Pre-scan prepared transactions to find out the range of XIDs present */
6741         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
6742
6743         /*
6744          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
6745          * record before resource manager writes cleanup WAL records or checkpoint
6746          * record is written.
6747          */
6748         Insert->fullPageWrites = lastFullPageWrites;
6749         LocalSetXLogInsertAllowed();
6750         UpdateFullPageWrites();
6751         LocalXLogInsertAllowed = -1;
6752
6753         if (InRecovery)
6754         {
6755                 /*
6756                  * Perform a checkpoint to update all our recovery activity to disk.
6757                  *
6758                  * Note that we write a shutdown checkpoint rather than an on-line
6759                  * one. This is not particularly critical, but since we may be
6760                  * assigning a new TLI, using a shutdown checkpoint allows us to have
6761                  * the rule that TLI only changes in shutdown checkpoints, which
6762                  * allows some extra error checking in xlog_redo.
6763                  *
6764                  * In fast promotion, only create a lightweight end-of-recovery record
6765                  * instead of a full checkpoint. A checkpoint is requested later,
6766                  * after we're fully out of recovery mode and already accepting
6767                  * queries.
6768                  */
6769                 if (bgwriterLaunched)
6770                 {
6771                         if (fast_promote)
6772                         {
6773                                 checkPointLoc = ControlFile->prevCheckPoint;
6774
6775                                 /*
6776                                  * Confirm the last checkpoint is available for us to recover
6777                                  * from if we fail. Note that we don't check for the secondary
6778                                  * checkpoint since that isn't available in most base backups.
6779                                  */
6780                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
6781                                 if (record != NULL)
6782                                 {
6783                                         fast_promoted = true;
6784
6785                                         /*
6786                                          * Insert a special WAL record to mark the end of
6787                                          * recovery, since we aren't doing a checkpoint. That
6788                                          * means that the checkpointer process may likely be in
6789                                          * the middle of a time-smoothed restartpoint and could
6790                                          * continue to be for minutes after this. That sounds
6791                                          * strange, but the effect is roughly the same and it
6792                                          * would be stranger to try to come out of the
6793                                          * restartpoint and then checkpoint. We request a
6794                                          * checkpoint later anyway, just for safety.
6795                                          */
6796                                         CreateEndOfRecoveryRecord();
6797                                 }
6798                         }
6799
6800                         if (!fast_promoted)
6801                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
6802                                                                   CHECKPOINT_IMMEDIATE |
6803                                                                   CHECKPOINT_WAIT);
6804                 }
6805                 else
6806                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
6807
6808                 /*
6809                  * And finally, execute the recovery_end_command, if any.
6810                  */
6811                 if (recoveryEndCommand)
6812                         ExecuteRecoveryCommand(recoveryEndCommand,
6813                                                                    "recovery_end_command",
6814                                                                    true);
6815         }
6816
6817         /*
6818          * Preallocate additional log files, if wanted.
6819          */
6820         PreallocXlogFiles(EndOfLog);
6821
6822         /*
6823          * Okay, we're officially UP.
6824          */
6825         InRecovery = false;
6826
6827         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6828         ControlFile->state = DB_IN_PRODUCTION;
6829         ControlFile->time = (pg_time_t) time(NULL);
6830         UpdateControlFile();
6831         LWLockRelease(ControlFileLock);
6832
6833         /* start the archive_timeout timer running */
6834         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
6835
6836         /* also initialize latestCompletedXid, to nextXid - 1 */
6837         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
6838         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
6839         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
6840         LWLockRelease(ProcArrayLock);
6841
6842         /*
6843          * Start up the commit log, commit timestamp and subtrans, if not already
6844          * done for hot standby.
6845          */
6846         if (standbyState == STANDBY_DISABLED)
6847         {
6848                 StartupCLOG();
6849                 StartupCommitTs();
6850                 StartupSUBTRANS(oldestActiveXID);
6851         }
6852
6853         /*
6854          * Perform end of recovery actions for any SLRUs that need it.
6855          */
6856         TrimCLOG();
6857         TrimMultiXact();
6858
6859         /* Reload shared-memory state for prepared transactions */
6860         RecoverPreparedTransactions();
6861
6862         /*
6863          * Shutdown the recovery environment. This must occur after
6864          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
6865          */
6866         if (standbyState != STANDBY_DISABLED)
6867                 ShutdownRecoveryTransactionEnvironment();
6868
6869         /* Shut down xlogreader */
6870         if (readFile >= 0)
6871         {
6872                 close(readFile);
6873                 readFile = -1;
6874         }
6875         XLogReaderFree(xlogreader);
6876
6877         /*
6878          * If any of the critical GUCs have changed, log them before we allow
6879          * backends to write WAL.
6880          */
6881         LocalSetXLogInsertAllowed();
6882         XLogReportParameters();
6883
6884         /*
6885          * Local WAL inserts enabled, so it's time to finish initialization
6886          * of commit timestamp.
6887          */
6888         CompleteCommitTsInitialization();
6889
6890         /*
6891          * All done.  Allow backends to write WAL.  (Although the bool flag is
6892          * probably atomic in itself, we use the info_lck here to ensure that
6893          * there are no race conditions concerning visibility of other recent
6894          * updates to shared memory.)
6895          */
6896         SpinLockAcquire(&XLogCtl->info_lck);
6897         XLogCtl->SharedRecoveryInProgress = false;
6898         SpinLockRelease(&XLogCtl->info_lck);
6899
6900         /*
6901          * If there were cascading standby servers connected to us, nudge any wal
6902          * sender processes to notice that we've been promoted.
6903          */
6904         WalSndWakeup();
6905
6906         /*
6907          * If this was a fast promotion, request an (online) checkpoint now. This
6908          * isn't required for consistency, but the last restartpoint might be far
6909          * back, and in case of a crash, recovering from it might take longer
6910          * than is appropriate now that we're not in standby mode anymore.
6911          */
6912         if (fast_promoted)
6913                 RequestCheckpoint(CHECKPOINT_FORCE);
6914 }
6915
6916 /*
6917  * Checks if recovery has reached a consistent state. When consistency is
6918  * reached and we have a valid starting standby snapshot, tell postmaster
6919  * that it can start accepting read-only connections.
6920  */
6921 static void
6922 CheckRecoveryConsistency(void)
6923 {
6924         XLogRecPtr      lastReplayedEndRecPtr;
6925
6926         /*
6927          * During crash recovery, we don't reach a consistent state until we've
6928          * replayed all the WAL.
6929          */
6930         if (XLogRecPtrIsInvalid(minRecoveryPoint))
6931                 return;
6932
6933         /*
6934          * assume that we are called in the startup process, and hence don't need
6935          * a lock to read lastReplayedEndRecPtr
6936          */
6937         lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
6938
6939         /*
6940          * Have we reached the point where our base backup was completed?
6941          */
6942         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
6943                 ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
6944         {
6945                 /*
6946                  * We have reached the end of base backup, as indicated by pg_control.
6947                  * The data on disk is now consistent. Reset backupStartPoint and
6948                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
6949                  * allow starting up at an earlier point even if recovery is stopped
6950                  * and restarted soon after this.
6951                  */
6952                 elog(DEBUG1, "end of backup reached");
6953
6954                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6955
6956                 if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
6957                         ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
6958
6959                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
6960                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
6961                 ControlFile->backupEndRequired = false;
6962                 UpdateControlFile();
6963
6964                 LWLockRelease(ControlFileLock);
6965         }
6966
6967         /*
6968          * Have we passed our safe starting point? Note that minRecoveryPoint is
6969          * known to be incorrectly set if ControlFile->backupEndRequired, until
6970          * the XLOG_BACKUP_RECORD arrives to advise us of the correct
6971          * minRecoveryPoint. All we know prior to that is that we're not
6972          * consistent yet.
6973          */
6974         if (!reachedConsistency && !ControlFile->backupEndRequired &&
6975                 minRecoveryPoint <= lastReplayedEndRecPtr &&
6976                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
6977         {
6978                 /*
6979                  * Check to see if the XLOG sequence contained any unresolved
6980                  * references to uninitialized pages.
6981                  */
6982                 XLogCheckInvalidPages();
6983
6984                 reachedConsistency = true;
6985                 ereport(LOG,
6986                                 (errmsg("consistent recovery state reached at %X/%X",
6987                                                 (uint32) (lastReplayedEndRecPtr >> 32),
6988                                                 (uint32) lastReplayedEndRecPtr)));
6989         }
6990
6991         /*
6992          * Have we got a valid starting snapshot that will allow queries to be
6993          * run? If so, we can tell postmaster that the database is consistent now,
6994          * enabling connections.
6995          */
6996         if (standbyState == STANDBY_SNAPSHOT_READY &&
6997                 !LocalHotStandbyActive &&
6998                 reachedConsistency &&
6999                 IsUnderPostmaster)
7000         {
7001                 SpinLockAcquire(&XLogCtl->info_lck);
7002                 XLogCtl->SharedHotStandbyActive = true;
7003                 SpinLockRelease(&XLogCtl->info_lck);
7004
7005                 LocalHotStandbyActive = true;
7006
7007                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7008         }
7009 }
7010
7011 /*
7012  * Is the system still in recovery?
7013  *
7014  * Unlike testing InRecovery, this works in any process that's connected to
7015  * shared memory.
7016  *
7017  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7018  * variables the first time we see that recovery is finished.
7019  */
7020 bool
7021 RecoveryInProgress(void)
7022 {
7023         /*
7024          * We check shared state each time only until we leave recovery mode. We
7025          * can't re-enter recovery, so there's no need to keep checking after the
7026          * shared variable has once been seen false.
7027          */
7028         if (!LocalRecoveryInProgress)
7029                 return false;
7030         else
7031         {
7032                 /*
7033                  * use volatile pointer to make sure we make a fresh read of the
7034                  * shared variable.
7035                  */
7036                 volatile XLogCtlData *xlogctl = XLogCtl;
7037
7038                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7039
7040                 /*
7041                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
7042                  * is finished. InitPostgres() relies upon this behaviour to ensure
7043                  * that InitXLOGAccess() is called at backend startup.  (If you change
7044                  * this, see also LocalSetXLogInsertAllowed.)
7045                  */
7046                 if (!LocalRecoveryInProgress)
7047                 {
7048                         /*
7049                          * If we just exited recovery, make sure we read TimeLineID and
7050                          * RedoRecPtr after SharedRecoveryInProgress (for machines with
7051                          * weak memory ordering).
7052                          */
7053                         pg_memory_barrier();
7054                         InitXLOGAccess();
7055                 }
7056
7057                 /*
7058                  * Note: We don't need a memory barrier when we're still in recovery.
7059                  * We might exit recovery immediately after return, so the caller
7060                  * can't rely on 'true' meaning that we're still in recovery anyway.
7061                  */
7062
7063                 return LocalRecoveryInProgress;
7064         }
7065 }
7066
7067 /*
7068  * Is HotStandby active yet? This is only important in special backends
7069  * since normal backends won't ever be able to connect until this returns
7070  * true. Postmaster knows this by way of signal, not via shared memory.
7071  *
7072  * Unlike testing standbyState, this works in any process that's connected to
7073  * shared memory.  (And note that standbyState alone doesn't tell the truth
7074  * anyway.)
7075  */
7076 bool
7077 HotStandbyActive(void)
7078 {
7079         /*
7080          * We check shared state each time only until Hot Standby is active. We
7081          * can't de-activate Hot Standby, so there's no need to keep checking
7082          * after the shared variable has once been seen true.
7083          */
7084         if (LocalHotStandbyActive)
7085                 return true;
7086         else
7087         {
7088                 /* spinlock is essential on machines with weak memory ordering! */
7089                 SpinLockAcquire(&XLogCtl->info_lck);
7090                 LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
7091                 SpinLockRelease(&XLogCtl->info_lck);
7092
7093                 return LocalHotStandbyActive;
7094         }
7095 }
7096
7097 /*
7098  * Like HotStandbyActive(), but to be used only in WAL replay code,
7099  * where we don't need to ask any other process what the state is.
7100  */
7101 bool
7102 HotStandbyActiveInReplay(void)
7103 {
7104         Assert(AmStartupProcess());
7105         return LocalHotStandbyActive;
7106 }
7107
7108 /*
7109  * Is this process allowed to insert new WAL records?
7110  *
7111  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7112  * But we also have provisions for forcing the result "true" or "false"
7113  * within specific processes regardless of the global state.
7114  */
7115 bool
7116 XLogInsertAllowed(void)
7117 {
7118         /*
7119          * If value is "unconditionally true" or "unconditionally false", just
7120          * return it.  This provides the normal fast path once recovery is known
7121          * done.
7122          */
7123         if (LocalXLogInsertAllowed >= 0)
7124                 return (bool) LocalXLogInsertAllowed;
7125
7126         /*
7127          * Else, must check to see if we're still in recovery.
7128          */
7129         if (RecoveryInProgress())
7130                 return false;
7131
7132         /*
7133          * On exit from recovery, reset to "unconditionally true", since there is
7134          * no need to keep checking.
7135          */
7136         LocalXLogInsertAllowed = 1;
7137         return true;
7138 }
7139
7140 /*
7141  * Make XLogInsertAllowed() return true in the current process only.
7142  *
7143  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
7144  * and even call LocalSetXLogInsertAllowed() again after that.
7145  */
7146 static void
7147 LocalSetXLogInsertAllowed(void)
7148 {
7149         Assert(LocalXLogInsertAllowed == -1);
7150         LocalXLogInsertAllowed = 1;
7151
7152         /* Initialize as RecoveryInProgress() would do when switching state */
7153         InitXLOGAccess();
7154 }
7155
7156 /*
7157  * Subroutine to try to fetch and validate a prior checkpoint record.
7158  *
7159  * whichChkpt identifies the checkpoint (merely for reporting purposes).
7160  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7161  */
7162 static XLogRecord *
7163 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
7164                                          int whichChkpt, bool report)
7165 {
7166         XLogRecord *record;
7167
7168         if (!XRecOffIsValid(RecPtr))
7169         {
7170                 if (!report)
7171                         return NULL;
7172
7173                 switch (whichChkpt)
7174                 {
7175                         case 1:
7176                                 ereport(LOG,
7177                                 (errmsg("invalid primary checkpoint link in control file")));
7178                                 break;
7179                         case 2:
7180                                 ereport(LOG,
7181                                                 (errmsg("invalid secondary checkpoint link in control file")));
7182                                 break;
7183                         default:
7184                                 ereport(LOG,
7185                                    (errmsg("invalid checkpoint link in backup_label file")));
7186                                 break;
7187                 }
7188                 return NULL;
7189         }
7190
7191         record = ReadRecord(xlogreader, RecPtr, LOG, true);
7192
7193         if (record == NULL)
7194         {
7195                 if (!report)
7196                         return NULL;
7197
7198                 switch (whichChkpt)
7199                 {
7200                         case 1:
7201                                 ereport(LOG,
7202                                                 (errmsg("invalid primary checkpoint record")));
7203                                 break;
7204                         case 2:
7205                                 ereport(LOG,
7206                                                 (errmsg("invalid secondary checkpoint record")));
7207                                 break;
7208                         default:
7209                                 ereport(LOG,
7210                                                 (errmsg("invalid checkpoint record")));
7211                                 break;
7212                 }
7213                 return NULL;
7214         }
7215         if (record->xl_rmid != RM_XLOG_ID)
7216         {
7217                 switch (whichChkpt)
7218                 {
7219                         case 1:
7220                                 ereport(LOG,
7221                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
7222                                 break;
7223                         case 2:
7224                                 ereport(LOG,
7225                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
7226                                 break;
7227                         default:
7228                                 ereport(LOG,
7229                                 (errmsg("invalid resource manager ID in checkpoint record")));
7230                                 break;
7231                 }
7232                 return NULL;
7233         }
7234         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
7235                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
7236         {
7237                 switch (whichChkpt)
7238                 {
7239                         case 1:
7240                                 ereport(LOG,
7241                                    (errmsg("invalid xl_info in primary checkpoint record")));
7242                                 break;
7243                         case 2:
7244                                 ereport(LOG,
7245                                  (errmsg("invalid xl_info in secondary checkpoint record")));
7246                                 break;
7247                         default:
7248                                 ereport(LOG,
7249                                                 (errmsg("invalid xl_info in checkpoint record")));
7250                                 break;
7251                 }
7252                 return NULL;
7253         }
7254         if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
7255         {
7256                 switch (whichChkpt)
7257                 {
7258                         case 1:
7259                                 ereport(LOG,
7260                                         (errmsg("invalid length of primary checkpoint record")));
7261                                 break;
7262                         case 2:
7263                                 ereport(LOG,
7264                                   (errmsg("invalid length of secondary checkpoint record")));
7265                                 break;
7266                         default:
7267                                 ereport(LOG,
7268                                                 (errmsg("invalid length of checkpoint record")));
7269                                 break;
7270                 }
7271                 return NULL;
7272         }
7273         return record;
7274 }
7275
7276 /*
7277  * This must be called in a backend process before creating WAL records
7278  * (except in a standalone backend, which does StartupXLOG instead).  We need
7279  * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
7280  *
7281  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
7282  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
7283  * unnecessary however, since the postmaster itself never touches XLOG anyway.
7284  */
7285 void
7286 InitXLOGAccess(void)
7287 {
7288         XLogCtlInsert *Insert = &XLogCtl->Insert;
7289
7290         /* ThisTimeLineID doesn't change so we need no lock to copy it */
7291         ThisTimeLineID = XLogCtl->ThisTimeLineID;
7292         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
7293
7294         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
7295         (void) GetRedoRecPtr();
7296         /* Also update our copy of doPageWrites. */
7297         doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
7298
7299         /* Also initialize the working areas for constructing WAL records */
7300         InitXLogInsert();
7301 }
7302
7303 /*
7304  * Return the current Redo pointer from shared memory.
7305  *
7306  * As a side-effect, the local RedoRecPtr copy is updated.
7307  */
7308 XLogRecPtr
7309 GetRedoRecPtr(void)
7310 {
7311         XLogRecPtr      ptr;
7312
7313         /*
7314          * The possibly not up-to-date copy in XlogCtl is enough. Even if we
7315          * grabbed a WAL insertion lock to read the master copy, someone might
7316          * update it just after we've released the lock.
7317          */
7318         SpinLockAcquire(&XLogCtl->info_lck);
7319         ptr = XLogCtl->RedoRecPtr;
7320         SpinLockRelease(&XLogCtl->info_lck);
7321
7322         if (RedoRecPtr < ptr)
7323                 RedoRecPtr = ptr;
7324
7325         return RedoRecPtr;
7326 }
7327
7328 /*
7329  * Return information needed to decide whether a modified block needs a
7330  * full-page image to be included in the WAL record.
7331  *
7332  * The returned values are cached copies from backend-private memory, and
7333  * possibly out-of-date.  XLogInsertRecord will re-check them against
7334  * up-to-date values, while holding the WAL insert lock.
7335  */
7336 void
7337 GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
7338 {
7339         *RedoRecPtr_p = RedoRecPtr;
7340         *doPageWrites_p = doPageWrites;
7341 }
7342
7343 /*
7344  * GetInsertRecPtr -- Returns the current insert position.
7345  *
7346  * NOTE: The value *actually* returned is the position of the last full
7347  * xlog page. It lags behind the real insert position by at most 1 page.
7348  * For that, we don't need to scan through WAL insertion locks, and an
7349  * approximation is enough for the current usage of this function.
7350  */
7351 XLogRecPtr
7352 GetInsertRecPtr(void)
7353 {
7354         XLogRecPtr      recptr;
7355
7356         SpinLockAcquire(&XLogCtl->info_lck);
7357         recptr = XLogCtl->LogwrtRqst.Write;
7358         SpinLockRelease(&XLogCtl->info_lck);
7359
7360         return recptr;
7361 }
7362
7363 /*
7364  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
7365  * position known to be fsync'd to disk.
7366  */
7367 XLogRecPtr
7368 GetFlushRecPtr(void)
7369 {
7370         XLogRecPtr      recptr;
7371
7372         SpinLockAcquire(&XLogCtl->info_lck);
7373         recptr = XLogCtl->LogwrtResult.Flush;
7374         SpinLockRelease(&XLogCtl->info_lck);
7375
7376         return recptr;
7377 }
7378
7379 /*
7380  * Get the time of the last xlog segment switch
7381  */
7382 pg_time_t
7383 GetLastSegSwitchTime(void)
7384 {
7385         pg_time_t       result;
7386
7387         /* Need WALWriteLock, but shared lock is sufficient */
7388         LWLockAcquire(WALWriteLock, LW_SHARED);
7389         result = XLogCtl->lastSegSwitchTime;
7390         LWLockRelease(WALWriteLock);
7391
7392         return result;
7393 }
7394
7395 /*
7396  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
7397  *
7398  * This is exported for use by code that would like to have 64-bit XIDs.
7399  * We don't really support such things, but all XIDs within the system
7400  * can be presumed "close to" the result, and thus the epoch associated
7401  * with them can be determined.
7402  */
7403 void
7404 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
7405 {
7406         uint32          ckptXidEpoch;
7407         TransactionId ckptXid;
7408         TransactionId nextXid;
7409
7410         /* Must read checkpoint info first, else have race condition */
7411         SpinLockAcquire(&XLogCtl->info_lck);
7412         ckptXidEpoch = XLogCtl->ckptXidEpoch;
7413         ckptXid = XLogCtl->ckptXid;
7414         SpinLockRelease(&XLogCtl->info_lck);
7415
7416         /* Now fetch current nextXid */
7417         nextXid = ReadNewTransactionId();
7418
7419         /*
7420          * nextXid is certainly logically later than ckptXid.  So if it's
7421          * numerically less, it must have wrapped into the next epoch.
7422          */
7423         if (nextXid < ckptXid)
7424                 ckptXidEpoch++;
7425
7426         *xid = nextXid;
7427         *epoch = ckptXidEpoch;
7428 }
7429
7430 /*
7431  * This must be called ONCE during postmaster or standalone-backend shutdown
7432  */
7433 void
7434 ShutdownXLOG(int code, Datum arg)
7435 {
7436         /* Don't be chatty in standalone mode */
7437         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
7438                         (errmsg("shutting down")));
7439
7440         if (RecoveryInProgress())
7441                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7442         else
7443         {
7444                 /*
7445                  * If archiving is enabled, rotate the last XLOG file so that all the
7446                  * remaining records are archived (postmaster wakes up the archiver
7447                  * process one more time at the end of shutdown). The checkpoint
7448                  * record will go to the next XLOG file and won't be archived (yet).
7449                  */
7450                 if (XLogArchivingActive() && XLogArchiveCommandSet())
7451                         RequestXLogSwitch();
7452
7453                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7454         }
7455         ShutdownCLOG();
7456         ShutdownCommitTs();
7457         ShutdownSUBTRANS();
7458         ShutdownMultiXact();
7459
7460         /* Don't be chatty in standalone mode */
7461         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
7462                         (errmsg("database system is shut down")));
7463 }
7464
7465 /*
7466  * Log start of a checkpoint.
7467  */
7468 static void
7469 LogCheckpointStart(int flags, bool restartpoint)
7470 {
7471         elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
7472                  restartpoint ? "restartpoint" : "checkpoint",
7473                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
7474                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
7475                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
7476                  (flags & CHECKPOINT_FORCE) ? " force" : "",
7477                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
7478                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
7479                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
7480                  (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" :"");
7481 }
7482
7483 /*
7484  * Log end of a checkpoint.
7485  */
7486 static void
7487 LogCheckpointEnd(bool restartpoint)
7488 {
7489         long            write_secs,
7490                                 sync_secs,
7491                                 total_secs,
7492                                 longest_secs,
7493                                 average_secs;
7494         int                     write_usecs,
7495                                 sync_usecs,
7496                                 total_usecs,
7497                                 longest_usecs,
7498                                 average_usecs;
7499         uint64          average_sync_time;
7500
7501         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
7502
7503         TimestampDifference(CheckpointStats.ckpt_write_t,
7504                                                 CheckpointStats.ckpt_sync_t,
7505                                                 &write_secs, &write_usecs);
7506
7507         TimestampDifference(CheckpointStats.ckpt_sync_t,
7508                                                 CheckpointStats.ckpt_sync_end_t,
7509                                                 &sync_secs, &sync_usecs);
7510
7511         /* Accumulate checkpoint timing summary data, in milliseconds. */
7512         BgWriterStats.m_checkpoint_write_time +=
7513                 write_secs * 1000 + write_usecs / 1000;
7514         BgWriterStats.m_checkpoint_sync_time +=
7515                 sync_secs * 1000 + sync_usecs / 1000;
7516
7517         /*
7518          * All of the published timing statistics are accounted for.  Only
7519          * continue if a log message is to be written.
7520          */
7521         if (!log_checkpoints)
7522                 return;
7523
7524         TimestampDifference(CheckpointStats.ckpt_start_t,
7525                                                 CheckpointStats.ckpt_end_t,
7526                                                 &total_secs, &total_usecs);
7527
7528         /*
7529          * Timing values returned from CheckpointStats are in microseconds.
7530          * Convert to the second plus microsecond form that TimestampDifference
7531          * returns for homogeneous printing.
7532          */
7533         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
7534         longest_usecs = CheckpointStats.ckpt_longest_sync -
7535                 (uint64) longest_secs *1000000;
7536
7537         average_sync_time = 0;
7538         if (CheckpointStats.ckpt_sync_rels > 0)
7539                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
7540                         CheckpointStats.ckpt_sync_rels;
7541         average_secs = (long) (average_sync_time / 1000000);
7542         average_usecs = average_sync_time - (uint64) average_secs *1000000;
7543
7544         elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
7545                  "%d transaction log file(s) added, %d removed, %d recycled; "
7546                  "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
7547                  "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
7548                  restartpoint ? "restartpoint" : "checkpoint",
7549                  CheckpointStats.ckpt_bufs_written,
7550                  (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7551                  CheckpointStats.ckpt_segs_added,
7552                  CheckpointStats.ckpt_segs_removed,
7553                  CheckpointStats.ckpt_segs_recycled,
7554                  write_secs, write_usecs / 1000,
7555                  sync_secs, sync_usecs / 1000,
7556                  total_secs, total_usecs / 1000,
7557                  CheckpointStats.ckpt_sync_rels,
7558                  longest_secs, longest_usecs / 1000,
7559                  average_secs, average_usecs / 1000);
7560 }
7561
7562 /*
7563  * Perform a checkpoint --- either during shutdown, or on-the-fly
7564  *
7565  * flags is a bitwise OR of the following:
7566  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
7567  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
7568  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
7569  *              ignoring checkpoint_completion_target parameter.
7570  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
7571  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
7572  *              CHECKPOINT_END_OF_RECOVERY).
7573  *      CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
7574  *
7575  * Note: flags contains other bits, of interest here only for logging purposes.
7576  * In particular note that this routine is synchronous and does not pay
7577  * attention to CHECKPOINT_WAIT.
7578  *
7579  * If !shutdown then we are writing an online checkpoint. This is a very special
7580  * kind of operation and WAL record because the checkpoint action occurs over
7581  * a period of time yet logically occurs at just a single LSN. The logical
7582  * position of the WAL record (redo ptr) is the same or earlier than the
7583  * physical position. When we replay WAL we locate the checkpoint via its
7584  * physical position then read the redo ptr and actually start replay at the
7585  * earlier logical position. Note that we don't write *anything* to WAL at
7586  * the logical position, so that location could be any other kind of WAL record.
7587  * All of this mechanism allows us to continue working while we checkpoint.
7588  * As a result, timing of actions is critical here and be careful to note that
7589  * this function will likely take minutes to execute on a busy system.
7590  */
7591 void
7592 CreateCheckPoint(int flags)
7593 {
7594         bool            shutdown;
7595         CheckPoint      checkPoint;
7596         XLogRecPtr      recptr;
7597         XLogCtlInsert *Insert = &XLogCtl->Insert;
7598         uint32          freespace;
7599         XLogSegNo       _logSegNo;
7600         XLogRecPtr      curInsert;
7601         VirtualTransactionId *vxids;
7602         int                     nvxids;
7603
7604         /*
7605          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
7606          * issued at a different time.
7607          */
7608         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
7609                 shutdown = true;
7610         else
7611                 shutdown = false;
7612
7613         /* sanity check */
7614         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
7615                 elog(ERROR, "can't create a checkpoint during recovery");
7616
7617         /*
7618          * Initialize InitXLogInsert working areas before entering the critical
7619          * section.  Normally, this is done by the first call to
7620          * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
7621          * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
7622          * done below in a critical section, and InitXLogInsert cannot be called
7623          * in a critical section.
7624          */
7625         InitXLogInsert();
7626
7627         /*
7628          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
7629          * (This is just pro forma, since in the present system structure there is
7630          * only one process that is allowed to issue checkpoints at any given
7631          * time.)
7632          */
7633         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
7634
7635         /*
7636          * Prepare to accumulate statistics.
7637          *
7638          * Note: because it is possible for log_checkpoints to change while a
7639          * checkpoint proceeds, we always accumulate stats, even if
7640          * log_checkpoints is currently off.
7641          */
7642         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7643         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7644
7645         /*
7646          * Use a critical section to force system panic if we have trouble.
7647          */
7648         START_CRIT_SECTION();
7649
7650         if (shutdown)
7651         {
7652                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7653                 ControlFile->state = DB_SHUTDOWNING;
7654                 ControlFile->time = (pg_time_t) time(NULL);
7655                 UpdateControlFile();
7656                 LWLockRelease(ControlFileLock);
7657         }
7658
7659         /*
7660          * Let smgr prepare for checkpoint; this has to happen before we determine
7661          * the REDO pointer.  Note that smgr must not do anything that'd have to
7662          * be undone if we decide no checkpoint is needed.
7663          */
7664         smgrpreckpt();
7665
7666         /* Begin filling in the checkpoint WAL record */
7667         MemSet(&checkPoint, 0, sizeof(checkPoint));
7668         checkPoint.time = (pg_time_t) time(NULL);
7669
7670         /*
7671          * For Hot Standby, derive the oldestActiveXid before we fix the redo
7672          * pointer. This allows us to begin accumulating changes to assemble our
7673          * starting snapshot of locks and transactions.
7674          */
7675         if (!shutdown && XLogStandbyInfoActive())
7676                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
7677         else
7678                 checkPoint.oldestActiveXid = InvalidTransactionId;
7679
7680         /*
7681          * We must block concurrent insertions while examining insert state to
7682          * determine the checkpoint REDO pointer.
7683          */
7684         WALInsertLockAcquireExclusive();
7685         curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
7686
7687         /*
7688          * If this isn't a shutdown or forced checkpoint, and we have not inserted
7689          * any XLOG records since the start of the last checkpoint, skip the
7690          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
7691          * when the system is idle. That wastes log space, and more importantly it
7692          * exposes us to possible loss of both current and previous checkpoint
7693          * records if the machine crashes just as we're writing the update.
7694          * (Perhaps it'd make even more sense to checkpoint only when the previous
7695          * checkpoint record is in a different xlog page?)
7696          *
7697          * We have to make two tests to determine that nothing has happened since
7698          * the start of the last checkpoint: current insertion point must match
7699          * the end of the last checkpoint record, and its redo pointer must point
7700          * to itself.
7701          */
7702         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
7703                                   CHECKPOINT_FORCE)) == 0)
7704         {
7705                 if (curInsert == ControlFile->checkPoint +
7706                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
7707                         ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
7708                 {
7709                         WALInsertLockRelease();
7710                         LWLockRelease(CheckpointLock);
7711                         END_CRIT_SECTION();
7712                         return;
7713                 }
7714         }
7715
7716         /*
7717          * An end-of-recovery checkpoint is created before anyone is allowed to
7718          * write WAL. To allow us to write the checkpoint record, temporarily
7719          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
7720          * initialized, which we need here and in AdvanceXLInsertBuffer.)
7721          */
7722         if (flags & CHECKPOINT_END_OF_RECOVERY)
7723                 LocalSetXLogInsertAllowed();
7724
7725         checkPoint.ThisTimeLineID = ThisTimeLineID;
7726         if (flags & CHECKPOINT_END_OF_RECOVERY)
7727                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
7728         else
7729                 checkPoint.PrevTimeLineID = ThisTimeLineID;
7730
7731         checkPoint.fullPageWrites = Insert->fullPageWrites;
7732
7733         /*
7734          * Compute new REDO record ptr = location of next XLOG record.
7735          *
7736          * NB: this is NOT necessarily where the checkpoint record itself will be,
7737          * since other backends may insert more XLOG records while we're off doing
7738          * the buffer flush work.  Those XLOG records are logically after the
7739          * checkpoint, even though physically before it.  Got that?
7740          */
7741         freespace = INSERT_FREESPACE(curInsert);
7742         if (freespace == 0)
7743         {
7744                 if (curInsert % XLogSegSize == 0)
7745                         curInsert += SizeOfXLogLongPHD;
7746                 else
7747                         curInsert += SizeOfXLogShortPHD;
7748         }
7749         checkPoint.redo = curInsert;
7750
7751         /*
7752          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
7753          * must be done while holding all the insertion locks.
7754          *
7755          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
7756          * pointing past where it really needs to point.  This is okay; the only
7757          * consequence is that XLogInsert might back up whole buffers that it
7758          * didn't really need to.  We can't postpone advancing RedoRecPtr because
7759          * XLogInserts that happen while we are dumping buffers must assume that
7760          * their buffer changes are not included in the checkpoint.
7761          */
7762         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
7763
7764         /*
7765          * Now we can release the WAL insertion locks, allowing other xacts to
7766          * proceed while we are flushing disk buffers.
7767          */
7768         WALInsertLockRelease();
7769
7770         /* Update the info_lck-protected copy of RedoRecPtr as well */
7771         SpinLockAcquire(&XLogCtl->info_lck);
7772         XLogCtl->RedoRecPtr = checkPoint.redo;
7773         SpinLockRelease(&XLogCtl->info_lck);
7774
7775         /*
7776          * If enabled, log checkpoint start.  We postpone this until now so as not
7777          * to log anything if we decided to skip the checkpoint.
7778          */
7779         if (log_checkpoints)
7780                 LogCheckpointStart(flags, false);
7781
7782         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
7783
7784         /*
7785          * Get the other info we need for the checkpoint record.
7786          */
7787         LWLockAcquire(XidGenLock, LW_SHARED);
7788         checkPoint.nextXid = ShmemVariableCache->nextXid;
7789         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
7790         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
7791         LWLockRelease(XidGenLock);
7792
7793         LWLockAcquire(CommitTsLock, LW_SHARED);
7794         checkPoint.oldestCommitTs = ShmemVariableCache->oldestCommitTs;
7795         checkPoint.newestCommitTs = ShmemVariableCache->newestCommitTs;
7796         LWLockRelease(CommitTsLock);
7797
7798         /* Increase XID epoch if we've wrapped around since last checkpoint */
7799         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
7800         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
7801                 checkPoint.nextXidEpoch++;
7802
7803         LWLockAcquire(OidGenLock, LW_SHARED);
7804         checkPoint.nextOid = ShmemVariableCache->nextOid;
7805         if (!shutdown)
7806                 checkPoint.nextOid += ShmemVariableCache->oidCount;
7807         LWLockRelease(OidGenLock);
7808
7809         MultiXactGetCheckptMulti(shutdown,
7810                                                          &checkPoint.nextMulti,
7811                                                          &checkPoint.nextMultiOffset,
7812                                                          &checkPoint.oldestMulti,
7813                                                          &checkPoint.oldestMultiDB);
7814
7815         /*
7816          * Having constructed the checkpoint record, ensure all shmem disk buffers
7817          * and commit-log buffers are flushed to disk.
7818          *
7819          * This I/O could fail for various reasons.  If so, we will fail to
7820          * complete the checkpoint, but there is no reason to force a system
7821          * panic. Accordingly, exit critical section while doing it.
7822          */
7823         END_CRIT_SECTION();
7824
7825         /*
7826          * In some cases there are groups of actions that must all occur on one
7827          * side or the other of a checkpoint record. Before flushing the
7828          * checkpoint record we must explicitly wait for any backend currently
7829          * performing those groups of actions.
7830          *
7831          * One example is end of transaction, so we must wait for any transactions
7832          * that are currently in commit critical sections.  If an xact inserted
7833          * its commit record into XLOG just before the REDO point, then a crash
7834          * restart from the REDO point would not replay that record, which means
7835          * that our flushing had better include the xact's update of pg_clog.  So
7836          * we wait till he's out of his commit critical section before proceeding.
7837          * See notes in RecordTransactionCommit().
7838          *
7839          * Because we've already released the insertion locks, this test is a bit
7840          * fuzzy: it is possible that we will wait for xacts we didn't really need
7841          * to wait for.  But the delay should be short and it seems better to make
7842          * checkpoint take a bit longer than to hold off insertions longer than
7843          * necessary. (In fact, the whole reason we have this issue is that xact.c
7844          * does commit record XLOG insertion and clog update as two separate steps
7845          * protected by different locks, but again that seems best on grounds of
7846          * minimizing lock contention.)
7847          *
7848          * A transaction that has not yet set delayChkpt when we look cannot be at
7849          * risk, since he's not inserted his commit record yet; and one that's
7850          * already cleared it is not at risk either, since he's done fixing clog
7851          * and we will correctly flush the update below.  So we cannot miss any
7852          * xacts we need to wait for.
7853          */
7854         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
7855         if (nvxids > 0)
7856         {
7857                 do
7858                 {
7859                         pg_usleep(10000L);      /* wait for 10 msec */
7860                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
7861         }
7862         pfree(vxids);
7863
7864         CheckPointGuts(checkPoint.redo, flags);
7865
7866         /*
7867          * Take a snapshot of running transactions and write this to WAL. This
7868          * allows us to reconstruct the state of running transactions during
7869          * archive recovery, if required. Skip, if this info disabled.
7870          *
7871          * If we are shutting down, or Startup process is completing crash
7872          * recovery we don't need to write running xact data.
7873          */
7874         if (!shutdown && XLogStandbyInfoActive())
7875                 LogStandbySnapshot();
7876
7877         START_CRIT_SECTION();
7878
7879         /*
7880          * Now insert the checkpoint record into XLOG.
7881          */
7882         XLogBeginInsert();
7883         XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
7884         recptr = XLogInsert(RM_XLOG_ID,
7885                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
7886                                                 XLOG_CHECKPOINT_ONLINE);
7887
7888         XLogFlush(recptr);
7889
7890         /*
7891          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
7892          * overwritten at next startup.  No-one should even try, this just allows
7893          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
7894          * to just temporarily disable writing until the system has exited
7895          * recovery.
7896          */
7897         if (shutdown)
7898         {
7899                 if (flags & CHECKPOINT_END_OF_RECOVERY)
7900                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
7901                 else
7902                         LocalXLogInsertAllowed = 0; /* never again write WAL */
7903         }
7904
7905         /*
7906          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
7907          * = end of actual checkpoint record.
7908          */
7909         if (shutdown && checkPoint.redo != ProcLastRecPtr)
7910                 ereport(PANIC,
7911                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
7912
7913         /*
7914          * Select point at which we can truncate the log, which we base on the
7915          * prior checkpoint's earliest info.
7916          */
7917         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
7918
7919         /*
7920          * Update the control file.
7921          */
7922         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7923         if (shutdown)
7924                 ControlFile->state = DB_SHUTDOWNED;
7925         ControlFile->prevCheckPoint = ControlFile->checkPoint;
7926         ControlFile->checkPoint = ProcLastRecPtr;
7927         ControlFile->checkPointCopy = checkPoint;
7928         ControlFile->time = (pg_time_t) time(NULL);
7929         /* crash recovery should always recover to the end of WAL */
7930         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
7931         ControlFile->minRecoveryPointTLI = 0;
7932
7933         /*
7934          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
7935          * unused on non-shutdown checkpoints, but seems useful to store it always
7936          * for debugging purposes.
7937          */
7938         SpinLockAcquire(&XLogCtl->ulsn_lck);
7939         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
7940         SpinLockRelease(&XLogCtl->ulsn_lck);
7941
7942         UpdateControlFile();
7943         LWLockRelease(ControlFileLock);
7944
7945         /* Update shared-memory copy of checkpoint XID/epoch */
7946         SpinLockAcquire(&XLogCtl->info_lck);
7947         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
7948         XLogCtl->ckptXid = checkPoint.nextXid;
7949         SpinLockRelease(&XLogCtl->info_lck);
7950
7951         /*
7952          * We are now done with critical updates; no need for system panic if we
7953          * have trouble while fooling with old log segments.
7954          */
7955         END_CRIT_SECTION();
7956
7957         /*
7958          * Now that the checkpoint is safely on disk, we can update the point to
7959          * which multixact can be truncated.
7960          */
7961         MultiXactSetSafeTruncate(checkPoint.oldestMulti);
7962
7963         /*
7964          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
7965          */
7966         smgrpostckpt();
7967
7968         /*
7969          * Delete old log files (those no longer needed even for previous
7970          * checkpoint or the standbys in XLOG streaming).
7971          */
7972         if (_logSegNo)
7973         {
7974                 KeepLogSeg(recptr, &_logSegNo);
7975                 _logSegNo--;
7976                 RemoveOldXlogFiles(_logSegNo, recptr);
7977         }
7978
7979         /*
7980          * Make more log segments if needed.  (Do this after recycling old log
7981          * segments, since that may supply some of the needed files.)
7982          */
7983         if (!shutdown)
7984                 PreallocXlogFiles(recptr);
7985
7986         /*
7987          * Truncate pg_subtrans if possible.  We can throw away all data before
7988          * the oldest XMIN of any running transaction.  No future transaction will
7989          * attempt to reference any pg_subtrans entry older than that (see Asserts
7990          * in subtrans.c).  During recovery, though, we mustn't do this because
7991          * StartupSUBTRANS hasn't been called yet.
7992          */
7993         if (!RecoveryInProgress())
7994                 TruncateSUBTRANS(GetOldestXmin(NULL, false));
7995
7996         /*
7997          * Truncate pg_multixact too.
7998          */
7999         TruncateMultiXact();
8000
8001         /* Real work is done, but log and update stats before releasing lock. */
8002         LogCheckpointEnd(false);
8003
8004         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
8005                                                                          NBuffers,
8006                                                                          CheckpointStats.ckpt_segs_added,
8007                                                                          CheckpointStats.ckpt_segs_removed,
8008                                                                          CheckpointStats.ckpt_segs_recycled);
8009
8010         LWLockRelease(CheckpointLock);
8011 }
8012
8013 /*
8014  * Mark the end of recovery in WAL though without running a full checkpoint.
8015  * We can expect that a restartpoint is likely to be in progress as we
8016  * do this, though we are unwilling to wait for it to complete. So be
8017  * careful to avoid taking the CheckpointLock anywhere here.
8018  *
8019  * CreateRestartPoint() allows for the case where recovery may end before
8020  * the restartpoint completes so there is no concern of concurrent behaviour.
8021  */
8022 static void
8023 CreateEndOfRecoveryRecord(void)
8024 {
8025         xl_end_of_recovery xlrec;
8026         XLogRecPtr      recptr;
8027
8028         /* sanity check */
8029         if (!RecoveryInProgress())
8030                 elog(ERROR, "can only be used to end recovery");
8031
8032         xlrec.end_time = time(NULL);
8033
8034         WALInsertLockAcquireExclusive();
8035         xlrec.ThisTimeLineID = ThisTimeLineID;
8036         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8037         WALInsertLockRelease();
8038
8039         LocalSetXLogInsertAllowed();
8040
8041         START_CRIT_SECTION();
8042
8043         XLogBeginInsert();
8044         XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
8045         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
8046
8047         XLogFlush(recptr);
8048
8049         /*
8050          * Update the control file so that crash recovery can follow the timeline
8051          * changes to this point.
8052          */
8053         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8054         ControlFile->time = (pg_time_t) xlrec.end_time;
8055         ControlFile->minRecoveryPoint = recptr;
8056         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8057         UpdateControlFile();
8058         LWLockRelease(ControlFileLock);
8059
8060         END_CRIT_SECTION();
8061
8062         LocalXLogInsertAllowed = -1;    /* return to "check" state */
8063 }
8064
/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints.
 *
 * 'checkPointRedo' is handed only to CheckPointTwoPhase(); 'flags' is handed
 * only to CheckPointBuffers().  The subsystems are checkpointed in the fixed
 * order written below, with two-phase state last.
 */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
	CheckPointCLOG();
	CheckPointCommitTs();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
	CheckPointPredicate();
	CheckPointRelationMap();
	CheckPointReplicationSlots();
	CheckPointSnapBuild();
	CheckPointLogicalRewriteHeap();
	CheckPointBuffers(flags);	/* performs all required fsyncs */
	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}
8087
8088 /*
8089  * Save a checkpoint for recovery restart if appropriate
8090  *
8091  * This function is called each time a checkpoint record is read from XLOG.
8092  * It must determine whether the checkpoint represents a safe restartpoint or
8093  * not.  If so, the checkpoint record is stashed in shared memory so that
8094  * CreateRestartPoint can consult it.  (Note that the latter function is
8095  * executed by the checkpointer, while this one will be executed by the
8096  * startup process.)
8097  */
8098 static void
8099 RecoveryRestartPoint(const CheckPoint *checkPoint)
8100 {
8101         /*
8102          * Also refrain from creating a restartpoint if we have seen any
8103          * references to non-existent pages. Restarting recovery from the
8104          * restartpoint would not see the references, so we would lose the
8105          * cross-check that the pages belonged to a relation that was dropped
8106          * later.
8107          */
8108         if (XLogHaveInvalidPages())
8109         {
8110                 elog(trace_recovery(DEBUG2),
8111                          "could not record restart point at %X/%X because there "
8112                          "are unresolved references to invalid pages",
8113                          (uint32) (checkPoint->redo >> 32),
8114                          (uint32) checkPoint->redo);
8115                 return;
8116         }
8117
8118         /*
8119          * Copy the checkpoint record to shared memory, so that checkpointer can
8120          * work out the next time it wants to perform a restartpoint.
8121          */
8122         SpinLockAcquire(&XLogCtl->info_lck);
8123         XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
8124         XLogCtl->lastCheckPoint = *checkPoint;
8125         SpinLockRelease(&XLogCtl->info_lck);
8126 }
8127
8128 /*
8129  * Establish a restartpoint if possible.
8130  *
8131  * This is similar to CreateCheckPoint, but is used during WAL recovery
8132  * to establish a point from which recovery can roll forward without
8133  * replaying the entire recovery log.
8134  *
8135  * Returns true if a new restartpoint was established. We can only establish
8136  * a restartpoint if we have replayed a safe checkpoint record since last
8137  * restartpoint.
8138  */
8139 bool
8140 CreateRestartPoint(int flags)
8141 {
8142         XLogRecPtr      lastCheckPointRecPtr;
8143         CheckPoint      lastCheckPoint;
8144         XLogSegNo       _logSegNo;
8145         TimestampTz xtime;
8146
8147         /*
8148          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
8149          * happens at a time.
8150          */
8151         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8152
8153         /* Get a local copy of the last safe checkpoint record. */
8154         SpinLockAcquire(&XLogCtl->info_lck);
8155         lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
8156         lastCheckPoint = XLogCtl->lastCheckPoint;
8157         SpinLockRelease(&XLogCtl->info_lck);
8158
8159         /*
8160          * Check that we're still in recovery mode. It's ok if we exit recovery
8161          * mode after this check, the restart point is valid anyway.
8162          */
8163         if (!RecoveryInProgress())
8164         {
8165                 ereport(DEBUG2,
8166                           (errmsg("skipping restartpoint, recovery has already ended")));
8167                 LWLockRelease(CheckpointLock);
8168                 return false;
8169         }
8170
8171         /*
8172          * If the last checkpoint record we've replayed is already our last
8173          * restartpoint, we can't perform a new restart point. We still update
8174          * minRecoveryPoint in that case, so that if this is a shutdown restart
8175          * point, we won't start up earlier than before. That's not strictly
8176          * necessary, but when hot standby is enabled, it would be rather weird if
8177          * the database opened up for read-only connections at a point-in-time
8178          * before the last shutdown. Such time travel is still possible in case of
8179          * immediate shutdown, though.
8180          *
8181          * We don't explicitly advance minRecoveryPoint when we do create a
8182          * restartpoint. It's assumed that flushing the buffers will do that as a
8183          * side-effect.
8184          */
8185         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
8186                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
8187         {
8188                 ereport(DEBUG2,
8189                                 (errmsg("skipping restartpoint, already performed at %X/%X",
8190                                                 (uint32) (lastCheckPoint.redo >> 32),
8191                                                 (uint32) lastCheckPoint.redo)));
8192
8193                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
8194                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8195                 {
8196                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8197                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8198                         ControlFile->time = (pg_time_t) time(NULL);
8199                         UpdateControlFile();
8200                         LWLockRelease(ControlFileLock);
8201                 }
8202                 LWLockRelease(CheckpointLock);
8203                 return false;
8204         }
8205
8206         /*
8207          * Update the shared RedoRecPtr so that the startup process can calculate
8208          * the number of segments replayed since last restartpoint, and request a
8209          * restartpoint if it exceeds checkpoint_segments.
8210          *
8211          * Like in CreateCheckPoint(), hold off insertions to update it, although
8212          * during recovery this is just pro forma, because no WAL insertions are
8213          * happening.
8214          */
8215         WALInsertLockAcquireExclusive();
8216         XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
8217         WALInsertLockRelease();
8218
8219         /* Also update the info_lck-protected copy */
8220         SpinLockAcquire(&XLogCtl->info_lck);
8221         XLogCtl->RedoRecPtr = lastCheckPoint.redo;
8222         SpinLockRelease(&XLogCtl->info_lck);
8223
8224         /*
8225          * Prepare to accumulate statistics.
8226          *
8227          * Note: because it is possible for log_checkpoints to change while a
8228          * checkpoint proceeds, we always accumulate stats, even if
8229          * log_checkpoints is currently off.
8230          */
8231         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8232         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8233
8234         if (log_checkpoints)
8235                 LogCheckpointStart(flags, true);
8236
8237         CheckPointGuts(lastCheckPoint.redo, flags);
8238
8239         /*
8240          * Select point at which we can truncate the xlog, which we base on the
8241          * prior checkpoint's earliest info.
8242          */
8243         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8244
8245         /*
8246          * Update pg_control, using current time.  Check that it still shows
8247          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
8248          * this is a quick hack to make sure nothing really bad happens if somehow
8249          * we get here after the end-of-recovery checkpoint.
8250          */
8251         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8252         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
8253                 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
8254         {
8255                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
8256                 ControlFile->checkPoint = lastCheckPointRecPtr;
8257                 ControlFile->checkPointCopy = lastCheckPoint;
8258                 ControlFile->time = (pg_time_t) time(NULL);
8259                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8260                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8261                 UpdateControlFile();
8262         }
8263         LWLockRelease(ControlFileLock);
8264
8265         /*
8266          * Delete old log files (those no longer needed even for previous
8267          * checkpoint/restartpoint) to prevent the disk holding the xlog from
8268          * growing full.
8269          */
8270         if (_logSegNo)
8271         {
8272                 XLogRecPtr      receivePtr;
8273                 XLogRecPtr      replayPtr;
8274                 TimeLineID      replayTLI;
8275                 XLogRecPtr      endptr;
8276
8277                 /*
8278                  * Get the current end of xlog replayed or received, whichever is
8279                  * later.
8280                  */
8281                 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
8282                 replayPtr = GetXLogReplayRecPtr(&replayTLI);
8283                 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
8284
8285                 KeepLogSeg(endptr, &_logSegNo);
8286                 _logSegNo--;
8287
8288                 /*
8289                  * Try to recycle segments on a useful timeline. If we've been
8290                  * promoted since the beginning of this restartpoint, use the new
8291                  * timeline chosen at end of recovery (RecoveryInProgress() sets
8292                  * ThisTimeLineID in that case). If we're still in recovery, use the
8293                  * timeline we're currently replaying.
8294                  *
8295                  * There is no guarantee that the WAL segments will be useful on the
8296                  * current timeline; if recovery proceeds to a new timeline right
8297                  * after this, the pre-allocated WAL segments on this timeline will
8298                  * not be used, and will go wasted until recycled on the next
8299                  * restartpoint. We'll live with that.
8300                  */
8301                 if (RecoveryInProgress())
8302                         ThisTimeLineID = replayTLI;
8303
8304                 RemoveOldXlogFiles(_logSegNo, endptr);
8305
8306                 /*
8307                  * Make more log segments if needed.  (Do this after recycling old log
8308                  * segments, since that may supply some of the needed files.)
8309                  */
8310                 PreallocXlogFiles(endptr);
8311
8312                 /*
8313                  * ThisTimeLineID is normally not set when we're still in recovery.
8314                  * However, recycling/preallocating segments above needed
8315                  * ThisTimeLineID to determine which timeline to install the segments
8316                  * on. Reset it now, to restore the normal state of affairs for
8317                  * debugging purposes.
8318                  */
8319                 if (RecoveryInProgress())
8320                         ThisTimeLineID = 0;
8321         }
8322
8323         /*
8324          * Due to an historical accident multixact truncations are not WAL-logged,
8325          * but just performed everytime the mxact horizon is increased. So, unless
8326          * we explicitly execute truncations on a standby it will never clean out
8327          * /pg_multixact which obviously is bad, both because it uses space and
8328          * because we can wrap around into pre-existing data...
8329          *
8330          * We can only do the truncation here, after the UpdateControlFile()
8331          * above, because we've now safely established a restart point.  That
8332          * guarantees we will not need to access those multis.
8333          *
8334          * It's probably worth improving this.
8335          */
8336         TruncateMultiXact();
8337
8338         /*
8339          * Truncate pg_subtrans if possible.  We can throw away all data before
8340          * the oldest XMIN of any running transaction.  No future transaction will
8341          * attempt to reference any pg_subtrans entry older than that (see Asserts
8342          * in subtrans.c).  When hot standby is disabled, though, we mustn't do
8343          * this because StartupSUBTRANS hasn't been called yet.
8344          */
8345         if (EnableHotStandby)
8346                 TruncateSUBTRANS(GetOldestXmin(NULL, false));
8347
8348         /* Real work is done, but log and update before releasing lock. */
8349         LogCheckpointEnd(true);
8350
8351         xtime = GetLatestXTime();
8352         ereport((log_checkpoints ? LOG : DEBUG2),
8353                         (errmsg("recovery restart point at %X/%X",
8354                  (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
8355                    xtime ? errdetail("last completed transaction was at log time %s",
8356                                                          timestamptz_to_str(xtime)) : 0));
8357
8358         LWLockRelease(CheckpointLock);
8359
8360         /*
8361          * Finally, execute archive_cleanup_command, if any.
8362          */
8363         if (XLogCtl->archiveCleanupCommand[0])
8364                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
8365                                                            "archive_cleanup_command",
8366                                                            false);
8367
8368         return true;
8369 }
8370
8371 /*
8372  * Retreat *logSegNo to the last segment that we need to retain because of
8373  * either wal_keep_segments or replication slots.
8374  *
8375  * This is calculated by subtracting wal_keep_segments from the given xlog
8376  * location, recptr and by making sure that that result is below the
8377  * requirement of replication slots.
8378  */
8379 static void
8380 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
8381 {
8382         XLogSegNo       segno;
8383         XLogRecPtr      keep;
8384
8385         XLByteToSeg(recptr, segno);
8386         keep = XLogGetReplicationSlotMinimumLSN();
8387
8388         /* compute limit for wal_keep_segments first */
8389         if (wal_keep_segments > 0)
8390         {
8391                 /* avoid underflow, don't go below 1 */
8392                 if (segno <= wal_keep_segments)
8393                         segno = 1;
8394                 else
8395                         segno = segno - wal_keep_segments;
8396         }
8397
8398         /* then check whether slots limit removal further */
8399         if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
8400         {
8401                 XLogRecPtr      slotSegNo;
8402
8403                 XLByteToSeg(keep, slotSegNo);
8404
8405                 if (slotSegNo <= 0)
8406                         segno = 1;
8407                 else if (slotSegNo < segno)
8408                         segno = slotSegNo;
8409         }
8410
8411         /* don't delete WAL segments newer than the calculated segment */
8412         if (segno < *logSegNo)
8413                 *logSegNo = segno;
8414 }
8415
8416 /*
8417  * Write a NEXTOID log record
8418  */
8419 void
8420 XLogPutNextOid(Oid nextOid)
8421 {
8422         XLogBeginInsert();
8423         XLogRegisterData((char *) (&nextOid), sizeof(Oid));
8424         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
8425
8426         /*
8427          * We need not flush the NEXTOID record immediately, because any of the
8428          * just-allocated OIDs could only reach disk as part of a tuple insert or
8429          * update that would have its own XLOG record that must follow the NEXTOID
8430          * record.  Therefore, the standard buffer LSN interlock applied to those
8431          * records will ensure no such OID reaches disk before the NEXTOID record
8432          * does.
8433          *
8434          * Note, however, that the above statement only covers state "within" the
8435          * database.  When we use a generated OID as a file or directory name, we
8436          * are in a sense violating the basic WAL rule, because that filesystem
8437          * change may reach disk before the NEXTOID WAL record does.  The impact
8438          * of this is that if a database crash occurs immediately afterward, we
8439          * might after restart re-generate the same OID and find that it conflicts
8440          * with the leftover file or directory.  But since for safety's sake we
8441          * always loop until finding a nonconflicting filename, this poses no real
8442          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
8443          */
8444 }
8445
8446 /*
8447  * Write an XLOG SWITCH record.
8448  *
8449  * Here we just blindly issue an XLogInsert request for the record.
8450  * All the magic happens inside XLogInsert.
8451  *
8452  * The return value is either the end+1 address of the switch record,
8453  * or the end+1 address of the prior segment if we did not need to
8454  * write a switch record because we are already at segment start.
8455  */
8456 XLogRecPtr
8457 RequestXLogSwitch(void)
8458 {
8459         XLogRecPtr      RecPtr;
8460
8461         /* XLOG SWITCH has no data */
8462         XLogBeginInsert();
8463         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
8464
8465         return RecPtr;
8466 }
8467
8468 /*
8469  * Write a RESTORE POINT record
8470  */
8471 XLogRecPtr
8472 XLogRestorePoint(const char *rpName)
8473 {
8474         XLogRecPtr      RecPtr;
8475         xl_restore_point xlrec;
8476
8477         xlrec.rp_time = GetCurrentTimestamp();
8478         strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
8479
8480         XLogBeginInsert();
8481         XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
8482
8483         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
8484
8485         ereport(LOG,
8486                         (errmsg("restore point \"%s\" created at %X/%X",
8487                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
8488
8489         return RecPtr;
8490 }
8491
8492 /*
8493  * Check if any of the GUC parameters that are critical for hot standby
8494  * have changed, and update the value in pg_control file if necessary.
8495  */
8496 static void
8497 XLogReportParameters(void)
8498 {
8499         if (wal_level != ControlFile->wal_level ||
8500                 wal_log_hints != ControlFile->wal_log_hints ||
8501                 MaxConnections != ControlFile->MaxConnections ||
8502                 max_worker_processes != ControlFile->max_worker_processes ||
8503                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
8504                 max_locks_per_xact != ControlFile->max_locks_per_xact ||
8505                 track_commit_timestamp != ControlFile->track_commit_timestamp)
8506         {
8507                 /*
8508                  * The change in number of backend slots doesn't need to be WAL-logged
8509                  * if archiving is not enabled, as you can't start archive recovery
8510                  * with wal_level=minimal anyway. We don't really care about the
8511                  * values in pg_control either if wal_level=minimal, but seems better
8512                  * to keep them up-to-date to avoid confusion.
8513                  */
8514                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
8515                 {
8516                         xl_parameter_change xlrec;
8517                         XLogRecPtr      recptr;
8518
8519                         xlrec.MaxConnections = MaxConnections;
8520                         xlrec.max_worker_processes = max_worker_processes;
8521                         xlrec.max_prepared_xacts = max_prepared_xacts;
8522                         xlrec.max_locks_per_xact = max_locks_per_xact;
8523                         xlrec.wal_level = wal_level;
8524                         xlrec.wal_log_hints = wal_log_hints;
8525                         xlrec.track_commit_timestamp = track_commit_timestamp;
8526
8527                         XLogBeginInsert();
8528                         XLogRegisterData((char *) &xlrec, sizeof(xlrec));
8529
8530                         recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
8531                         XLogFlush(recptr);
8532                 }
8533
8534                 ControlFile->MaxConnections = MaxConnections;
8535                 ControlFile->max_worker_processes = max_worker_processes;
8536                 ControlFile->max_prepared_xacts = max_prepared_xacts;
8537                 ControlFile->max_locks_per_xact = max_locks_per_xact;
8538                 ControlFile->wal_level = wal_level;
8539                 ControlFile->wal_log_hints = wal_log_hints;
8540                 ControlFile->track_commit_timestamp = track_commit_timestamp;
8541                 UpdateControlFile();
8542         }
8543 }
8544
8545 /*
8546  * Update full_page_writes in shared memory, and write an
8547  * XLOG_FPW_CHANGE record if necessary.
8548  *
8549  * Note: this function assumes there is no other process running
8550  * concurrently that could update it.
8551  */
8552 void
8553 UpdateFullPageWrites(void)
8554 {
8555         XLogCtlInsert *Insert = &XLogCtl->Insert;
8556
8557         /*
8558          * Do nothing if full_page_writes has not been changed.
8559          *
8560          * It's safe to check the shared full_page_writes without the lock,
8561          * because we assume that there is no concurrently running process which
8562          * can update it.
8563          */
8564         if (fullPageWrites == Insert->fullPageWrites)
8565                 return;
8566
8567         START_CRIT_SECTION();
8568
8569         /*
8570          * It's always safe to take full page images, even when not strictly
8571          * required, but not the other round. So if we're setting full_page_writes
8572          * to true, first set it true and then write the WAL record. If we're
8573          * setting it to false, first write the WAL record and then set the global
8574          * flag.
8575          */
8576         if (fullPageWrites)
8577         {
8578                 WALInsertLockAcquireExclusive();
8579                 Insert->fullPageWrites = true;
8580                 WALInsertLockRelease();
8581         }
8582
8583         /*
8584          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
8585          * full_page_writes during archive recovery, if required.
8586          */
8587         if (XLogStandbyInfoActive() && !RecoveryInProgress())
8588         {
8589                 XLogBeginInsert();
8590                 XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
8591
8592                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
8593         }
8594
8595         if (!fullPageWrites)
8596         {
8597                 WALInsertLockAcquireExclusive();
8598                 Insert->fullPageWrites = false;
8599                 WALInsertLockRelease();
8600         }
8601         END_CRIT_SECTION();
8602 }
8603
8604 /*
8605  * Check that it's OK to switch to new timeline during recovery.
8606  *
8607  * 'lsn' is the address of the shutdown checkpoint record we're about to
8608  * replay. (Currently, timeline can only change at a shutdown checkpoint).
8609  */
8610 static void
8611 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
8612 {
8613         /* Check that the record agrees on what the current (old) timeline is */
8614         if (prevTLI != ThisTimeLineID)
8615                 ereport(PANIC,
8616                                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
8617                                                 prevTLI, ThisTimeLineID)));
8618
8619         /*
8620          * The new timeline better be in the list of timelines we expect to see,
8621          * according to the timeline history. It should also not decrease.
8622          */
8623         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
8624                 ereport(PANIC,
8625                  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
8626                                  newTLI, ThisTimeLineID)));
8627
8628         /*
8629          * If we have not yet reached min recovery point, and we're about to
8630          * switch to a timeline greater than the timeline of the min recovery
8631          * point: trouble. After switching to the new timeline, we could not
8632          * possibly visit the min recovery point on the correct timeline anymore.
8633          * This can happen if there is a newer timeline in the archive that
8634          * branched before the timeline the min recovery point is on, and you
8635          * attempt to do PITR to the new timeline.
8636          */
8637         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
8638                 lsn < minRecoveryPoint &&
8639                 newTLI > minRecoveryPointTLI)
8640                 ereport(PANIC,
8641                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
8642                                                 newTLI,
8643                                                 (uint32) (minRecoveryPoint >> 32),
8644                                                 (uint32) minRecoveryPoint,
8645                                                 minRecoveryPointTLI)));
8646
8647         /* Looks good */
8648 }
8649
8650 /*
8651  * XLOG resource manager's routines
8652  *
8653  * Definitions of info values are in include/catalog/pg_control.h, though
8654  * not all record types are related to control file updates.
8655  */
8656 void
8657 xlog_redo(XLogReaderState *record)
8658 {
8659         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
8660         XLogRecPtr      lsn = record->EndRecPtr;
8661
8662         /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
8663         Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
8664                    !XLogRecHasAnyBlockRefs(record));
8665
8666         if (info == XLOG_NEXTOID)
8667         {
8668                 Oid                     nextOid;
8669
8670                 /*
8671                  * We used to try to take the maximum of ShmemVariableCache->nextOid
8672                  * and the recorded nextOid, but that fails if the OID counter wraps
8673                  * around.  Since no OID allocation should be happening during replay
8674                  * anyway, better to just believe the record exactly.  We still take
8675                  * OidGenLock while setting the variable, just in case.
8676                  */
8677                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
8678                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8679                 ShmemVariableCache->nextOid = nextOid;
8680                 ShmemVariableCache->oidCount = 0;
8681                 LWLockRelease(OidGenLock);
8682         }
8683         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
8684         {
8685                 CheckPoint      checkPoint;
8686
8687                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8688                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
8689                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8690                 ShmemVariableCache->nextXid = checkPoint.nextXid;
8691                 LWLockRelease(XidGenLock);
8692                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8693                 ShmemVariableCache->nextOid = checkPoint.nextOid;
8694                 ShmemVariableCache->oidCount = 0;
8695                 LWLockRelease(OidGenLock);
8696                 MultiXactSetNextMXact(checkPoint.nextMulti,
8697                                                           checkPoint.nextMultiOffset);
8698                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
8699                 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
8700                 MultiXactSetSafeTruncate(checkPoint.oldestMulti);
8701
8702                 /*
8703                  * If we see a shutdown checkpoint while waiting for an end-of-backup
8704                  * record, the backup was canceled and the end-of-backup record will
8705                  * never arrive.
8706                  */
8707                 if (ArchiveRecoveryRequested &&
8708                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
8709                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
8710                         ereport(PANIC,
8711                         (errmsg("online backup was canceled, recovery cannot continue")));
8712
8713                 /*
8714                  * If we see a shutdown checkpoint, we know that nothing was running
8715                  * on the master at this point. So fake-up an empty running-xacts
8716                  * record and use that here and now. Recover additional standby state
8717                  * for prepared transactions.
8718                  */
8719                 if (standbyState >= STANDBY_INITIALIZED)
8720                 {
8721                         TransactionId *xids;
8722                         int                     nxids;
8723                         TransactionId oldestActiveXID;
8724                         TransactionId latestCompletedXid;
8725                         RunningTransactionsData running;
8726
8727                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
8728
8729                         /*
8730                          * Construct a RunningTransactions snapshot representing a shut
8731                          * down server, with only prepared transactions still alive. We're
8732                          * never overflowed at this point because all subxids are listed
8733                          * with their parent prepared transactions.
8734                          */
8735                         running.xcnt = nxids;
8736                         running.subxcnt = 0;
8737                         running.subxid_overflow = false;
8738                         running.nextXid = checkPoint.nextXid;
8739                         running.oldestRunningXid = oldestActiveXID;
8740                         latestCompletedXid = checkPoint.nextXid;
8741                         TransactionIdRetreat(latestCompletedXid);
8742                         Assert(TransactionIdIsNormal(latestCompletedXid));
8743                         running.latestCompletedXid = latestCompletedXid;
8744                         running.xids = xids;
8745
8746                         ProcArrayApplyRecoveryInfo(&running);
8747
8748                         StandbyRecoverPreparedTransactions(true);
8749                 }
8750
8751                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8752                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
8753                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8754
8755                 /* Update shared-memory copy of checkpoint XID/epoch */
8756                 SpinLockAcquire(&XLogCtl->info_lck);
8757                 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
8758                 XLogCtl->ckptXid = checkPoint.nextXid;
8759                 SpinLockRelease(&XLogCtl->info_lck);
8760
8761                 /*
8762                  * We should've already switched to the new TLI before replaying this
8763                  * record.
8764                  */
8765                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
8766                         ereport(PANIC,
8767                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
8768                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
8769
8770                 RecoveryRestartPoint(&checkPoint);
8771         }
8772         else if (info == XLOG_CHECKPOINT_ONLINE)
8773         {
8774                 CheckPoint      checkPoint;
8775
8776                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8777                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
8778                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8779                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
8780                                                                   checkPoint.nextXid))
8781                         ShmemVariableCache->nextXid = checkPoint.nextXid;
8782                 LWLockRelease(XidGenLock);
8783                 /* ... but still treat OID counter as exact */
8784                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8785                 ShmemVariableCache->nextOid = checkPoint.nextOid;
8786                 ShmemVariableCache->oidCount = 0;
8787                 LWLockRelease(OidGenLock);
8788                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
8789                                                                   checkPoint.nextMultiOffset);
8790                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
8791                                                                   checkPoint.oldestXid))
8792                         SetTransactionIdLimit(checkPoint.oldestXid,
8793                                                                   checkPoint.oldestXidDB);
8794                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
8795                                                            checkPoint.oldestMultiDB);
8796                 MultiXactSetSafeTruncate(checkPoint.oldestMulti);
8797
8798                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8799                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
8800                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8801
8802                 /* Update shared-memory copy of checkpoint XID/epoch */
8803                 SpinLockAcquire(&XLogCtl->info_lck);
8804                 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
8805                 XLogCtl->ckptXid = checkPoint.nextXid;
8806                 SpinLockRelease(&XLogCtl->info_lck);
8807
8808                 /* TLI should not change in an on-line checkpoint */
8809                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
8810                         ereport(PANIC,
8811                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
8812                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
8813
8814                 RecoveryRestartPoint(&checkPoint);
8815         }
8816         else if (info == XLOG_END_OF_RECOVERY)
8817         {
8818                 xl_end_of_recovery xlrec;
8819
8820                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
8821
8822                 /*
8823                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
8824                  * but this case is rarer and harder to test, so the benefit doesn't
8825                  * outweigh the potential extra cost of maintenance.
8826                  */
8827
8828                 /*
8829                  * We should've already switched to the new TLI before replaying this
8830                  * record.
8831                  */
8832                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
8833                         ereport(PANIC,
8834                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
8835                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
8836         }
8837         else if (info == XLOG_NOOP)
8838         {
8839                 /* nothing to do here */
8840         }
8841         else if (info == XLOG_SWITCH)
8842         {
8843                 /* nothing to do here */
8844         }
8845         else if (info == XLOG_RESTORE_POINT)
8846         {
8847                 /* nothing to do here */
8848         }
8849         else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
8850         {
8851                 Buffer          buffer;
8852
8853                 /*
8854                  * Full-page image (FPI) records contain nothing else but a backup
8855                  * block. The block reference must include a full-page image -
8856                  * otherwise there would be no point in this record.
8857                  *
8858                  * No recovery conflicts are generated by these generic records - if a
8859                  * resource manager needs to generate conflicts, it has to define a
8860                  * separate WAL record type and redo routine.
8861                  *
8862                  * XLOG_FPI_FOR_HINT records are generated when a page needs to be
8863                  * WAL-logged because of a hint bit update. They are only generated
8864                  * when checksums are enabled. There is no difference in handling
8865                  * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
8866                  * code just to distinguish them for statistics purposes.
8867                  */
8868                 if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED)
8869                         elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
8870                 UnlockReleaseBuffer(buffer);
8871         }
8872         else if (info == XLOG_BACKUP_END)
8873         {
8874                 XLogRecPtr      startpoint;
8875
8876                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
8877
8878                 if (ControlFile->backupStartPoint == startpoint)
8879                 {
8880                         /*
8881                          * We have reached the end of base backup, the point where
8882                          * pg_stop_backup() was done. The data on disk is now consistent.
8883                          * Reset backupStartPoint, and update minRecoveryPoint to make
8884                          * sure we don't allow starting up at an earlier point even if
8885                          * recovery is stopped and restarted soon after this.
8886                          */
8887                         elog(DEBUG1, "end of backup reached");
8888
8889                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8890
8891                         if (ControlFile->minRecoveryPoint < lsn)
8892                         {
8893                                 ControlFile->minRecoveryPoint = lsn;
8894                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8895                         }
8896                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
8897                         ControlFile->backupEndRequired = false;
8898                         UpdateControlFile();
8899
8900                         LWLockRelease(ControlFileLock);
8901                 }
8902         }
8903         else if (info == XLOG_PARAMETER_CHANGE)
8904         {
8905                 xl_parameter_change xlrec;
8906
8907                 /* Update our copy of the parameters in pg_control */
8908                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
8909
8910                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8911                 ControlFile->MaxConnections = xlrec.MaxConnections;
8912                 ControlFile->max_worker_processes = xlrec.max_worker_processes;
8913                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
8914                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
8915                 ControlFile->wal_level = xlrec.wal_level;
8916                 ControlFile->wal_log_hints = wal_log_hints;
8917                 ControlFile->track_commit_timestamp = track_commit_timestamp;
8918
8919                 /*
8920                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
8921                  * recover back up to this point before allowing hot standby again.
8922                  * This is particularly important if wal_level was set to 'archive'
8923                  * before, and is now 'hot_standby', to ensure you don't run queries
8924                  * against the WAL preceding the wal_level change. Same applies to
8925                  * decreasing max_* settings.
8926                  */
8927                 minRecoveryPoint = ControlFile->minRecoveryPoint;
8928                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
8929                 if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
8930                 {
8931                         ControlFile->minRecoveryPoint = lsn;
8932                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8933                 }
8934
8935                 UpdateControlFile();
8936                 LWLockRelease(ControlFileLock);
8937
8938                 /* Check to see if any changes to max_connections give problems */
8939                 CheckRequiredParameterValues();
8940         }
8941         else if (info == XLOG_FPW_CHANGE)
8942         {
8943                 bool            fpw;
8944
8945                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
8946
8947                 /*
8948                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
8949                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
8950                  * full_page_writes has been disabled during online backup.
8951                  */
8952                 if (!fpw)
8953                 {
8954                         SpinLockAcquire(&XLogCtl->info_lck);
8955                         if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
8956                                 XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
8957                         SpinLockRelease(&XLogCtl->info_lck);
8958                 }
8959
8960                 /* Keep track of full_page_writes */
8961                 lastFullPageWrites = fpw;
8962         }
8963 }
8964
8965 #ifdef WAL_DEBUG
8966
8967 static void
8968 xlog_outrec(StringInfo buf, XLogReaderState *record)
8969 {
8970         int                     block_id;
8971
8972         appendStringInfo(buf, "prev %X/%X; xid %u",
8973                                          (uint32) (XLogRecGetPrev(record) >> 32),
8974                                          (uint32) XLogRecGetPrev(record),
8975                                          XLogRecGetXid(record));
8976
8977         appendStringInfo(buf, "; len %u",
8978                                          XLogRecGetDataLen(record));
8979
8980         /* decode block references */
8981         for (block_id = 0; block_id <= record->max_block_id; block_id++)
8982         {
8983                 RelFileNode rnode;
8984                 ForkNumber      forknum;
8985                 BlockNumber blk;
8986
8987                 if (!XLogRecHasBlockRef(record, block_id))
8988                         continue;
8989
8990                 XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
8991                 if (forknum != MAIN_FORKNUM)
8992                         appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
8993                                                          block_id,
8994                                                          rnode.spcNode, rnode.dbNode, rnode.relNode,
8995                                                          forknum,
8996                                                          blk);
8997                 else
8998                         appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
8999                                                          block_id,
9000                                                          rnode.spcNode, rnode.dbNode, rnode.relNode,
9001                                                          blk);
9002                 if (XLogRecHasBlockImage(record, block_id))
9003                         appendStringInfo(buf, " FPW");
9004         }
9005 }
9006 #endif   /* WAL_DEBUG */
9007
9008 /*
9009  * Returns a string describing an XLogRecord, consisting of its identity
9010  * optionally followed by a colon, a space, and a further description.
9011  */
9012 static void
9013 xlog_outdesc(StringInfo buf, XLogReaderState *record)
9014 {
9015         RmgrId          rmid = XLogRecGetRmid(record);
9016         uint8           info = XLogRecGetInfo(record);
9017         const char *id;
9018
9019         appendStringInfoString(buf, RmgrTable[rmid].rm_name);
9020         appendStringInfoChar(buf, '/');
9021
9022         id = RmgrTable[rmid].rm_identify(info);
9023         if (id == NULL)
9024                 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
9025         else
9026                 appendStringInfo(buf, "%s: ", id);
9027
9028         RmgrTable[rmid].rm_desc(buf, record);
9029 }
9030
9031
9032 /*
9033  * Return the (possible) sync flag used for opening a file, depending on the
9034  * value of the GUC wal_sync_method.
9035  */
9036 static int
9037 get_sync_bit(int method)
9038 {
9039         int                     o_direct_flag = 0;
9040
9041         /* If fsync is disabled, never open in sync mode */
9042         if (!enableFsync)
9043                 return 0;
9044
9045         /*
9046          * Optimize writes by bypassing kernel cache with O_DIRECT when using
9047          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
9048          * disabled, otherwise the archive command or walsender process will read
9049          * the WAL soon after writing it, which is guaranteed to cause a physical
9050          * read if we bypassed the kernel cache. We also skip the
9051          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
9052          * reason.
9053          *
9054          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
9055          * written by walreceiver is normally read by the startup process soon
9056          * after its written. Also, walreceiver performs unaligned writes, which
9057          * don't work with O_DIRECT, so it is required for correctness too.
9058          */
9059         if (!XLogIsNeeded() && !AmWalReceiverProcess())
9060                 o_direct_flag = PG_O_DIRECT;
9061
9062         switch (method)
9063         {
9064                         /*
9065                          * enum values for all sync options are defined even if they are
9066                          * not supported on the current platform.  But if not, they are
9067                          * not included in the enum option array, and therefore will never
9068                          * be seen here.
9069                          */
9070                 case SYNC_METHOD_FSYNC:
9071                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9072                 case SYNC_METHOD_FDATASYNC:
9073                         return 0;
9074 #ifdef OPEN_SYNC_FLAG
9075                 case SYNC_METHOD_OPEN:
9076                         return OPEN_SYNC_FLAG | o_direct_flag;
9077 #endif
9078 #ifdef OPEN_DATASYNC_FLAG
9079                 case SYNC_METHOD_OPEN_DSYNC:
9080                         return OPEN_DATASYNC_FLAG | o_direct_flag;
9081 #endif
9082                 default:
9083                         /* can't happen (unless we are out of sync with option array) */
9084                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
9085                         return 0;                       /* silence warning */
9086         }
9087 }
9088
9089 /*
9090  * GUC support
9091  */
9092 void
9093 assign_xlog_sync_method(int new_sync_method, void *extra)
9094 {
9095         if (sync_method != new_sync_method)
9096         {
9097                 /*
9098                  * To ensure that no blocks escape unsynced, force an fsync on the
9099                  * currently open log segment (if any).  Also, if the open flag is
9100                  * changing, close the log file so it will be reopened (with new flag
9101                  * bit) at next use.
9102                  */
9103                 if (openLogFile >= 0)
9104                 {
9105                         if (pg_fsync(openLogFile) != 0)
9106                                 ereport(PANIC,
9107                                                 (errcode_for_file_access(),
9108                                                  errmsg("could not fsync log segment %s: %m",
9109                                                           XLogFileNameP(ThisTimeLineID, openLogSegNo))));
9110                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
9111                                 XLogFileClose();
9112                 }
9113         }
9114 }
9115
9116
9117 /*
9118  * Issue appropriate kind of fsync (if any) for an XLOG output file.
9119  *
9120  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
9121  * 'log' and 'seg' are for error reporting purposes.
9122  */
9123 void
9124 issue_xlog_fsync(int fd, XLogSegNo segno)
9125 {
9126         switch (sync_method)
9127         {
9128                 case SYNC_METHOD_FSYNC:
9129                         if (pg_fsync_no_writethrough(fd) != 0)
9130                                 ereport(PANIC,
9131                                                 (errcode_for_file_access(),
9132                                                  errmsg("could not fsync log file %s: %m",
9133                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9134                         break;
9135 #ifdef HAVE_FSYNC_WRITETHROUGH
9136                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9137                         if (pg_fsync_writethrough(fd) != 0)
9138                                 ereport(PANIC,
9139                                                 (errcode_for_file_access(),
9140                                           errmsg("could not fsync write-through log file %s: %m",
9141                                                          XLogFileNameP(ThisTimeLineID, segno))));
9142                         break;
9143 #endif
9144 #ifdef HAVE_FDATASYNC
9145                 case SYNC_METHOD_FDATASYNC:
9146                         if (pg_fdatasync(fd) != 0)
9147                                 ereport(PANIC,
9148                                                 (errcode_for_file_access(),
9149                                                  errmsg("could not fdatasync log file %s: %m",
9150                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9151                         break;
9152 #endif
9153                 case SYNC_METHOD_OPEN:
9154                 case SYNC_METHOD_OPEN_DSYNC:
9155                         /* write synced it already */
9156                         break;
9157                 default:
9158                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
9159                         break;
9160         }
9161 }
9162
9163 /*
9164  * Return the filename of given log segment, as a palloc'd string.
9165  */
9166 char *
9167 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
9168 {
9169         char       *result = palloc(MAXFNAMELEN);
9170
9171         XLogFileName(result, tli, segno);
9172         return result;
9173 }
9174
9175 /*
9176  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
9177  * function. It creates the necessary starting checkpoint and constructs the
9178  * backup label file.
9179  *
9180  * There are two kinds of backups: exclusive and non-exclusive. An exclusive
9181  * backup is started with pg_start_backup(), and there can be only one active
9182  * at a time. The backup label file of an exclusive backup is written to
9183  * $PGDATA/backup_label, and it is removed by pg_stop_backup().
9184  *
9185  * A non-exclusive backup is used for the streaming base backups (see
9186  * src/backend/replication/basebackup.c). The difference to exclusive backups
9187  * is that the backup label file is not written to disk. Instead, its would-be
9188  * contents are returned in *labelfile, and the caller is responsible for
9189  * including it in the backup archive as 'backup_label'. There can be many
9190  * non-exclusive backups active at the same time, and they don't conflict
9191  * with an exclusive backup either.
9192  *
9193  * Returns the minimum WAL position that must be present to restore from this
9194  * backup, and the corresponding timeline ID in *starttli_p.
9195  *
9196  * Every successfully started non-exclusive backup must be stopped by calling
9197  * do_pg_stop_backup() or do_pg_abort_backup().
9198  *
9199  * It is the responsibility of the caller of this function to verify the
9200  * permissions of the calling user!
9201  */
9202 XLogRecPtr
9203 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
9204                                    char **labelfile)
9205 {
9206         bool            exclusive = (labelfile == NULL);
9207         bool            backup_started_in_recovery = false;
9208         XLogRecPtr      checkpointloc;
9209         XLogRecPtr      startpoint;
9210         TimeLineID      starttli;
9211         pg_time_t       stamp_time;
9212         char            strfbuf[128];
9213         char            xlogfilename[MAXFNAMELEN];
9214         XLogSegNo       _logSegNo;
9215         struct stat stat_buf;
9216         FILE       *fp;
9217         StringInfoData labelfbuf;
9218
9219         backup_started_in_recovery = RecoveryInProgress();
9220
9221         /*
9222          * Currently only non-exclusive backup can be taken during recovery.
9223          */
9224         if (backup_started_in_recovery && exclusive)
9225                 ereport(ERROR,
9226                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9227                                  errmsg("recovery is in progress"),
9228                                  errhint("WAL control functions cannot be executed during recovery.")));
9229
9230         /*
9231          * During recovery, we don't need to check WAL level. Because, if WAL
9232          * level is not sufficient, it's impossible to get here during recovery.
9233          */
9234         if (!backup_started_in_recovery && !XLogIsNeeded())
9235                 ereport(ERROR,
9236                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9237                           errmsg("WAL level not sufficient for making an online backup"),
9238                                  errhint("wal_level must be set to \"archive\", \"hot_standby\", or \"logical\" at server start.")));
9239
9240         if (strlen(backupidstr) > MAXPGPATH)
9241                 ereport(ERROR,
9242                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
9243                                  errmsg("backup label too long (max %d bytes)",
9244                                                 MAXPGPATH)));
9245
9246         /*
9247          * Mark backup active in shared memory.  We must do full-page WAL writes
9248          * during an on-line backup even if not doing so at other times, because
9249          * it's quite possible for the backup dump to obtain a "torn" (partially
9250          * written) copy of a database page if it reads the page concurrently with
9251          * our write to the same page.  This can be fixed as long as the first
9252          * write to the page in the WAL sequence is a full-page write. Hence, we
9253          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
9254          * are no dirty pages in shared memory that might get dumped while the
9255          * backup is in progress without having a corresponding WAL record.  (Once
9256          * the backup is complete, we need not force full-page writes anymore,
9257          * since we expect that any pages not modified during the backup interval
9258          * must have been correctly captured by the backup.)
9259          *
9260          * Note that forcePageWrites has no effect during an online backup from
9261          * the standby.
9262          *
9263          * We must hold all the insertion locks to change the value of
9264          * forcePageWrites, to ensure adequate interlocking against
9265          * XLogInsertRecord().
9266          */
9267         WALInsertLockAcquireExclusive();
9268         if (exclusive)
9269         {
9270                 if (XLogCtl->Insert.exclusiveBackup)
9271                 {
9272                         WALInsertLockRelease();
9273                         ereport(ERROR,
9274                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9275                                          errmsg("a backup is already in progress"),
9276                                          errhint("Run pg_stop_backup() and try again.")));
9277                 }
9278                 XLogCtl->Insert.exclusiveBackup = true;
9279         }
9280         else
9281                 XLogCtl->Insert.nonExclusiveBackups++;
9282         XLogCtl->Insert.forcePageWrites = true;
9283         WALInsertLockRelease();
9284
9285         /* Ensure we release forcePageWrites if fail below */
9286         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9287         {
9288                 bool            gotUniqueStartpoint = false;
9289
9290                 /*
9291                  * Force an XLOG file switch before the checkpoint, to ensure that the
9292                  * WAL segment the checkpoint is written to doesn't contain pages with
9293                  * old timeline IDs.  That would otherwise happen if you called
9294                  * pg_start_backup() right after restoring from a PITR archive: the
9295                  * first WAL segment containing the startup checkpoint has pages in
9296                  * the beginning with the old timeline ID.  That can cause trouble at
9297                  * recovery: we won't have a history file covering the old timeline if
9298                  * pg_xlog directory was not included in the base backup and the WAL
9299                  * archive was cleared too before starting the backup.
9300                  *
9301                  * This also ensures that we have emitted a WAL page header that has
9302                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
9303                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
9304                  * compress out removable backup blocks, it won't remove any that
9305                  * occur after this point.
9306                  *
9307                  * During recovery, we skip forcing XLOG file switch, which means that
9308                  * the backup taken during recovery is not available for the special
9309                  * recovery case described above.
9310                  */
9311                 if (!backup_started_in_recovery)
9312                         RequestXLogSwitch();
9313
9314                 do
9315                 {
9316                         bool            checkpointfpw;
9317
9318                         /*
9319                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
9320                          * page problems, this guarantees that two successive backup runs
9321                          * will have different checkpoint positions and hence different
9322                          * history file names, even if nothing happened in between.
9323                          *
9324                          * During recovery, establish a restartpoint if possible. We use
9325                          * the last restartpoint as the backup starting checkpoint. This
9326                          * means that two successive backup runs can have same checkpoint
9327                          * positions.
9328                          *
9329                          * Since the fact that we are executing do_pg_start_backup()
9330                          * during recovery means that checkpointer is running, we can use
9331                          * RequestCheckpoint() to establish a restartpoint.
9332                          *
9333                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
9334                          * passing fast = true).  Otherwise this can take awhile.
9335                          */
9336                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
9337                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
9338
9339                         /*
9340                          * Now we need to fetch the checkpoint record location, and also
9341                          * its REDO pointer.  The oldest point in WAL that would be needed
9342                          * to restore starting from the checkpoint is precisely the REDO
9343                          * pointer.
9344                          */
9345                         LWLockAcquire(ControlFileLock, LW_SHARED);
9346                         checkpointloc = ControlFile->checkPoint;
9347                         startpoint = ControlFile->checkPointCopy.redo;
9348                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
9349                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
9350                         LWLockRelease(ControlFileLock);
9351
9352                         if (backup_started_in_recovery)
9353                         {
9354                                 XLogRecPtr      recptr;
9355
9356                                 /*
9357                                  * Check to see if all WAL replayed during online backup
9358                                  * (i.e., since last restartpoint used as backup starting
9359                                  * checkpoint) contain full-page writes.
9360                                  */
9361                                 SpinLockAcquire(&XLogCtl->info_lck);
9362                                 recptr = XLogCtl->lastFpwDisableRecPtr;
9363                                 SpinLockRelease(&XLogCtl->info_lck);
9364
9365                                 if (!checkpointfpw || startpoint <= recptr)
9366                                         ereport(ERROR,
9367                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9368                                                    errmsg("WAL generated with full_page_writes=off was replayed "
9369                                                                   "since last restartpoint"),
9370                                                    errhint("This means that the backup being taken on the standby "
9371                                                                    "is corrupt and should not be used. "
9372                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
9373                                                                    "and then try an online backup again.")));
9374
9375                                 /*
9376                                  * During recovery, since we don't use the end-of-backup WAL
9377                                  * record and don't write the backup history file, the
9378                                  * starting WAL location doesn't need to be unique. This means
9379                                  * that two base backups started at the same time might use
9380                                  * the same checkpoint as starting locations.
9381                                  */
9382                                 gotUniqueStartpoint = true;
9383                         }
9384
9385                         /*
9386                          * If two base backups are started at the same time (in WAL sender
9387                          * processes), we need to make sure that they use different
9388                          * checkpoints as starting locations, because we use the starting
9389                          * WAL location as a unique identifier for the base backup in the
9390                          * end-of-backup WAL record and when we write the backup history
9391                          * file. Perhaps it would be better generate a separate unique ID
9392                          * for each backup instead of forcing another checkpoint, but
9393                          * taking a checkpoint right after another is not that expensive
9394                          * either because only few buffers have been dirtied yet.
9395                          */
9396                         WALInsertLockAcquireExclusive();
9397                         if (XLogCtl->Insert.lastBackupStart < startpoint)
9398                         {
9399                                 XLogCtl->Insert.lastBackupStart = startpoint;
9400                                 gotUniqueStartpoint = true;
9401                         }
9402                         WALInsertLockRelease();
9403                 } while (!gotUniqueStartpoint);
9404
9405                 XLByteToSeg(startpoint, _logSegNo);
9406                 XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);
9407
9408                 /*
9409                  * Construct backup label file
9410                  */
9411                 initStringInfo(&labelfbuf);
9412
9413                 /* Use the log timezone here, not the session timezone */
9414                 stamp_time = (pg_time_t) time(NULL);
9415                 pg_strftime(strfbuf, sizeof(strfbuf),
9416                                         "%Y-%m-%d %H:%M:%S %Z",
9417                                         pg_localtime(&stamp_time, log_timezone));
9418                 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
9419                          (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
9420                 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
9421                                          (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
9422                 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
9423                                                  exclusive ? "pg_start_backup" : "streamed");
9424                 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
9425                                                  backup_started_in_recovery ? "standby" : "master");
9426                 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
9427                 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
9428
9429                 /*
9430                  * Okay, write the file, or return its contents to caller.
9431                  */
9432                 if (exclusive)
9433                 {
9434                         /*
9435                          * Check for existing backup label --- implies a backup is already
9436                          * running.  (XXX given that we checked exclusiveBackup above,
9437                          * maybe it would be OK to just unlink any such label file?)
9438                          */
9439                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
9440                         {
9441                                 if (errno != ENOENT)
9442                                         ereport(ERROR,
9443                                                         (errcode_for_file_access(),
9444                                                          errmsg("could not stat file \"%s\": %m",
9445                                                                         BACKUP_LABEL_FILE)));
9446                         }
9447                         else
9448                                 ereport(ERROR,
9449                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9450                                                  errmsg("a backup is already in progress"),
9451                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
9452                                                                  BACKUP_LABEL_FILE)));
9453
9454                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
9455
9456                         if (!fp)
9457                                 ereport(ERROR,
9458                                                 (errcode_for_file_access(),
9459                                                  errmsg("could not create file \"%s\": %m",
9460                                                                 BACKUP_LABEL_FILE)));
9461                         if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
9462                                 fflush(fp) != 0 ||
9463                                 pg_fsync(fileno(fp)) != 0 ||
9464                                 ferror(fp) ||
9465                                 FreeFile(fp))
9466                                 ereport(ERROR,
9467                                                 (errcode_for_file_access(),
9468                                                  errmsg("could not write file \"%s\": %m",
9469                                                                 BACKUP_LABEL_FILE)));
9470                         pfree(labelfbuf.data);
9471                 }
9472                 else
9473                         *labelfile = labelfbuf.data;
9474         }
9475         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9476
9477         /*
9478          * We're done.  As a convenience, return the starting WAL location.
9479          */
9480         if (starttli_p)
9481                 *starttli_p = starttli;
9482         return startpoint;
9483 }
9484
9485 /* Error cleanup callback for pg_start_backup */
9486 static void
9487 pg_start_backup_callback(int code, Datum arg)
9488 {
9489         bool            exclusive = DatumGetBool(arg);
9490
9491         /* Update backup counters and forcePageWrites on failure */
9492         WALInsertLockAcquireExclusive();
9493         if (exclusive)
9494         {
9495                 Assert(XLogCtl->Insert.exclusiveBackup);
9496                 XLogCtl->Insert.exclusiveBackup = false;
9497         }
9498         else
9499         {
9500                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9501                 XLogCtl->Insert.nonExclusiveBackups--;
9502         }
9503
9504         if (!XLogCtl->Insert.exclusiveBackup &&
9505                 XLogCtl->Insert.nonExclusiveBackups == 0)
9506         {
9507                 XLogCtl->Insert.forcePageWrites = false;
9508         }
9509         WALInsertLockRelease();
9510 }
9511
9512 /*
9513  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
9514  * function.
9515  *
9516  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
9517  * the non-exclusive backup specified by 'labelfile'.
      *
      * In the exclusive case the label text is read from BACKUP_LABEL_FILE,
      * which is then removed. In the non-exclusive case 'labelfile' itself is
      * parsed as the label text.
      *
      * If waitforarchive is true and WAL archiving is active, this does not
      * return until XLogArchiveIsBusy() reports neither the last WAL segment
      * of the backup nor the backup history file as busy. If waitforarchive is
      * true but archiving is not active, only a NOTICE is issued.
9518  *
9519  * Returns the last WAL position that must be present to restore from this
9520  * backup, and the corresponding timeline ID in *stoptli_p.
      * stoptli_p may be NULL, in which case the timeline ID is not returned.
      *
      * When called during recovery, the control file's minimum recovery point
      * is returned and no end-of-backup record or backup history file is
      * written (see the comments in the function body).
      *
      * Problems (exclusive backup requested during recovery, insufficient WAL
      * level, missing or malformed label, standby promoted during the backup,
      * WAL without full-page writes replayed during the backup, file I/O
      * failures) are reported with ereport(ERROR).
9521  *
9522  * It is the responsibility of the caller of this function to verify the
9523  * permissions of the calling user!
9524  */
9525 XLogRecPtr
9526 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
9527 {
9528         bool            exclusive = (labelfile == NULL);
9529         bool            backup_started_in_recovery = false;
9530         XLogRecPtr      startpoint;
9531         XLogRecPtr      stoppoint;
9532         TimeLineID      stoptli;
9533         pg_time_t       stamp_time;
9534         char            strfbuf[128];
9535         char            histfilepath[MAXPGPATH];
9536         char            startxlogfilename[MAXFNAMELEN];
9537         char            stopxlogfilename[MAXFNAMELEN];
9538         char            lastxlogfilename[MAXFNAMELEN];
9539         char            histfilename[MAXFNAMELEN];
9540         char            backupfrom[20];
9541         XLogSegNo       _logSegNo;
9542         FILE       *lfp;
9543         FILE       *fp;
9544         char            ch;
9545         int                     seconds_before_warning;
9546         int                     waits = 0;
9547         bool            reported_waiting = false;
9548         char       *remaining;
9549         char       *ptr;
9550         uint32          hi,
9551                                 lo;
9552
9553         backup_started_in_recovery = RecoveryInProgress();
9554
9555         /*
9556          * Currently only non-exclusive backup can be taken during recovery.
9557          */
9558         if (backup_started_in_recovery && exclusive)
9559                 ereport(ERROR,
9560                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9561                                  errmsg("recovery is in progress"),
9562                                  errhint("WAL control functions cannot be executed during recovery.")));
9563
9564         /*
9565          * During recovery, we don't need to check WAL level. Because, if WAL
9566          * level is not sufficient, it's impossible to get here during recovery.
9567          */
9568         if (!backup_started_in_recovery && !XLogIsNeeded())
9569                 ereport(ERROR,
9570                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9571                           errmsg("WAL level not sufficient for making an online backup"),
9572                                  errhint("wal_level must be set to \"archive\", \"hot_standby\", or \"logical\" at server start.")));
9573
9574         /*
9575          * OK to update backup counters and forcePageWrites
              *
              * Note that the shared backup state is cleared here, before the label
              * has been read or validated; nothing later in this function puts it
              * back if one of the ereport(ERROR) calls below fires.
9576          */
9577         WALInsertLockAcquireExclusive();
9578         if (exclusive)
9579                 XLogCtl->Insert.exclusiveBackup = false;
9580         else
9581         {
9582                 /*
9583                  * The user-visible pg_start/stop_backup() functions that operate on
9584                  * exclusive backups can be called at any time, but for non-exclusive
9585                  * backups, it is expected that each do_pg_start_backup() call is
9586                  * matched by exactly one do_pg_stop_backup() call.
9587                  */
9588                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9589                 XLogCtl->Insert.nonExclusiveBackups--;
9590         }
9591
9592         if (!XLogCtl->Insert.exclusiveBackup &&
9593                 XLogCtl->Insert.nonExclusiveBackups == 0)
9594         {
9595                 XLogCtl->Insert.forcePageWrites = false;
9596         }
9597         WALInsertLockRelease();
9598
9599         if (exclusive)
9600         {
9601                 /*
9602                  * Read the existing label file into memory.
9603                  */
9604                 struct stat statbuf;
9605                 int                     r;
9606
9607                 if (stat(BACKUP_LABEL_FILE, &statbuf))
9608                 {
9609                         if (errno != ENOENT)
9610                                 ereport(ERROR,
9611                                                 (errcode_for_file_access(),
9612                                                  errmsg("could not stat file \"%s\": %m",
9613                                                                 BACKUP_LABEL_FILE)));
9614                         ereport(ERROR,
9615                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9616                                          errmsg("a backup is not in progress")));
9617                 }
9618
9619                 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9620                 if (!lfp)
9621                 {
9622                         ereport(ERROR,
9623                                         (errcode_for_file_access(),
9624                                          errmsg("could not read file \"%s\": %m",
9625                                                         BACKUP_LABEL_FILE)));
9626                 }
                     /*
                      * The whole file is requested as a single item of st_size bytes, so
                      * fread() yields 1 on success; the extra byte allocated holds the
                      * string terminator.
                      */
9627                 labelfile = palloc(statbuf.st_size + 1);
9628                 r = fread(labelfile, statbuf.st_size, 1, lfp);
9629                 labelfile[statbuf.st_size] = '\0';
9630
9631                 /*
9632                  * Close and remove the backup label file
9633                  */
9634                 if (r != 1 || ferror(lfp) || FreeFile(lfp))
9635                         ereport(ERROR,
9636                                         (errcode_for_file_access(),
9637                                          errmsg("could not read file \"%s\": %m",
9638                                                         BACKUP_LABEL_FILE)));
9639                 if (unlink(BACKUP_LABEL_FILE) != 0)
9640                         ereport(ERROR,
9641                                         (errcode_for_file_access(),
9642                                          errmsg("could not remove file \"%s\": %m",
9643                                                         BACKUP_LABEL_FILE)));
9644         }
9645
9646         /*
9647          * Read and parse the START WAL LOCATION line (this code is pretty crude,
9648          * but we are not expecting any variability in the file format).
9649          */
9650         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
9651                            &hi, &lo, startxlogfilename,
9652                            &ch) != 4 || ch != '\n')
9653                 ereport(ERROR,
9654                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9655                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9656         startpoint = ((uint64) hi) << 32 | lo;
             /*
              * The sscanf() check above has established that the first line ends in
              * a newline, so strchr() finds one here and 'remaining' points at the
              * start of the second line.
              */
9657         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
9658
9659         /*
9660          * Parse the BACKUP FROM line. If we are taking an online backup from the
9661          * standby, we confirm that the standby has not been promoted during the
9662          * backup.
9663          */
9664         ptr = strstr(remaining, "BACKUP FROM:");
9665         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
9666                 ereport(ERROR,
9667                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9668                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9669         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
9670                 ereport(ERROR,
9671                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9672                                  errmsg("the standby was promoted during online backup"),
9673                                  errhint("This means that the backup being taken is corrupt "
9674                                                  "and should not be used. "
9675                                                  "Try taking another online backup.")));
9676
9677         /*
9678          * During recovery, we don't write an end-of-backup record. We assume that
9679          * pg_control was backed up last and its minimum recovery point can be
9680          * available as the backup end location. Since we don't have an
9681          * end-of-backup record, we use the pg_control value to check whether
9682          * we've reached the end of backup when starting recovery from this
9683          * backup. We have no way of checking if pg_control wasn't backed up last
9684          * however.
9685          *
9686          * We don't force a switch to new WAL file and wait for all the required
9687          * files to be archived. This is okay if we use the backup to start the
9688          * standby. But, if it's for an archive recovery, to ensure all the
9689          * required files are available, a user should wait for them to be
9690          * archived, or include them into the backup.
9691          *
9692          * We return the current minimum recovery point as the backup end
9693          * location. Note that it can be greater than the exact backup end
9694          * location if the minimum recovery point is updated after the backup of
9695          * pg_control. This is harmless for current uses.
9696          *
9697          * XXX currently a backup history file is for informational and debug
9698          * purposes only. It's not essential for an online backup. Furthermore,
9699          * even if it's created, it will not be archived during recovery because
9700          * an archiver is not invoked. So it doesn't seem worthwhile to write a
9701          * backup history file during recovery.
9702          */
9703         if (backup_started_in_recovery)
9704         {
9705                 XLogRecPtr      recptr;
9706
9707                 /*
9708                  * Check to see if all WAL replayed during online backup contain
9709                  * full-page writes.
9710                  */
9711                 SpinLockAcquire(&XLogCtl->info_lck);
9712                 recptr = XLogCtl->lastFpwDisableRecPtr;
9713                 SpinLockRelease(&XLogCtl->info_lck);
9714
9715                 if (startpoint <= recptr)
9716                         ereport(ERROR,
9717                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9718                            errmsg("WAL generated with full_page_writes=off was replayed "
9719                                           "during online backup"),
9720                          errhint("This means that the backup being taken on the standby "
9721                                          "is corrupt and should not be used. "
9722                                  "Enable full_page_writes and run CHECKPOINT on the master, "
9723                                          "and then try an online backup again.")));
9724
9725
9726                 LWLockAcquire(ControlFileLock, LW_SHARED);
9727                 stoppoint = ControlFile->minRecoveryPoint;
9728                 stoptli = ControlFile->minRecoveryPointTLI;
9729                 LWLockRelease(ControlFileLock);
9730
                     /*
                      * Early exit: the end-of-backup record, history file and archive
                      * wait below are all skipped on this path.
                      */
9731                 if (stoptli_p)
9732                         *stoptli_p = stoptli;
9733                 return stoppoint;
9734         }
9735
9736         /*
9737          * Write the backup-end xlog record
              *
              * The record's payload is the backup's start location.
9738          */
9739         XLogBeginInsert();
9740         XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
9741         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
9742         stoptli = ThisTimeLineID;
9743
9744         /*
9745          * Force a switch to a new xlog segment file, so that the backup is valid
9746          * as soon as archiver moves out the current segment file.
9747          */
9748         RequestXLogSwitch();
9749
9750         XLByteToPrevSeg(stoppoint, _logSegNo);
9751         XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
9752
9753         /* Use the log timezone here, not the session timezone */
9754         stamp_time = (pg_time_t) time(NULL);
9755         pg_strftime(strfbuf, sizeof(strfbuf),
9756                                 "%Y-%m-%d %H:%M:%S %Z",
9757                                 pg_localtime(&stamp_time, log_timezone));
9758
9759         /*
9760          * Write the backup history file
              *
              * Its contents are the START and STOP WAL LOCATION lines, then every
              * label line after the first, then the STOP TIME line.
9761          */
9762         XLByteToSeg(startpoint, _logSegNo);
9763         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
9764                                                   (uint32) (startpoint % XLogSegSize));
9765         fp = AllocateFile(histfilepath, "w");
9766         if (!fp)
9767                 ereport(ERROR,
9768                                 (errcode_for_file_access(),
9769                                  errmsg("could not create file \"%s\": %m",
9770                                                 histfilepath)));
9771         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
9772                 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
9773         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
9774                         (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
9775         /* transfer remaining lines from label to history file */
9776         fprintf(fp, "%s", remaining);
9777         fprintf(fp, "STOP TIME: %s\n", strfbuf);
9778         if (fflush(fp) || ferror(fp) || FreeFile(fp))
9779                 ereport(ERROR,
9780                                 (errcode_for_file_access(),
9781                                  errmsg("could not write file \"%s\": %m",
9782                                                 histfilepath)));
9783
9784         /*
9785          * Clean out any no-longer-needed history files.  As a side effect, this
9786          * will post a .ready file for the newly created history file, notifying
9787          * the archiver that history file may be archived immediately.
9788          */
9789         CleanupBackupHistory();
9790
9791         /*
9792          * If archiving is enabled, wait for all the required WAL files to be
9793          * archived before returning. If archiving isn't enabled, the required WAL
9794          * needs to be transported via streaming replication (hopefully with
9795          * wal_keep_segments set high enough), or some more exotic mechanism like
9796          * polling and copying files from pg_xlog with script. We have no
9797          * knowledge of those mechanisms, so it's up to the user to ensure that he
9798          * gets all the required WAL.
9799          *
9800          * We wait until both the last WAL file filled during backup and the
9801          * history file have been archived, and assume that the alphabetic sorting
9802          * property of the WAL files ensures any earlier WAL files are safely
9803          * archived as well.
9804          *
9805          * We wait forever, since archive_command is supposed to work and we
9806          * assume the admin wanted his backup to work completely. If you don't
9807          * wish to wait, you can set statement_timeout.  Also, some notices are
9808          * issued to clue in anyone who might be doing this interactively.
9809          */
9810         if (waitforarchive && XLogArchivingActive())
9811         {
9812                 XLByteToPrevSeg(stoppoint, _logSegNo);
9813                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
9814
9815                 XLByteToSeg(startpoint, _logSegNo);
9816                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
9817                                                           (uint32) (startpoint % XLogSegSize));
9818
9819                 seconds_before_warning = 60;
9820                 waits = 0;
9821
                     /*
                      * Each iteration sleeps once and bumps 'waits'.  A one-time NOTICE
                      * is issued after more than 5 iterations; a WARNING is issued when
                      * 'waits' reaches seconds_before_warning, which then doubles.
                      */
9822                 while (XLogArchiveIsBusy(lastxlogfilename) ||
9823                            XLogArchiveIsBusy(histfilename))
9824                 {
9825                         CHECK_FOR_INTERRUPTS();
9826
9827                         if (!reported_waiting && waits > 5)
9828                         {
9829                                 ereport(NOTICE,
9830                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
9831                                 reported_waiting = true;
9832                         }
9833
9834                         pg_usleep(1000000L);
9835
9836                         if (++waits >= seconds_before_warning)
9837                         {
9838                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
9839                                 ereport(WARNING,
9840                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
9841                                                                 waits),
9842                                                  errhint("Check that your archive_command is executing properly.  "
9843                                                                  "pg_stop_backup can be canceled safely, "
9844                                                                  "but the database backup will not be usable without all the WAL segments.")));
9845                         }
9846                 }
9847
9848                 ereport(NOTICE,
9849                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
9850         }
9851         else if (waitforarchive)
9852                 ereport(NOTICE,
9853                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
9854
9855         /*
9856          * We're done.  As a convenience, return the ending WAL location.
9857          */
9858         if (stoptli_p)
9859                 *stoptli_p = stoptli;
9860         return stoppoint;
9861 }
9862
9863
9864 /*
9865  * do_pg_abort_backup: abort a running backup
9866  *
9867  * This does just the most basic steps of do_pg_stop_backup(), by taking the
9868  * system out of backup mode, thus making it a lot more safe to call from
9869  * an error handler.
9870  *
9871  * NB: This is only for aborting a non-exclusive backup that doesn't write
9872  * backup_label. A backup started with pg_start_backup() needs to be finished
9873  * with pg_stop_backup().
9874  */
9875 void
9876 do_pg_abort_backup(void)
9877 {
9878         WALInsertLockAcquireExclusive();
9879         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9880         XLogCtl->Insert.nonExclusiveBackups--;
9881
9882         if (!XLogCtl->Insert.exclusiveBackup &&
9883                 XLogCtl->Insert.nonExclusiveBackups == 0)
9884         {
9885                 XLogCtl->Insert.forcePageWrites = false;
9886         }
9887         WALInsertLockRelease();
9888 }
9889
9890 /*
9891  * Get latest redo apply position.
9892  *
9893  * Exported to allow WALReceiver to read the pointer directly.
9894  */
9895 XLogRecPtr
9896 GetXLogReplayRecPtr(TimeLineID *replayTLI)
9897 {
9898         XLogRecPtr      recptr;
9899         TimeLineID      tli;
9900
9901         SpinLockAcquire(&XLogCtl->info_lck);
9902         recptr = XLogCtl->lastReplayedEndRecPtr;
9903         tli = XLogCtl->lastReplayedTLI;
9904         SpinLockRelease(&XLogCtl->info_lck);
9905
9906         if (replayTLI)
9907                 *replayTLI = tli;
9908         return recptr;
9909 }
9910
9911 /*
9912  * Get latest WAL insert pointer
9913  */
9914 XLogRecPtr
9915 GetXLogInsertRecPtr(void)
9916 {
9917         XLogCtlInsert *Insert = &XLogCtl->Insert;
9918         uint64          current_bytepos;
9919
9920         SpinLockAcquire(&Insert->insertpos_lck);
9921         current_bytepos = Insert->CurrBytePos;
9922         SpinLockRelease(&Insert->insertpos_lck);
9923
9924         return XLogBytePosToRecPtr(current_bytepos);
9925 }
9926
9927 /*
9928  * Get latest WAL write pointer
9929  */
9930 XLogRecPtr
9931 GetXLogWriteRecPtr(void)
9932 {
9933         SpinLockAcquire(&XLogCtl->info_lck);
9934         LogwrtResult = XLogCtl->LogwrtResult;
9935         SpinLockRelease(&XLogCtl->info_lck);
9936
9937         return LogwrtResult.Write;
9938 }
9939
9940 /*
9941  * Returns the redo pointer of the last checkpoint or restartpoint. This is
9942  * the oldest point in WAL that we still need, if we have to restart recovery.
9943  */
9944 void
9945 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
9946 {
9947         LWLockAcquire(ControlFileLock, LW_SHARED);
9948         *oldrecptr = ControlFile->checkPointCopy.redo;
9949         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
9950         LWLockRelease(ControlFileLock);
9951 }
9952
9953 /*
9954  * read_backup_label: check to see if a backup_label file is present
9955  *
9956  * If we see a backup_label during recovery, we assume that we are recovering
9957  * from a backup dump file, and we therefore roll forward from the checkpoint
9958  * identified by the label file, NOT what pg_control says.  This avoids the
9959  * problem that pg_control might have been archived one or more checkpoints
9960  * later than the start of the dump, and so if we rely on it as the start
9961  * point, we will fail to restore a consistent database state.
9962  *
9963  * Returns TRUE if a backup_label was found (and fills the checkpoint
9964  * location and its REDO location into *checkPointLoc and RedoStartLSN,
9965  * respectively); returns FALSE if not. If this backup_label came from a
9966  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
9967  * was created during recovery, *backupFromStandby is set to TRUE.
9968  */
9969 static bool
9970 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
9971                                   bool *backupFromStandby)
9972 {
9973         char            startxlogfilename[MAXFNAMELEN];
9974         TimeLineID      tli;
9975         FILE       *lfp;
9976         char            ch;
9977         char            backuptype[20];
9978         char            backupfrom[20];
9979         uint32          hi,
9980                                 lo;
9981
9982         *backupEndRequired = false;
9983         *backupFromStandby = false;
9984
9985         /*
9986          * See if label file is present
9987          */
9988         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9989         if (!lfp)
9990         {
9991                 if (errno != ENOENT)
9992                         ereport(FATAL,
9993                                         (errcode_for_file_access(),
9994                                          errmsg("could not read file \"%s\": %m",
9995                                                         BACKUP_LABEL_FILE)));
9996                 return false;                   /* it's not there, all is fine */
9997         }
9998
9999         /*
10000          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
10001          * is pretty crude, but we are not expecting any variability in the file
10002          * format).
10003          */
10004         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
10005                            &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
10006                 ereport(FATAL,
10007                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10008                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10009         RedoStartLSN = ((uint64) hi) << 32 | lo;
10010         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
10011                            &hi, &lo, &ch) != 3 || ch != '\n')
10012                 ereport(FATAL,
10013                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10014                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10015         *checkPointLoc = ((uint64) hi) << 32 | lo;
10016
10017         /*
10018          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
10019          * from an older backup anyway, but since the information on it is not
10020          * strictly required, don't error out if it's missing for some reason.
10021          */
10022         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
10023         {
10024                 if (strcmp(backuptype, "streamed") == 0)
10025                         *backupEndRequired = true;
10026         }
10027
10028         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
10029         {
10030                 if (strcmp(backupfrom, "standby") == 0)
10031                         *backupFromStandby = true;
10032         }
10033
10034         if (ferror(lfp) || FreeFile(lfp))
10035                 ereport(FATAL,
10036                                 (errcode_for_file_access(),
10037                                  errmsg("could not read file \"%s\": %m",
10038                                                 BACKUP_LABEL_FILE)));
10039
10040         return true;
10041 }
10042
10043 /*
10044  * Error context callback for errors occurring during rm_redo().
10045  */
10046 static void
10047 rm_redo_error_callback(void *arg)
10048 {
10049         XLogReaderState *record = (XLogReaderState *) arg;
10050         StringInfoData buf;
10051
10052         initStringInfo(&buf);
10053         xlog_outdesc(&buf, record);
10054
10055         errcontext("xlog redo %s", buf.data);
10056
10057         pfree(buf.data);
10058 }
10059
10060 /*
10061  * BackupInProgress: check if online backup mode is active
10062  *
10063  * This is done by checking for existence of the "backup_label" file.
10064  */
10065 bool
10066 BackupInProgress(void)
10067 {
10068         struct stat stat_buf;
10069
10070         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
10071 }
10072
10073 /*
10074  * CancelBackup: rename the "backup_label" file to cancel backup mode
10075  *
10076  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
10077  * Note that this will render an online backup in progress useless.
10078  * To correctly finish an online backup, pg_stop_backup must be called.
10079  */
10080 void
10081 CancelBackup(void)
10082 {
10083         struct stat stat_buf;
10084
10085         /* if the file is not there, return */
10086         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
10087                 return;
10088
10089         /* remove leftover file from previously canceled backup if it exists */
10090         unlink(BACKUP_LABEL_OLD);
10091
10092         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
10093         {
10094                 ereport(LOG,
10095                                 (errmsg("online backup mode canceled"),
10096                                  errdetail("\"%s\" was renamed to \"%s\".",
10097                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10098         }
10099         else
10100         {
10101                 ereport(WARNING,
10102                                 (errcode_for_file_access(),
10103                                  errmsg("online backup mode was not canceled"),
10104                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
10105                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10106         }
10107 }
10108
10109 /*
10110  * Read the XLOG page containing RecPtr into readBuf (if not read already).
10111  * Returns number of bytes read, if the page is read successfully, or -1
10112  * in case of errors.  When errors occur, they are ereport'ed, but only
10113  * if they have not been previously reported.
10114  *
10115  * This is responsible for restoring files from archive as needed, as well
10116  * as for waiting for the requested WAL record to arrive in standby mode.
10117  *
10118  * 'emode' specifies the log level used for reporting "file not found" or
10119  * "end of WAL" situations in archive recovery, or in standby mode when a
10120  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
10121  * false in those situations, on higher log levels the ereport() won't
10122  * return.
10123  *
10124  * In standby mode, if after a successful return of XLogPageRead() the
10125  * caller finds the record it's interested in to be broken, it should
10126  * ereport the error with the level determined by
10127  * emode_for_corrupt_record(), and then set lastSourceFailed
10128  * and call XLogPageRead() again with the same arguments. This lets
10129  * XLogPageRead() to try fetching the record from another source, or to
10130  * sleep and retry.
10131  */
10132 static int
10133 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
10134                          XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
10135 {
10136         XLogPageReadPrivate *private =
10137         (XLogPageReadPrivate *) xlogreader->private_data;
10138         int                     emode = private->emode;
10139         uint32          targetPageOff;
10140         XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
10141
10142         XLByteToSeg(targetPagePtr, targetSegNo);
10143         targetPageOff = targetPagePtr % XLogSegSize;
10144
10145         /*
10146          * See if we need to switch to a new segment because the requested record
10147          * is not in the currently open one.
10148          */
10149         if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
10150         {
10151                 /*
10152                  * Request a restartpoint if we've replayed too much xlog since the
10153                  * last one.
10154                  */
10155                 if (StandbyModeRequested && bgwriterLaunched)
10156                 {
10157                         if (XLogCheckpointNeeded(readSegNo))
10158                         {
10159                                 (void) GetRedoRecPtr();
10160                                 if (XLogCheckpointNeeded(readSegNo))
10161                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
10162                         }
10163                 }
10164
10165                 close(readFile);
10166                 readFile = -1;
10167                 readSource = 0;
10168         }
10169
10170         XLByteToSeg(targetPagePtr, readSegNo);
10171
10172 retry:
10173         /* See if we need to retrieve more data */
10174         if (readFile < 0 ||
10175                 (readSource == XLOG_FROM_STREAM &&
10176                  receivedUpto < targetPagePtr + reqLen))
10177         {
10178                 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
10179                                                                                  private->randAccess,
10180                                                                                  private->fetching_ckpt,
10181                                                                                  targetRecPtr))
10182                 {
10183                         if (readFile >= 0)
10184                                 close(readFile);
10185                         readFile = -1;
10186                         readLen = 0;
10187                         readSource = 0;
10188
10189                         return -1;
10190                 }
10191         }
10192
10193         /*
10194          * At this point, we have the right segment open and if we're streaming we
10195          * know the requested record is in it.
10196          */
10197         Assert(readFile != -1);
10198
10199         /*
10200          * If the current segment is being streamed from master, calculate how
10201          * much of the current page we have received already. We know the
10202          * requested record has been received, but this is for the benefit of
10203          * future calls, to allow quick exit at the top of this function.
10204          */
10205         if (readSource == XLOG_FROM_STREAM)
10206         {
10207                 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
10208                         readLen = XLOG_BLCKSZ;
10209                 else
10210                         readLen = receivedUpto % XLogSegSize - targetPageOff;
10211         }
10212         else
10213                 readLen = XLOG_BLCKSZ;
10214
10215         /* Read the requested page */
10216         readOff = targetPageOff;
10217         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
10218         {
10219                 char            fname[MAXFNAMELEN];
10220
10221                 XLogFileName(fname, curFileTLI, readSegNo);
10222                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10223                                 (errcode_for_file_access(),
10224                                  errmsg("could not seek in log segment %s to offset %u: %m",
10225                                                 fname, readOff)));
10226                 goto next_record_is_invalid;
10227         }
10228
10229         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
10230         {
10231                 char            fname[MAXFNAMELEN];
10232
10233                 XLogFileName(fname, curFileTLI, readSegNo);
10234                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10235                                 (errcode_for_file_access(),
10236                                  errmsg("could not read from log segment %s, offset %u: %m",
10237                                                 fname, readOff)));
10238                 goto next_record_is_invalid;
10239         }
10240
10241         Assert(targetSegNo == readSegNo);
10242         Assert(targetPageOff == readOff);
10243         Assert(reqLen <= readLen);
10244
10245         *readTLI = curFileTLI;
10246         return readLen;
10247
10248 next_record_is_invalid:
10249         lastSourceFailed = true;
10250
10251         if (readFile >= 0)
10252                 close(readFile);
10253         readFile = -1;
10254         readLen = 0;
10255         readSource = 0;
10256
10257         /* In standby-mode, keep trying */
10258         if (StandbyMode)
10259                 goto retry;
10260         else
10261                 return -1;
10262 }
10263
10264 /*
10265  * Open the WAL segment containing WAL position 'RecPtr'.
10266  *
10267  * The segment can be fetched via restore_command, or via walreceiver having
10268  * streamed the record, or it can already be present in pg_xlog. Checking
10269  * pg_xlog is mainly for crash recovery, but it will be polled in standby mode
10270  * too, in case someone copies a new segment directly to pg_xlog. That is not
10271  * documented or recommended, though.
10272  *
10273  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
10274  * prepare to read WAL starting from RedoStartLSN after this.
10275  *
10276  * 'RecPtr' might not point to the beginning of the record we're interested
10277  * in, it might also point to the page or segment header. In that case,
10278  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
10279  * used to decide which timeline to stream the requested WAL from.
10280  *
10281  * If the record is not immediately available, the function returns false
10282  * if we're not in standby mode. In standby mode, waits for it to become
10283  * available.
10284  *
10285  * When the requested record becomes available, the function opens the file
10286  * containing it (if not open already), and returns true. When end of standby
10287  * mode is triggered by the user, and there is no more WAL available, returns
10288  * false.
10289  */
10290 static bool
10291 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
10292                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
10293 {
10294         static pg_time_t last_fail_time = 0;
10295         pg_time_t       now;
10296
10297         /*-------
10298          * Standby mode is implemented by a state machine:
10299          *
10300          * 1. Read from either archive or pg_xlog (XLOG_FROM_ARCHIVE), or just
10301          *        pg_xlog (XLOG_FROM_XLOG)
10302          * 2. Check trigger file
10303          * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
10304          * 4. Rescan timelines
10305          * 5. Sleep 5 seconds, and loop back to 1.
10306          *
10307          * Failure to read from the current source advances the state machine to
10308          * the next state.
10309          *
10310          * 'currentSource' indicates the current state. There are no currentSource
10311          * values for "check trigger", "rescan timelines", and "sleep" states,
10312          * those actions are taken when reading from the previous source fails, as
10313          * part of advancing to the next state.
10314          *-------
10315          */
10316         if (!InArchiveRecovery)
10317                 currentSource = XLOG_FROM_PG_XLOG;
10318         else if (currentSource == 0)
10319                 currentSource = XLOG_FROM_ARCHIVE;
10320
10321         for (;;)
10322         {
10323                 int                     oldSource = currentSource;
10324
10325                 /*
10326                  * First check if we failed to read from the current source, and
10327                  * advance the state machine if so. The failure to read might've
10328                  * happened outside this function, e.g when a CRC check fails on a
10329                  * record, or within this loop.
10330                  */
10331                 if (lastSourceFailed)
10332                 {
10333                         switch (currentSource)
10334                         {
10335                                 case XLOG_FROM_ARCHIVE:
10336                                 case XLOG_FROM_PG_XLOG:
10337
10338                                         /*
10339                                          * Check to see if the trigger file exists. Note that we
10340                                          * do this only after failure, so when you create the
10341                                          * trigger file, we still finish replaying as much as we
10342                                          * can from archive and pg_xlog before failover.
10343                                          */
10344                                         if (StandbyMode && CheckForStandbyTrigger())
10345                                         {
10346                                                 ShutdownWalRcv();
10347                                                 return false;
10348                                         }
10349
10350                                         /*
10351                                          * Not in standby mode, and we've now tried the archive
10352                                          * and pg_xlog.
10353                                          */
10354                                         if (!StandbyMode)
10355                                                 return false;
10356
10357                                         /*
10358                                          * If primary_conninfo is set, launch walreceiver to try
10359                                          * to stream the missing WAL.
10360                                          *
10361                                          * If fetching_ckpt is TRUE, RecPtr points to the initial
10362                                          * checkpoint location. In that case, we use RedoStartLSN
10363                                          * as the streaming start position instead of RecPtr, so
10364                                          * that when we later jump backwards to start redo at
10365                                          * RedoStartLSN, we will have the logs streamed already.
10366                                          */
10367                                         if (PrimaryConnInfo)
10368                                         {
10369                                                 XLogRecPtr      ptr;
10370                                                 TimeLineID      tli;
10371
10372                                                 if (fetching_ckpt)
10373                                                 {
10374                                                         ptr = RedoStartLSN;
10375                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
10376                                                 }
10377                                                 else
10378                                                 {
10379                                                         ptr = tliRecPtr;
10380                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
10381
10382                                                         if (curFileTLI > 0 && tli < curFileTLI)
10383                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
10384                                                                          (uint32) (ptr >> 32), (uint32) ptr,
10385                                                                          tli, curFileTLI);
10386                                                 }
10387                                                 curFileTLI = tli;
10388                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
10389                                                                                          PrimarySlotName);
10390                                                 receivedUpto = 0;
10391                                         }
10392
10393                                         /*
10394                                          * Move to XLOG_FROM_STREAM state in either case. We'll
10395                                          * get immediate failure if we didn't launch walreceiver,
10396                                          * and move on to the next state.
10397                                          */
10398                                         currentSource = XLOG_FROM_STREAM;
10399                                         break;
10400
10401                                 case XLOG_FROM_STREAM:
10402
10403                                         /*
10404                                          * Failure while streaming. Most likely, we got here
10405                                          * because streaming replication was terminated, or
10406                                          * promotion was triggered. But we also get here if we
10407                                          * find an invalid record in the WAL streamed from master,
10408                                          * in which case something is seriously wrong. There's
10409                                          * little chance that the problem will just go away, but
10410                                          * PANIC is not good for availability either, especially
10411                                          * in hot standby mode. So, we treat that the same as
10412                                          * disconnection, and retry from archive/pg_xlog again.
10413                                          * The WAL in the archive should be identical to what was
10414                                          * streamed, so it's unlikely that it helps, but one can
10415                                          * hope...
10416                                          */
10417
10418                                         /*
10419                                          * Before we leave XLOG_FROM_STREAM state, make sure that
10420                                          * walreceiver is not active, so that it won't overwrite
10421                                          * WAL that we restore from archive.
10422                                          */
10423                                         if (WalRcvStreaming())
10424                                                 ShutdownWalRcv();
10425
10426                                         /*
10427                                          * Before we sleep, re-scan for possible new timelines if
10428                                          * we were requested to recover to the latest timeline.
10429                                          */
10430                                         if (recoveryTargetIsLatest)
10431                                         {
10432                                                 if (rescanLatestTimeLine())
10433                                                 {
10434                                                         currentSource = XLOG_FROM_ARCHIVE;
10435                                                         break;
10436                                                 }
10437                                         }
10438
10439                                         /*
10440                                          * XLOG_FROM_STREAM is the last state in our state
10441                                          * machine, so we've exhausted all the options for
10442                                          * obtaining the requested WAL. We're going to loop back
10443                                          * and retry from the archive, but if it hasn't been long
10444                                          * since last attempt, sleep 5 seconds to avoid
10445                                          * busy-waiting.
10446                                          */
10447                                         now = (pg_time_t) time(NULL);
10448                                         if ((now - last_fail_time) < 5)
10449                                         {
10450                                                 pg_usleep(1000000L * (5 - (now - last_fail_time)));
10451                                                 now = (pg_time_t) time(NULL);
10452                                         }
10453                                         last_fail_time = now;
10454                                         currentSource = XLOG_FROM_ARCHIVE;
10455                                         break;
10456
10457                                 default:
10458                                         elog(ERROR, "unexpected WAL source %d", currentSource);
10459                         }
10460                 }
10461                 else if (currentSource == XLOG_FROM_PG_XLOG)
10462                 {
10463                         /*
10464                          * We just successfully read a file in pg_xlog. We prefer files in
10465                          * the archive over ones in pg_xlog, so try the next file again
10466                          * from the archive first.
10467                          */
10468                         if (InArchiveRecovery)
10469                                 currentSource = XLOG_FROM_ARCHIVE;
10470                 }
10471
10472                 if (currentSource != oldSource)
10473                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
10474                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
10475                                  lastSourceFailed ? "failure" : "success");
10476
10477                 /*
10478                  * We've now handled possible failure. Try to read from the chosen
10479                  * source.
10480                  */
10481                 lastSourceFailed = false;
10482
10483                 switch (currentSource)
10484                 {
10485                         case XLOG_FROM_ARCHIVE:
10486                         case XLOG_FROM_PG_XLOG:
10487                                 /* Close any old file we might have open. */
10488                                 if (readFile >= 0)
10489                                 {
10490                                         close(readFile);
10491                                         readFile = -1;
10492                                 }
10493                                 /* Reset curFileTLI if random fetch. */
10494                                 if (randAccess)
10495                                         curFileTLI = 0;
10496
10497                                 /*
10498                                  * Try to restore the file from archive, or read an existing
10499                                  * file from pg_xlog.
10500                                  */
10501                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
10502                                                  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
10503                                                                                           currentSource);
10504                                 if (readFile >= 0)
10505                                         return true;    /* success! */
10506
10507                                 /*
10508                                  * Nope, not found in archive or pg_xlog.
10509                                  */
10510                                 lastSourceFailed = true;
10511                                 break;
10512
10513                         case XLOG_FROM_STREAM:
10514                                 {
10515                                         bool            havedata;
10516
10517                                         /*
10518                                          * Check if WAL receiver is still active.
10519                                          */
10520                                         if (!WalRcvStreaming())
10521                                         {
10522                                                 lastSourceFailed = true;
10523                                                 break;
10524                                         }
10525
10526                                         /*
10527                                          * Walreceiver is active, so see if new data has arrived.
10528                                          *
10529                                          * We only advance XLogReceiptTime when we obtain fresh
10530                                          * WAL from walreceiver and observe that we had already
10531                                          * processed everything before the most recent "chunk"
10532                                          * that it flushed to disk.  In steady state where we are
10533                                          * keeping up with the incoming data, XLogReceiptTime will
10534                                          * be updated on each cycle. When we are behind,
10535                                          * XLogReceiptTime will not advance, so the grace time
10536                                          * allotted to conflicting queries will decrease.
10537                                          */
10538                                         if (RecPtr < receivedUpto)
10539                                                 havedata = true;
10540                                         else
10541                                         {
10542                                                 XLogRecPtr      latestChunkStart;
10543
10544                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
10545                                                 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
10546                                                 {
10547                                                         havedata = true;
10548                                                         if (latestChunkStart <= RecPtr)
10549                                                         {
10550                                                                 XLogReceiptTime = GetCurrentTimestamp();
10551                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
10552                                                         }
10553                                                 }
10554                                                 else
10555                                                         havedata = false;
10556                                         }
10557                                         if (havedata)
10558                                         {
10559                                                 /*
10560                                                  * Great, streamed far enough.  Open the file if it's
10561                                                  * not open already.  Also read the timeline history
10562                                                  * file if we haven't initialized timeline history
10563                                                  * yet; it should be streamed over and present in
10564                                                  * pg_xlog by now.  Use XLOG_FROM_STREAM so that
10565                                                  * source info is set correctly and XLogReceiptTime
10566                                                  * isn't changed.
10567                                                  */
10568                                                 if (readFile < 0)
10569                                                 {
10570                                                         if (!expectedTLEs)
10571                                                                 expectedTLEs = readTimeLineHistory(receiveTLI);
10572                                                         readFile = XLogFileRead(readSegNo, PANIC,
10573                                                                                                         receiveTLI,
10574                                                                                                         XLOG_FROM_STREAM, false);
10575                                                         Assert(readFile >= 0);
10576                                                 }
10577                                                 else
10578                                                 {
10579                                                         /* just make sure source info is correct... */
10580                                                         readSource = XLOG_FROM_STREAM;
10581                                                         XLogReceiptSource = XLOG_FROM_STREAM;
10582                                                         return true;
10583                                                 }
10584                                                 break;
10585                                         }
10586
10587                                         /*
10588                                          * Data not here yet. Check for trigger, then wait for
10589                                          * walreceiver to wake us up when new WAL arrives.
10590                                          */
10591                                         if (CheckForStandbyTrigger())
10592                                         {
10593                                                 /*
10594                                                  * Note that we don't "return false" immediately here.
10595                                                  * After being triggered, we still want to replay all
10596                                                  * the WAL that was already streamed. It's in pg_xlog
10597                                                  * now, so we just treat this as a failure, and the
10598                                                  * state machine will move on to replay the streamed
10599                                                  * WAL from pg_xlog, and then recheck the trigger and
10600                                                  * exit replay.
10601                                                  */
10602                                                 lastSourceFailed = true;
10603                                                 break;
10604                                         }
10605
10606                                         /*
10607                                          * Wait for more WAL to arrive. Time out after 5 seconds,
10608                                          * like when polling the archive, to react to a trigger
10609                                          * file promptly.
10610                                          */
10611                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
10612                                                           WL_LATCH_SET | WL_TIMEOUT,
10613                                                           5000L);
10614                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
10615                                         break;
10616                                 }
10617
10618                         default:
10619                                 elog(ERROR, "unexpected WAL source %d", currentSource);
10620                 }
10621
10622                 /*
10623                  * This possibly-long loop needs to handle interrupts of startup
10624                  * process.
10625                  */
10626                 HandleStartupProcInterrupts();
10627         }
10628
10629         return false;                           /* not reached */
10630 }
10631
10632 /*
10633  * Determine what log level should be used to report a corrupt WAL record
10634  * in the current WAL page, previously read by XLogPageRead().
10635  *
10636  * 'emode' is the error mode that would be used to report a file-not-found
10637  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
10638  * we're retrying the exact same record that we've tried previously, only
10639  * complain the first time to keep the noise down.  However, we only do when
10640  * reading from pg_xlog, because we don't expect any invalid records in archive
10641  * or in records streamed from master. Files in the archive should be complete,
10642  * and we should never hit the end of WAL because we stop and wait for more WAL
10643  * to arrive before replaying it.
10644  *
10645  * NOTE: This function remembers the RecPtr value it was last called with,
10646  * to suppress repeated messages about the same record. Only call this when
10647  * you are about to ereport(), or you might cause a later message to be
10648  * erroneously suppressed.
10649  */
10650 static int
10651 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
10652 {
10653         static XLogRecPtr lastComplaint = 0;
10654
10655         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
10656         {
10657                 if (RecPtr == lastComplaint)
10658                         emode = DEBUG1;
10659                 else
10660                         lastComplaint = RecPtr;
10661         }
10662         return emode;
10663 }
10664
10665 /*
10666  * Check to see whether the user-specified trigger file exists and whether a
10667  * promote request has arrived.  If either condition holds, return true.
10668  */
10669 static bool
10670 CheckForStandbyTrigger(void)
10671 {
10672         struct stat stat_buf;
10673         static bool triggered = false;
10674
10675         if (triggered)
10676                 return true;
10677
10678         if (IsPromoteTriggered())
10679         {
10680                 /*
10681                  * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
10682                  * signal handler. It now leaves the file in place and lets the
10683                  * Startup process do the unlink. This allows Startup to know whether
10684                  * it should create a full checkpoint before starting up (fallback
10685                  * mode). Fast promotion takes precedence.
10686                  */
10687                 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
10688                 {
10689                         unlink(PROMOTE_SIGNAL_FILE);
10690                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
10691                         fast_promote = true;
10692                 }
10693                 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
10694                 {
10695                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
10696                         fast_promote = false;
10697                 }
10698
10699                 ereport(LOG, (errmsg("received promote request")));
10700
10701                 ResetPromoteTriggered();
10702                 triggered = true;
10703                 return true;
10704         }
10705
10706         if (TriggerFile == NULL)
10707                 return false;
10708
10709         if (stat(TriggerFile, &stat_buf) == 0)
10710         {
10711                 ereport(LOG,
10712                                 (errmsg("trigger file found: %s", TriggerFile)));
10713                 unlink(TriggerFile);
10714                 triggered = true;
10715                 fast_promote = true;
10716                 return true;
10717         }
10718         else if (errno != ENOENT)
10719                 ereport(ERROR,
10720                                 (errcode_for_file_access(),
10721                                  errmsg("could not stat trigger file \"%s\": %m",
10722                                                 TriggerFile)));
10723
10724         return false;
10725 }
10726
10727 /*
10728  * Check to see if a promote request has arrived. Should be
10729  * called by postmaster after receiving SIGUSR1.
10730  */
10731 bool
10732 CheckPromoteSignal(void)
10733 {
10734         struct stat stat_buf;
10735
10736         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
10737                 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
10738                 return true;
10739
10740         return false;
10741 }
10742
10743 /*
10744  * Wake up startup process to replay newly arrived WAL, or to notice that
10745  * failover has been requested.
10746  */
10747 void
10748 WakeupRecovery(void)
10749 {
10750         SetLatch(&XLogCtl->recoveryWakeupLatch);
10751 }
10752
10753 /*
10754  * Update the WalWriterSleeping flag.
10755  */
10756 void
10757 SetWalWriterSleeping(bool sleeping)
10758 {
10759         SpinLockAcquire(&XLogCtl->info_lck);
10760         XLogCtl->WalWriterSleeping = sleeping;
10761         SpinLockRelease(&XLogCtl->info_lck);
10762 }