granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlog.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <time.h>
  19 #include <fcntl.h>
  20 #include <sys/stat.h>
  21 #include <sys/time.h>
  22 #include <unistd.h>
  23
  24 #include "access/clog.h"
  25 #include "access/multixact.h"
  26 #include "access/subtrans.h"
  27 #include "access/timeline.h"
  28 #include "access/transam.h"
  29 #include "access/tuptoaster.h"
  30 #include "access/twophase.h"
  31 #include "access/xact.h"
  32 #include "access/xlog_internal.h"
  33 #include "access/xlogreader.h"
  34 #include "access/xlogutils.h"
  35 #include "catalog/catversion.h"
  36 #include "catalog/pg_control.h"
  37 #include "catalog/pg_database.h"
  38 #include "miscadmin.h"
  39 #include "pgstat.h"
  40 #include "postmaster/bgwriter.h"
  41 #include "postmaster/startup.h"
  42 #include "replication/slot.h"
  43 #include "replication/walreceiver.h"
  44 #include "replication/walsender.h"
  45 #include "storage/barrier.h"
  46 #include "storage/bufmgr.h"
  47 #include "storage/fd.h"
  48 #include "storage/ipc.h"
  49 #include "storage/latch.h"
  50 #include "storage/pmsignal.h"
  51 #include "storage/predicate.h"
  52 #include "storage/proc.h"
  53 #include "storage/procarray.h"
  54 #include "storage/reinit.h"
  55 #include "storage/smgr.h"
  56 #include "storage/spin.h"
  57 #include "utils/builtins.h"
  58 #include "utils/guc.h"
  59 #include "utils/ps_status.h"
  60 #include "utils/relmapper.h"
  61 #include "utils/snapmgr.h"
  62 #include "utils/timestamp.h"
  63 #include "pg_trace.h"
  64
  65 extern uint32 bootstrap_data_checksum_version;
  66
  67 /* File path names (all relative to $PGDATA) */
  68 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  69 #define RECOVERY_COMMAND_DONE   "recovery.done"
  70 #define PROMOTE_SIGNAL_FILE             "promote"
  71 #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
  72
  73
  74 /* User-settable parameters */
  75 int                     CheckPointSegments = 3;
  76 int                     wal_keep_segments = 0;
  77 int                     XLOGbuffers = -1;
  78 int                     XLogArchiveTimeout = 0;
  79 bool            XLogArchiveMode = false;
  80 char       *XLogArchiveCommand = NULL;
  81 bool            EnableHotStandby = false;
  82 bool            fullPageWrites = true;
  83 bool            wal_log_hints = false;
  84 bool            log_checkpoints = false;
  85 int                     sync_method = DEFAULT_SYNC_METHOD;
  86 int                     wal_level = WAL_LEVEL_MINIMAL;
  87 int                     CommitDelay = 0;        /* precommit delay in microseconds */
  88 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
  89 int                     num_xloginsert_slots = 8;
  90
  91 #ifdef WAL_DEBUG
  92 bool            XLOG_DEBUG = false;
  93 #endif
  94
  95 /*
  96  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
  97  * When we are done with an old XLOG segment file, we will recycle it as a
  98  * future XLOG segment as long as there aren't already XLOGfileslop future
  99  * segments; else we'll delete it.  This could be made a separate GUC
 100  * variable, but at present I think it's sufficient to hardwire it as
 101  * 2*CheckPointSegments+1.      Under normal conditions, a checkpoint will free
 102  * no more than 2*CheckPointSegments log segments, and we want to recycle all
 103  * of them; the +1 allows boundary cases to happen without wasting a
 104  * delete/create-segment cycle.
 105  */
 106 #define XLOGfileslop    (2*CheckPointSegments + 1)
 107
 108
  109 /*
  110  * GUC support: table of values accepted for sync_method (platform-dependent, see the #ifdefs)
  111  */
  112 const struct config_enum_entry sync_method_options[] = {
  113         {"fsync", SYNC_METHOD_FSYNC, false},
  114 #ifdef HAVE_FSYNC_WRITETHROUGH
  115         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
  116 #endif
  117 #ifdef HAVE_FDATASYNC
  118         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
  119 #endif
  120 #ifdef OPEN_SYNC_FLAG
  121         {"open_sync", SYNC_METHOD_OPEN, false},
  122 #endif
  123 #ifdef OPEN_DATASYNC_FLAG
  124         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
  125 #endif
  126         {NULL, 0, false}                /* NULL-named entry closes the table */
  127 };
 128
  129 /*
  130  * Statistics for the current checkpoint are collected in this global struct.
  131  * Because only the checkpointer or a stand-alone backend can perform
  132  * checkpoints, this will be unused in normal backends.
  133  */
 134 CheckpointStatsData CheckpointStats;
 135
  136 /*
  137  * ThisTimeLineID will be the same in all backends --- it identifies the current
  138  * WAL timeline for the database system.
  139  */
 140 TimeLineID      ThisTimeLineID = 0;
 141
 142 /*
 143  * Are we doing recovery from XLOG?
 144  *
 145  * This is only ever true in the startup process; it should be read as meaning
 146  * "this process is replaying WAL records", rather than "the system is in
 147  * recovery mode".  It should be examined primarily by functions that need
 148  * to act differently when called from a WAL redo function (e.g., to skip WAL
 149  * logging).  To check whether the system is in recovery regardless of which
 150  * process you're running in, use RecoveryInProgress() but only after shared
 151  * memory startup and lock initialization.
 152  */
 153 bool            InRecovery = false;
 154
 155 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
 156 HotStandbyState standbyState = STANDBY_DISABLED;
 157
 158 static XLogRecPtr LastRec;
 159
 160 /* Local copy of WalRcv->receivedUpto */
 161 static XLogRecPtr receivedUpto = 0;
 162 static TimeLineID receiveTLI = 0;
 163
 164 /*
 165  * During recovery, lastFullPageWrites keeps track of full_page_writes that
 166  * the replayed WAL records indicate. It's initialized with full_page_writes
 167  * that the recovery starting checkpoint record indicates, and then updated
 168  * each time XLOG_FPW_CHANGE record is replayed.
 169  */
 170 static bool lastFullPageWrites;
 171
 172 /*
 173  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 174  * known, need to check the shared state".
 175  */
 176 static bool LocalRecoveryInProgress = true;
 177
 178 /*
 179  * Local copy of SharedHotStandbyActive variable. False actually means "not
 180  * known, need to check the shared state".
 181  */
 182 static bool LocalHotStandbyActive = false;
 183
 184 /*
 185  * Local state for XLogInsertAllowed():
 186  *              1: unconditionally allowed to insert XLOG
 187  *              0: unconditionally not allowed to insert XLOG
 188  *              -1: must check RecoveryInProgress(); disallow until it is false
 189  * Most processes start with -1 and transition to 1 after seeing that recovery
 190  * is not in progress.  But we can also force the value for special cases.
 191  * The coding in XLogInsertAllowed() depends on the first two of these states
 192  * being numerically the same as bool true and false.
 193  */
 194 static int      LocalXLogInsertAllowed = -1;
 195
 196 /*
 197  * When ArchiveRecoveryRequested is set, archive recovery was requested,
 198  * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
 199  * currently recovering using offline XLOG archives. These variables are only
 200  * valid in the startup process.
 201  *
 202  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 203  * currently performing crash recovery using only XLOG files in pg_xlog, but
 204  * will switch to using offline XLOG archives as soon as we reach the end of
 205  * WAL in pg_xlog.
 206 */
 207 bool            ArchiveRecoveryRequested = false;
 208 bool            InArchiveRecovery = false;
 209
 210 /* Was the last xlog file restored from archive, or local? */
 211 static bool restoredFromArchive = false;
 212
 213 /* options taken from recovery.conf for archive recovery */
 214 char       *recoveryRestoreCommand = NULL;
 215 static char *recoveryEndCommand = NULL;
 216 static char *archiveCleanupCommand = NULL;
 217 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
 218 static bool recoveryTargetInclusive = true;
 219 static bool recoveryPauseAtTarget = true;
 220 static TransactionId recoveryTargetXid;
 221 static TimestampTz recoveryTargetTime;
 222 static char *recoveryTargetName;
 223 static int min_recovery_apply_delay = 0;
 224 static TimestampTz recoveryDelayUntilTime;
 225
 226 /* options taken from recovery.conf for XLOG streaming */
 227 static bool StandbyModeRequested = false;
 228 static char *PrimaryConnInfo = NULL;
 229 static char *PrimarySlotName = NULL;
 230 static char *TriggerFile = NULL;
 231
 232 /* are we currently in standby mode? */
 233 bool            StandbyMode = false;
 234
 235 /* whether request for fast promotion has been made yet */
 236 static bool fast_promote = false;
 237
 238 /*
 239  * if recoveryStopsBefore/After returns true, it saves information of the stop
 240  * point here
 241  */
 242 static TransactionId recoveryStopXid;
 243 static TimestampTz recoveryStopTime;
 244 static char recoveryStopName[MAXFNAMELEN];
 245 static bool recoveryStopAfter;
 246
 247 /*
 248  * During normal operation, the only timeline we care about is ThisTimeLineID.
 249  * During recovery, however, things are more complicated.  To simplify life
 250  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 251  * scan through the WAL history (that is, it is the line that was active when
 252  * the currently-scanned WAL record was generated).  We also need these
 253  * timeline values:
 254  *
 255  * recoveryTargetTLI: the desired timeline that we want to end in.
 256  *
 257  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 258  *
 259  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
 260  * its known parents, newest first (so recoveryTargetTLI is always the
 261  * first list member).  Only these TLIs are expected to be seen in the WAL
 262  * segments we read, and indeed only these TLIs will be considered as
 263  * candidate WAL files to open at all.
 264  *
 265  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 266  * (This is not necessarily the same as ThisTimeLineID, because we could
 267  * be scanning data that was copied from an ancestor timeline when the current
 268  * file was created.)  During a sequential scan we do not allow this value
 269  * to decrease.
 270  */
 271 static TimeLineID recoveryTargetTLI;
 272 static bool recoveryTargetIsLatest = false;
 273 static List *expectedTLEs;
 274 static TimeLineID curFileTLI;
 275
 276 /*
 277  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 278  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 279  * end+1 of the last record, and is reset when we end a top-level transaction,
 280  * or start a new one; so it can be used to tell if the current transaction has
 281  * created any XLOG records.
 282  */
 283 static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
 284
 285 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
 286
 287 /*
 288  * RedoRecPtr is this backend's local copy of the REDO record pointer
 289  * (which is almost but not quite the same as a pointer to the most recent
 290  * CHECKPOINT record).  We update this from the shared-memory copy,
 291  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 292  * hold an insertion slot).  See XLogInsert for details.  We are also allowed
 293  * to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 294  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 295  * InitXLOGAccess.
 296  */
 297 static XLogRecPtr RedoRecPtr;
 298
 299 /*
 300  * RedoStartLSN points to the checkpoint's REDO location which is specified
 301  * in a backup label file, backup history file or control file. In standby
 302  * mode, XLOG streaming usually starts from the position where an invalid
 303  * record was found. But if we fail to read even the initial checkpoint
 304  * record, we use the REDO location instead of the checkpoint location as
 305  * the start position of XLOG streaming. Otherwise we would have to jump
 306  * backwards to the REDO location after reading the checkpoint record,
 307  * because the REDO record can precede the checkpoint record.
 308  */
 309 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
 310
 311 /*----------
 312  * Shared-memory data structures for XLOG control
 313  *
 314  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 315  * the log up to (all records before that point must be written or fsynced).
 316  * LogwrtResult indicates the byte positions we have already written/fsynced.
 317  * These structs are identical but are declared separately to indicate their
 318  * slightly different functions.
 319  *
 320  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 321  * WALWriteLock.  To update it, you need to hold both locks.  The point of
 322  * this arrangement is that the value can be examined by code that already
 323  * holds WALWriteLock without needing to grab info_lck as well.  In addition
 324  * to the shared variable, each backend has a private copy of LogwrtResult,
 325  * which is updated when convenient.
 326  *
 327  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 328  * (protected by info_lck), but we don't need to cache any copies of it.
 329  *
 330  * info_lck is only held long enough to read/update the protected variables,
 331  * so it's a plain spinlock.  The other locks are held longer (potentially
 332  * over I/O operations), so we use LWLocks for them.  These locks are:
 333  *
 334  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
 335  * It is only held while initializing and changing the mapping.  If the
 336  * contents of the buffer being replaced haven't been written yet, the mapping
 337  * lock is released while the write is done, and reacquired afterwards.
 338  *
 339  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 340  * XLogFlush).
 341  *
 342  * ControlFileLock: must be held to read/update control file or create
 343  * new log file.
 344  *
 345  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 346  * only one checkpointer at a time; currently, with all checkpoints done by
 347  * the checkpointer, this is just pro forma).
 348  *
 349  *----------
 350  */
 351
 352 typedef struct XLogwrtRqst
 353 {
 354         XLogRecPtr      Write;                  /* last byte + 1 to write out */
 355         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
 356 } XLogwrtRqst;
 357
 358 typedef struct XLogwrtResult
 359 {
 360         XLogRecPtr      Write;                  /* last byte + 1 written out */
 361         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
 362 } XLogwrtResult;
 363
 364
 365 /*
 366  * A slot for inserting to the WAL. This is similar to an LWLock, the main
 367  * difference is that there is an extra xlogInsertingAt field that is protected
 368  * by the same mutex. Unlike an LWLock, a slot can only be acquired in
 369  * exclusive mode.
 370  *
 371  * The xlogInsertingAt field is used to advertise to other processes how far
 372  * the slot owner has progressed in inserting the record. When a backend
 373  * acquires a slot, it initializes xlogInsertingAt to 1, because it doesn't
 374  * yet know where it's going to insert the record. That's conservative
 375  * but correct; the new insertion is certainly going to go to a byte position
 376  * greater than 1. If another backend needs to flush the WAL, it will have to
 377  * wait for the new insertion. xlogInsertingAt is updated after finishing the
 378  * insert or when crossing a page boundary, which will wake up anyone waiting
 379  * for it, whether the wait was necessary in the first place or not.
 380  *
 381  * A process can wait on a slot in two modes: LW_EXCLUSIVE or
 382  * LW_WAIT_UNTIL_FREE. LW_EXCLUSIVE works like in an lwlock; when the slot is
 383  * released, the first LW_EXCLUSIVE waiter in the queue is woken up. Processes
 384  * waiting in LW_WAIT_UNTIL_FREE mode are woken up whenever the slot is
 385  * released, or xlogInsertingAt is updated. In other words, a process in
 386  * LW_WAIT_UNTIL_FREE mode is woken up whenever the inserter makes any progress
 387  * copying the record in place. LW_WAIT_UNTIL_FREE waiters are always added to
 388  * the front of the queue, while LW_EXCLUSIVE waiters are appended to the end.
 389  *
 390  * To join the wait queue, a process must set MyProc->lwWaitMode to the mode
 391  * it wants to wait in, MyProc->lwWaiting to true, and link MyProc to the head
 392  * or tail of the wait queue. The same mechanism is used to wait on an LWLock,
 393  * see lwlock.c for details.
 394  */
 395 typedef struct
 396 {
 397         slock_t         mutex;                  /* protects the below fields */
 398         XLogRecPtr      xlogInsertingAt; /* insert has completed up to this point */
 399
 400         PGPROC     *owner;                      /* for debugging purposes */
 401
 402         bool            releaseOK;              /* T if ok to release waiters */
 403         char            exclusive;              /* # of exclusive holders (0 or 1) */
 404         PGPROC     *head;                       /* head of list of waiting PGPROCs */
 405         PGPROC     *tail;                       /* tail of list of waiting PGPROCs */
 406         /* tail is undefined when head is NULL */
 407 } XLogInsertSlot;
 408
 409 /*
 410  * All the slots are allocated as an array in shared memory. We force the
 411  * array stride to be a power of 2, which saves a few cycles in indexing, but
 412  * more importantly also ensures that individual slots don't cross cache line
 413  * boundaries.  (Of course, we have to also ensure that the array start
 414  * address is suitably aligned.)
 415  */
 416 typedef union XLogInsertSlotPadded
 417 {
 418         XLogInsertSlot slot;
 419         char            pad[CACHE_LINE_SIZE];
 420 } XLogInsertSlotPadded;
 421
  422 /*
  423  * Shared state data for XLogInsert.
  424  */
  425 typedef struct XLogCtlInsert
  426 {
  427         slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
  428
  429         /*
  430          * CurrBytePos is the end of reserved WAL. The next record will be inserted
  431          * at that position. PrevBytePos is the start position of the previously
  432          * inserted (or rather, reserved) record - it is copied to the prev-link
  433          * of the next record. These are stored as "usable byte positions" rather
  434          * than XLogRecPtrs (see XLogBytePosToRecPtr()).
  435          */
  436         uint64          CurrBytePos;
  437         uint64          PrevBytePos;
  438
  439         /*
  440          * Make sure the above heavily-contended spinlock and byte positions are
  441          * on their own cache line. In particular, the RedoRecPtr and full page
  442          * write variables below should be on a different cache line. They are
  443          * read on every WAL insertion, but updated rarely, and we don't want
  444          * those reads to steal the cache line containing Curr/PrevBytePos.
  445          */
  446         char            pad[CACHE_LINE_SIZE];
  447
  448         /*
  449          * fullPageWrites is the master copy used by all backends to determine
  450          * whether to write full-page images to WAL, instead of a process-local one.
  451          * This is required because, when full_page_writes is changed by SIGHUP,
  452          * we must WAL-log it before it actually affects WAL-logging by backends.
  453          * The checkpointer sets it at startup or after SIGHUP.
  454          *
  455          * To read the three fields below (RedoRecPtr, forcePageWrites and fullPageWrites),
  456          * you must hold an insertion slot. To modify them, you must hold ALL the slots.
  457          */
  458         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
  459         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
  460         bool            fullPageWrites;         /* master copy of full_page_writes */
  461
  462         /*
  463          * exclusiveBackup is true if a backup started with pg_start_backup() is
  464          * in progress, and nonExclusiveBackups is a counter indicating the number
  465          * of streaming base backups currently in progress. forcePageWrites is set
  466          * to true while either kind of backup is in progress. lastBackupStart is the
  467          * latest checkpoint redo location used as a starting point for an online backup.
  468          */
  469         bool            exclusiveBackup;
  470         int                     nonExclusiveBackups;
  471         XLogRecPtr      lastBackupStart;
  472
  473         /* insertion slots, see XLogInsertSlot struct above for details */
  474         XLogInsertSlotPadded *insertSlots;
  475 } XLogCtlInsert;
 476
  477 /*
  478  * Total shared-memory state for XLOG.
  479  */
  480 typedef struct XLogCtlData
  481 {
  482         XLogCtlInsert Insert;
  483
  484         /* Protected by info_lck: */
  485         XLogwrtRqst LogwrtRqst;
  486         XLogRecPtr      RedoRecPtr;             /* a recent copy of Insert->RedoRecPtr */
  487         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
  488         TransactionId ckptXid;
  489         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
  490         XLogRecPtr      replicationSlotMinLSN;  /* oldest LSN needed by any slot */
  491
  492         XLogSegNo       lastRemovedSegNo;               /* latest removed/recycled XLOG
  493                                                                                  * segment */
  494
  495         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
  496         XLogRecPtr      unloggedLSN;
  497         slock_t         ulsn_lck;
  498
  499         /* Time of last xlog segment switch. Protected by WALWriteLock. */
  500         pg_time_t       lastSegSwitchTime;
  501
  502         /*
  503          * Protected by info_lck and WALWriteLock (you must hold either lock to
  504          * read it, but both to update)
  505          */
  506         XLogwrtResult LogwrtResult;
  507
  508         /*
  509          * Latest initialized page in the cache (last byte position + 1).
  510          *
  511          * To change the identity of a buffer (and InitializedUpTo), you need to
  512          * hold WALBufMappingLock.  To change the identity of a buffer that's still
  513          * dirty, the old page needs to be written out first, and for that you
  514          * need WALWriteLock, and you need to ensure that there are no in-progress
  515          * insertions to the page by calling WaitXLogInsertionsToFinish().
  516          */
  517         XLogRecPtr      InitializedUpTo;
  518
  519         /*
  520          * These values do not change after startup, although the pointed-to pages
  521          * and xlblocks values certainly do.  xlblock values are protected by
  522          * WALBufMappingLock.
  523          */
  524         char       *pages;                      /* buffers for unwritten XLOG pages */
  525         XLogRecPtr *xlblocks;           /* per page: ptr to 1st byte + XLOG_BLCKSZ */
  526         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
  527
  528         /*
  529          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
  530          * If we created a new timeline when the system was started up,
  531          * PrevTimeLineID is the old timeline's ID that we forked off from.
  532          * Otherwise it's equal to ThisTimeLineID.
  533          */
  534         TimeLineID      ThisTimeLineID;
  535         TimeLineID      PrevTimeLineID;
  536
  537         /*
  538          * archiveCleanupCommand is read from recovery.conf but needs to be in
  539          * shared memory so that the checkpointer process can access it.
  540          */
  541         char            archiveCleanupCommand[MAXPGPATH];
  542
  543         /*
  544          * SharedRecoveryInProgress indicates if we're still in crash or archive
  545          * recovery.  Protected by info_lck.
  546          */
  547         bool            SharedRecoveryInProgress;
  548
  549         /*
  550          * SharedHotStandbyActive indicates if hot standby is currently active; it
  551          * is the shared flag cached by LocalHotStandbyActive.  Protected by info_lck.
  552          */
  553         bool            SharedHotStandbyActive;
  554
  555         /*
  556          * WalWriterSleeping indicates whether the WAL writer is currently in
  557          * low-power mode (and hence should be nudged if an async commit occurs).
  558          * Protected by info_lck.
  559          */
  560         bool            WalWriterSleeping;
  561
  562         /*
  563          * recoveryWakeupLatch is used to wake up the startup process to continue
  564          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
  565          * to appear.
  566          */
  567         Latch           recoveryWakeupLatch;
  568
  569         /*
  570          * During recovery, we keep a copy of the latest checkpoint record here.
  571          * Used by the checkpointer when it wants to create a restartpoint.
  572          *
  573          * Protected by info_lck.
  574          */
  575         XLogRecPtr      lastCheckPointRecPtr;
  576         CheckPoint      lastCheckPoint;
  577
  578         /*
  579          * lastReplayedEndRecPtr points to end+1 of the last record successfully
  580          * replayed. When we're currently replaying a record, ie. in a redo
  581          * function, replayEndRecPtr points to the end+1 of the record being
  582          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
  583          */
  584         XLogRecPtr      lastReplayedEndRecPtr;
  585         TimeLineID      lastReplayedTLI;
  586         XLogRecPtr      replayEndRecPtr;
  587         TimeLineID      replayEndTLI;
  588         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
  589         TimestampTz recoveryLastXTime;
  590         /* current effective recovery target timeline */
  591         TimeLineID      RecoveryTargetTLI;
  592
  593         /*
  594          * timestamp of when we started replaying the current chunk of WAL data,
  595          * only relevant for replication or archive recovery
  596          */
  597         TimestampTz currentChunkStartTime;
  598         /* Are we requested to pause recovery? */
  599         bool            recoveryPause;
  600
  601         /*
  602          * lastFpwDisableRecPtr points to the start of the last replayed
  603          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
  604          */
  605         XLogRecPtr      lastFpwDisableRecPtr;
  606
  607         slock_t         info_lck;               /* locks shared variables shown above */
  608 } XLogCtlData;
 609
 610 static XLogCtlData *XLogCtl = NULL;
 611
 612 /*
 613  * We maintain an image of pg_control in shared memory.
 614  */
 615 static ControlFileData *ControlFile = NULL;
 616
  617 /*
  618  * Calculate the amount of space left on the page after 'endptr'; this is 0
  619  * when 'endptr' is exactly on a page boundary. Beware multiple evaluation!
  620  */
  621 #define INSERT_FREESPACE(endptr)        \
  622         (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
 623
  624 /* Macro to advance to next buffer index, wrapping to 0 after XLogCacheBlck. */
  625 #define NextBufIdx(idx)         \
  626                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 627
 628 /*
 629  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 630  * would hold if it was in cache, the page containing 'recptr'.
 631  */
 632 #define XLogRecPtrToBufIdx(recptr)      \
 633         (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
 634
 635 /*
 636  * These are the number of bytes in a WAL page and segment usable for WAL data.
 637  */
 638 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
 639 #define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
 640
 641 /*
 642  * Private, possibly out-of-date copy of shared LogwrtResult.
 643  * See discussion above.
 644  */
 645 static XLogwrtResult LogwrtResult = {0, 0};
 646
 647 /*
 648  * Codes indicating where we got a WAL file from during recovery, or where
 649  * to attempt to get one.
 650  */
 651 typedef enum
 652 {
 653         XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
 654         XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
 655         XLOG_FROM_PG_XLOG,                      /* existing file in pg_xlog */
 656         XLOG_FROM_STREAM,                       /* streamed from master */
 657 } XLogSource;
 658
  659 /* human-readable names for XLogSources, for debugging output; indexed by XLogSource, keep in enum order */
  660 static const char *xlogSourceNames[] = {"any", "archive", "pg_xlog", "stream"};
 661
 662 /*
 663  * openLogFile is -1 or a kernel FD for an open log file segment.
 664  * When it's open, openLogOff is the current seek offset in the file.
 665  * openLogSegNo identifies the segment.  These variables are only
 666  * used to write the XLOG, and so will normally refer to the active segment.
 667  */
 668 static int      openLogFile = -1;
 669 static XLogSegNo openLogSegNo = 0;
 670 static uint32 openLogOff = 0;
 671
 672 /*
 673  * These variables are used similarly to the ones above, but for reading
 674  * the XLOG.  Note, however, that readOff generally represents the offset
 675  * of the page just read, not the seek position of the FD itself, which
 676  * will be just past that page. readLen indicates how much of the current
 677  * page has been read into readBuf, and readSource indicates where we got
 678  * the currently open file from.
 679  */
 680 static int      readFile = -1;
 681 static XLogSegNo readSegNo = 0;
 682 static uint32 readOff = 0;
 683 static uint32 readLen = 0;
 684 static XLogSource readSource = 0;               /* XLOG_FROM_* code */
 685
 686 /*
 687  * Keeps track of which source we're currently reading from. This is
 688  * different from readSource in that this is always set, even when we don't
 689  * currently have a WAL file open. If lastSourceFailed is set, our last
 690  * attempt to read from currentSource failed, and we should try another source
 691  * next.
 692  */
 693 static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
 694 static bool lastSourceFailed = false;
 695
  696 typedef struct XLogPageReadPrivate
  697 {
  698         int                     emode;          /* NOTE(review): presumably the elog level for read errors -- confirm */
  699         bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
  700         bool            randAccess;     /* NOTE(review): presumably set for a non-sequential read -- confirm */
  701 } XLogPageReadPrivate;
 702
 703 /*
 704  * These variables track when we last obtained some WAL data to process,
 705  * and where we got it from.  (XLogReceiptSource is initially the same as
 706  * readSource, but readSource gets reset to zero when we don't have data
 707  * to process right now.  It is also different from currentSource, which
 708  * also changes when we try to read from a source and fail, while
 709  * XLogReceiptSource tracks where we last successfully read some WAL.)
 710  */
 711 static TimestampTz XLogReceiptTime = 0;
 712 static XLogSource XLogReceiptSource = 0;                /* XLOG_FROM_* code */
 713
 714 /* State information for XLOG reading */
 715 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 716 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 717
 718 static XLogRecPtr minRecoveryPoint;             /* local copy of
 719                                                                                  * ControlFile->minRecoveryPoint */
 720 static TimeLineID minRecoveryPointTLI;
 721 static bool updateMinRecoveryPoint = true;
 722
 723 /*
 724  * Have we reached a consistent database state? In crash recovery, we have
 725  * to replay all the WAL, so reachedConsistency is never set. During archive
 726  * recovery, the database is consistent once minRecoveryPoint is reached.
 727  */
 728 bool            reachedConsistency = false;
 729
 730 static bool InRedo = false;
 731
 732 /* Have we launched bgwriter during recovery? */
 733 static bool bgwriterLaunched = false;
 734
 735 /* For WALInsertSlotAcquire/Release functions */
 736 static int      MySlotNo = 0;
 737 static bool holdingAllSlots = false;
 738
 739 static void readRecoveryCommandFile(void);
 740 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
 741 static bool recoveryStopsBefore(XLogRecord *record);
 742 static bool recoveryStopsAfter(XLogRecord *record);
 743 static void recoveryPausesHere(void);
 744 static bool recoveryApplyDelay(XLogRecord *record);
 745 static void SetLatestXTime(TimestampTz xtime);
 746 static void SetCurrentChunkStartTime(TimestampTz xtime);
 747 static void CheckRequiredParameterValues(void);
 748 static void XLogReportParameters(void);
 749 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
 750                                         TimeLineID prevTLI);
 751 static void LocalSetXLogInsertAllowed(void);
 752 static void CreateEndOfRecoveryRecord(void);
 753 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 754 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 755 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
 756
 757 static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
 758                                 XLogRecPtr *lsn, BkpBlock *bkpb);
 759 static Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb,
 760                                                  char *blk, bool get_cleanup_lock, bool keep_buffer);
 761 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
 762 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
 763 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
 764 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 765                                            bool find_free, int *max_advance,
 766                                            bool use_lock);
 767 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 768                          int source, bool notexistOk);
 769 static int      XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
 770 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 771                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
 772                          TimeLineID *readTLI);
 773 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 774                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr);
 775 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 776 static void XLogFileClose(void);
 777 static void PreallocXlogFiles(XLogRecPtr endptr);
 778 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
 779 static void UpdateLastRemovedPtr(char *filename);
 780 static void ValidateXLOGDirectoryStructure(void);
 781 static void CleanupBackupHistory(void);
 782 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 783 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 784                    int emode, bool fetching_ckpt);
 785 static void CheckRecoveryConsistency(void);
 786 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
 787                                          XLogRecPtr RecPtr, int whichChkpti, bool report);
 788 static bool rescanLatestTimeLine(void);
 789 static void WriteControlFile(void);
 790 static void ReadControlFile(void);
 791 static char *str_time(pg_time_t tnow);
 792 static bool CheckForStandbyTrigger(void);
 793
 794 #ifdef WAL_DEBUG
 795 static void xlog_outrec(StringInfo buf, XLogRecord *record);
 796 #endif
 797 static void pg_start_backup_callback(int code, Datum arg);
 798 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 799                                   bool *backupEndRequired, bool *backupFromStandby);
 800 static void rm_redo_error_callback(void *arg);
 801 static int      get_sync_bit(int method);
 802
 803 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
 804                                   XLogRecData *rdata,
 805                                   XLogRecPtr StartPos, XLogRecPtr EndPos);
 806 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
 807                                                   XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
 808 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
 809                                   XLogRecPtr *PrevPtr);
 810 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
 811 static void WakeupWaiters(XLogRecPtr EndPos);
 812 static char *GetXLogBuffer(XLogRecPtr ptr);
 813 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
 814 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
 815 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
 816
 817 static void WALInsertSlotAcquire(bool exclusive);
 818 static void WALInsertSlotAcquireOne(int slotno);
 819 static void WALInsertSlotRelease(void);
 820 static void WALInsertSlotReleaseOne(int slotno);
 821
 822 /*
 823  * Insert an XLOG record having the specified RMID and info bytes,
 824  * with the body of the record being the data chunk(s) described by
 825  * the rdata chain (see xlog.h for notes about rdata).
 826  *
 827  * Returns XLOG pointer to end of record (beginning of next record).
 828  * This can be used as LSN for data pages affected by the logged action.
 829  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 830  * before the data page can be written out.  This implements the basic
 831  * WAL rule "write the log before the data".)
 832  *
 833  * NB: this routine feels free to scribble on the XLogRecData structs,
 834  * though not on the data they reference.  This is OK since the XLogRecData
 835  * structs are always just temporaries in the calling code.
 836  */
 837 XLogRecPtr
 838 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 839 {
 840         XLogCtlInsert *Insert = &XLogCtl->Insert;
 841         XLogRecData *rdt;
 842         XLogRecData *rdt_lastnormal;
 843         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
 844         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 845         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 846         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
 847         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
 848         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
 849         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
 850         XLogRecData hdr_rdt;
 851         pg_crc32        rdata_crc;
 852         uint32          len,
 853                                 write_len;
 854         unsigned        i;
 855         bool            doPageWrites;
 856         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 857         bool            inserted;
 858         uint8           info_orig = info;       /* saved so a retry can drop the backup-block bits */
 859         static XLogRecord *rechdr;      /* header buffer, malloc'd on first call and reused */
 860         XLogRecPtr      StartPos;
 861         XLogRecPtr      EndPos;
 862
 863         if (rechdr == NULL)
 864         {
 865                 rechdr = malloc(SizeOfXLogRecord);
 866                 if (rechdr == NULL)
 867                         elog(ERROR, "out of memory");
 868                 MemSet(rechdr, 0, SizeOfXLogRecord);
 869         }
 870
 871         /* cross-check on whether we should be here or not */
 872         if (!XLogInsertAllowed())
 873                 elog(ERROR, "cannot make new WAL entries during recovery");
 874
 875         /* info's high bits are reserved for use by me */
 876         if (info & XLR_INFO_MASK)
 877                 elog(PANIC, "invalid xlog info mask %02X", info);
 878
 879         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
 880
 881         /*
 882          * In bootstrap mode, we don't actually log anything but XLOG resources;
 883          * return a phony record pointer.
 884          */
 885         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 886         {
 887                 EndPos = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 888                 return EndPos;
 889         }
 890
 891         /*
 892          * Here we scan the rdata chain, to determine which buffers must be backed
 893          * up.
 894          *
 895          * We may have to loop back to here if a race condition is detected below.
 896          * We could prevent the race by doing all this work while holding an
 897          * insertion slot, but it seems better to avoid doing CRC calculations
 898          * while holding one.
 899          *
 900          * We add entries for backup blocks to the chain, so that they don't need
 901          * any special treatment in the critical section where the chunks are
 902          * copied into the WAL buffers. Those entries have to be unlinked from the
 903          * chain if we have to loop back here.
 904          */
 905 begin:;
 906         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 907         {
 908                 dtbuf[i] = InvalidBuffer;
 909                 dtbuf_bkp[i] = false;
 910         }
 911
 912         /*
 913          * Decide if we need to do full-page writes in this XLOG record: true if
 914          * full_page_writes is on or we have a PITR request for it.  Since we
 915          * don't yet have an insertion slot, fullPageWrites and forcePageWrites
 916          * could change under us, but we'll recheck them once we have a slot.
 917          */
 918         doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
 919
 920         len = 0;
 921         for (rdt = rdata;;)
 922         {
 923                 if (rdt->buffer == InvalidBuffer)
 924                 {
 925                         /* Simple data, just include it */
 926                         len += rdt->len;
 927                 }
 928                 else
 929                 {
 930                         /* Find info for buffer */
 931                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 932                         {
 933                                 if (rdt->buffer == dtbuf[i])
 934                                 {
 935                                         /* Buffer already referenced by earlier chain item */
 936                                         if (dtbuf_bkp[i])
 937                                         {
 938                                                 rdt->data = NULL;
 939                                                 rdt->len = 0;
 940                                         }
 941                                         else if (rdt->data)
 942                                                 len += rdt->len;
 943                                         break;
 944                                 }
 945                                 if (dtbuf[i] == InvalidBuffer)
 946                                 {
 947                                         /* OK, put it in this slot */
 948                                         dtbuf[i] = rdt->buffer;
 949                                         if (doPageWrites && XLogCheckBuffer(rdt, true,
 950                                                                                    &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 951                                         {
 952                                                 dtbuf_bkp[i] = true;
 953                                                 rdt->data = NULL;
 954                                                 rdt->len = 0;
 955                                         }
 956                                         else if (rdt->data)
 957                                                 len += rdt->len;
 958                                         break;
 959                                 }
 960                         }
 961                         if (i >= XLR_MAX_BKP_BLOCKS)
 962                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 963                                          XLR_MAX_BKP_BLOCKS);
 964                 }
 965                 /* Break out of loop when rdt points to last chain item */
 966                 if (rdt->next == NULL)
 967                         break;
 968                 rdt = rdt->next;
 969         }
 970
 971         /*
 972          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 973          * error checking in ReadRecord.  This means that all callers of
 974          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 975          * make an exception for XLOG SWITCH records because we don't want them to
 976          * ever cross a segment boundary.
 977          */
 978         if (len == 0 && !isLogSwitch)
 979                 elog(PANIC, "invalid xlog record length %u", len);
 980
 981         /*
 982          * Make additional rdata chain entries for the backup blocks, so that we
 983          * don't need to special-case them in the write loop.  This modifies the
 984          * original rdata chain, but we keep a pointer to the last regular entry,
 985          * rdt_lastnormal, so that we can undo this if we have to loop back to the
 986          * beginning.
 987          *
 988          * At the exit of this loop, write_len includes the backup block data.
 989          *
 990          * Also set the appropriate info bits to show which buffers were backed
 991          * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer
 992          * value (ignoring InvalidBuffer) appearing in the rdata chain.
 993          */
 994         rdt_lastnormal = rdt;
 995         write_len = len;
 996         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 997         {
 998                 BkpBlock   *bkpb;
 999                 char       *page;
1000
1001                 if (!dtbuf_bkp[i])
1002                         continue;
1003
1004                 info |= XLR_BKP_BLOCK(i);
1005
1006                 bkpb = &(dtbuf_xlg[i]);
1007                 page = (char *) BufferGetBlock(dtbuf[i]);
1008
1009                 rdt->next = &(dtbuf_rdt1[i]);
1010                 rdt = rdt->next;
1011
1012                 rdt->data = (char *) bkpb;
1013                 rdt->len = sizeof(BkpBlock);
1014                 write_len += sizeof(BkpBlock);
1015
1016                 rdt->next = &(dtbuf_rdt2[i]);
1017                 rdt = rdt->next;
1018
1019                 if (bkpb->hole_length == 0)
1020                 {
1021                         rdt->data = page;
1022                         rdt->len = BLCKSZ;
1023                         write_len += BLCKSZ;
1024                         rdt->next = NULL;
1025                 }
1026                 else
1027                 {
1028                         /* must skip the hole */
1029                         rdt->data = page;
1030                         rdt->len = bkpb->hole_offset;
1031                         write_len += bkpb->hole_offset;
1032
1033                         rdt->next = &(dtbuf_rdt3[i]);
1034                         rdt = rdt->next;
1035
1036                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
1037                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
1038                         write_len += rdt->len;
1039                         rdt->next = NULL;
1040                 }
1041         }
1042
1043         /*
1044          * Calculate CRC of the data, including all the backup blocks
1045          *
1046          * Note that the record header isn't added into the CRC initially since we
1047          * don't know the prev-link yet.  Thus, the CRC will represent the CRC of
1048          * the whole record in the order: rdata, then backup blocks, then record
1049          * header.
1050          */
1051         INIT_CRC32(rdata_crc);
1052         for (rdt = rdata; rdt != NULL; rdt = rdt->next)
1053                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
1054
1055         /*
1056          * Construct record header (prev-link is filled in later, after reserving
1057          * the space for the record), and make that the first chunk in the chain.
1058          *
1059          * The CRC calculated for the header here doesn't include prev-link,
1060          * because we don't know it yet. It will be added later.
1061          */
1062         rechdr->xl_xid = GetCurrentTransactionIdIfAny();
1063         rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
1064         rechdr->xl_len = len;           /* doesn't include backup blocks */
1065         rechdr->xl_info = info;
1066         rechdr->xl_rmid = rmid;
1067         rechdr->xl_prev = InvalidXLogRecPtr;
1068         COMP_CRC32(rdata_crc, ((char *) rechdr), offsetof(XLogRecord, xl_prev));
1069
1070         hdr_rdt.next = rdata;
1071         hdr_rdt.data = (char *) rechdr;
1072         hdr_rdt.len = SizeOfXLogRecord;
1073         write_len += SizeOfXLogRecord;
1074
1075         /*----------
1076          *
1077          * We have now done all the preparatory work we can without holding a
1078          * lock or modifying shared state. From here on, inserting the new WAL
1079          * record to the shared WAL buffer cache is a two-step process:
1080          *
1081          * 1. Reserve the right amount of space from the WAL. The current head of
1082          *    reserved space is kept in Insert->CurrBytePos, and is protected by
1083          *    insertpos_lck.
1084          *
1085          * 2. Copy the record to the reserved WAL space. This involves finding the
1086          *    correct WAL buffer containing the reserved space, and copying the
1087          *    record in place. This can be done concurrently in multiple processes.
1088          *
1089          * To keep track of which insertions are still in-progress, each concurrent
1090          * inserter allocates an "insertion slot", which tells others how far the
1091          * inserter has progressed. There is a small fixed number of insertion
1092          * slots, determined by the num_xloginsert_slots GUC. When an inserter
1093          * finishes, it updates the xlogInsertingAt of its slot to the end of the
1094          * record it inserted, to let others know that it's done. xlogInsertingAt
1095          * is also updated when crossing over to a new WAL buffer, to allow the
1096          * previous buffer to be flushed.
1097          *
1098          * Holding onto a slot also protects RedoRecPtr and fullPageWrites from
1099          * changing until the insertion is finished.
1100          *
1101          * Step 2 can usually be done completely in parallel. If the required WAL
1102          * page is not initialized yet, you have to grab WALBufMappingLock to
1103          * initialize it, but the WAL writer tries to do that ahead of insertions
1104          * to avoid that from happening in the critical path.
1105          *
1106          *----------
1107          */
1108         START_CRIT_SECTION();
1109         WALInsertSlotAcquire(isLogSwitch);      /* argument is "exclusive": an xlog switch asks for exclusive mode */
1110
1111         /*
1112          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
1113          * back and recompute everything.  This can only happen just after a
1114          * checkpoint, so it's better to be slow in this case and fast otherwise.
1115          *
1116          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
1117          * affect the contents of the XLOG record, so we'll update our local copy
1118          * but not force a recomputation.
1119          */
1120         if (RedoRecPtr != Insert->RedoRecPtr)
1121         {
1122                 Assert(RedoRecPtr < Insert->RedoRecPtr);
1123                 RedoRecPtr = Insert->RedoRecPtr;
1124
1125                 if (doPageWrites)
1126                 {
1127                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1128                         {
1129                                 if (dtbuf[i] == InvalidBuffer)
1130                                         continue;
1131                                 if (dtbuf_bkp[i] == false &&
1132                                         dtbuf_lsn[i] <= RedoRecPtr)
1133                                 {
1134                                         /*
1135                                          * Oops, this buffer now needs to be backed up, but we
1136                                          * didn't think so above.  Start over.
1137                                          */
1138                                         WALInsertSlotRelease();
1139                                         END_CRIT_SECTION();
1140                                         rdt_lastnormal->next = NULL;    /* unlink the backup-block entries */
1141                                         info = info_orig;       /* forget the XLR_BKP_BLOCK bits set above */
1142                                         goto begin;
1143                                 }
1144                         }
1145                 }
1146         }
1147
1148         /*
1149          * Also check to see if fullPageWrites or forcePageWrites was just turned
1150          * on; if we weren't already doing full-page writes then go back and
1151          * recompute. (If it was just turned off, we could recompute the record
1152          * without full pages, but we choose not to bother.)
1153          */
1154         if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
1155         {
1156                 /* Oops, must redo it with full-page data. */
1157                 WALInsertSlotRelease();
1158                 END_CRIT_SECTION();
1159                 rdt_lastnormal->next = NULL;    /* unlink the backup-block entries */
1160                 info = info_orig;       /* forget the XLR_BKP_BLOCK bits set above */
1161                 goto begin;
1162         }
1163
1164         /*
1165          * Reserve space for the record in the WAL. This also sets the xl_prev
1166          * pointer.
1167          */
1168         if (isLogSwitch)
1169                 inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
1170         else
1171         {
1172                 ReserveXLogInsertLocation(write_len, &StartPos, &EndPos,
1173                                                                   &rechdr->xl_prev);
1174                 inserted = true;
1175         }
1176
1177         if (inserted)
1178         {
1179                 /*
1180                  * Now that xl_prev has been filled in, finish CRC calculation of the
1181                  * record header.
1182                  */
1183                 COMP_CRC32(rdata_crc, ((char *) &rechdr->xl_prev), sizeof(XLogRecPtr));
1184                 FIN_CRC32(rdata_crc);
1185                 rechdr->xl_crc = rdata_crc;
1186
1187                 /*
1188                  * All the record data, including the header, is now ready to be
1189                  * inserted. Copy the record in the space reserved.
1190                  */
1191                 CopyXLogRecordToWAL(write_len, isLogSwitch, &hdr_rdt, StartPos, EndPos);
1192         }
1193         else
1194         {
1195                 /*
1196                  * This was an xlog-switch record, but the current insert location was
1197                  * already exactly at the beginning of a segment, so there was no need
1198                  * to do anything.
1199                  */
1200         }
1201
1202         /*
1203          * Done! Let others know that we're finished.
1204          */
1205         WALInsertSlotRelease();
1206
1207         MarkCurrentTransactionIdLoggedIfAny();
1208
1209         END_CRIT_SECTION();
1210
1211         /*
1212          * Update shared LogwrtRqst.Write, if we crossed page boundary.
1213          */
1214         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1215         {
1216                 /* use volatile pointer to prevent code rearrangement */
1217                 volatile XLogCtlData *xlogctl = XLogCtl;
1218
1219                 SpinLockAcquire(&xlogctl->info_lck);
1220                 /* advance global request to include new block(s) */
1221                 if (xlogctl->LogwrtRqst.Write < EndPos)
1222                         xlogctl->LogwrtRqst.Write = EndPos;
1223                 /* update local result copy while I have the chance */
1224                 LogwrtResult = xlogctl->LogwrtResult;
1225                 SpinLockRelease(&xlogctl->info_lck);
1226         }
1227
1228         /*
1229          * If this was an XLOG_SWITCH record, flush the record and the empty
1230          * padding space that fills the rest of the segment, and perform
1231          * end-of-segment actions (eg, notifying archiver).
1232          */
1233         if (isLogSwitch)
1234         {
1235                 TRACE_POSTGRESQL_XLOG_SWITCH();
1236                 XLogFlush(EndPos);
1237                 /*
1238                  * Even though we reserved the rest of the segment for us, which is
1239                  * reflected in EndPos, we return a pointer to just the end of the
1240                  * xlog-switch record.
1241                  */
1242                 if (inserted)
1243                 {
1244                         EndPos = StartPos + SizeOfXLogRecord;
1245                         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1246                         {
1247                                 if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
1248                                         EndPos += SizeOfXLogLongPHD;
1249                                 else
1250                                         EndPos += SizeOfXLogShortPHD;
1251                         }
1252                 }
1253         }
1254
1255 #ifdef WAL_DEBUG
1256         if (XLOG_DEBUG)
1257         {
1258                 StringInfoData buf;
1259
1260                 initStringInfo(&buf);
1261                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1262                                                  (uint32) (EndPos >> 32), (uint32) EndPos);
1263                 xlog_outrec(&buf, rechdr);
1264                 if (rdata->data != NULL)
1265                 {
1266                         appendStringInfoString(&buf, " - ");
1267                         RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
1268                 }
1269                 elog(LOG, "%s", buf.data);
1270                 pfree(buf.data);
1271         }
1272 #endif
1273
1274         /*
1275          * Update our global variables with the start and end of this record
1276          */
1277         ProcLastRecPtr = StartPos;
1278         XactLastRecEnd = EndPos;
1279
1280         return EndPos;
1281 }
1282
1283 /*
1284  * Reserves the right amount of space for a record of given size from the WAL.
1285  * *StartPos is set to the beginning of the reserved section, *EndPos to
1286  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1287  * used to set the xl_prev of this record.
1288  *
1289  * This is the performance critical part of XLogInsert that must be serialized
1290  * across backends. The rest can happen mostly in parallel. Try to keep this
1291  * section as short as possible, insertpos_lck can be heavily contended on a
1292  * busy system.
1293  *
1294  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1295  * where we actually copy the record to the reserved space.
1296  */
1297 static void
1298 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1299                                                   XLogRecPtr *PrevPtr)
1300 {
1301         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1302         uint64          startbytepos;
1303         uint64          endbytepos;
1304         uint64          prevbytepos;
1305
1306         size = MAXALIGN(size);  /* the reservation is made for the MAXALIGN'd size */
1307
1308         /* All (non xlog-switch) records should contain data. */
1309         Assert(size > SizeOfXLogRecord);
1310
1311         /*
1312          * The duration the spinlock needs to be held is minimized by minimizing
1313          * the calculations that have to be done while holding the lock. The
1314          * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1315          * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1316          * page headers. The mapping between "usable" byte positions and physical
1317          * positions (XLogRecPtrs) can be done outside the locked region, and
1318          * because the usable byte position doesn't include any headers, reserving
1319          * X bytes from WAL is almost as simple as "CurrBytePos += X".
1320          */
1321         SpinLockAcquire(&Insert->insertpos_lck);
1322
1323         startbytepos = Insert->CurrBytePos;
1324         endbytepos = startbytepos + size;
1325         prevbytepos = Insert->PrevBytePos;      /* where the previous record begins */
1326         Insert->CurrBytePos = endbytepos;
1327         Insert->PrevBytePos = startbytepos;     /* the next reservation will see this record as its predecessor */
1328
1329         SpinLockRelease(&Insert->insertpos_lck);
1330
1331         /* Map usable byte positions to XLogRecPtrs now that the lock is released. */
1332         *StartPos = XLogBytePosToRecPtr(startbytepos);
1333         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1334         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1335         /*
1336          * Check that the conversions between "usable byte positions" and
1337          * XLogRecPtrs work consistently in both directions.
1338          */
1339         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1340         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1341         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1342 }
1343
1344 /*
1345  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1346  *
1347  * A log-switch record is handled slightly differently. The rest of the
1348  * segment will be reserved for this insertion, as indicated by the returned
1349  * *EndPos value. However, if we are already at the beginning of the current
1350  * segment, *StartPos and *EndPos are set to the current location without
1351  * reserving any space, and the function returns false; otherwise true.
1352  */
1353 static bool
1354 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1355 {
1356         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1357         uint64          startbytepos;
1358         uint64          endbytepos;
1359         uint64          prevbytepos;
1360         uint32          size = SizeOfXLogRecord;        /* only a record header is reserved for the switch record itself */
1361         XLogRecPtr      ptr;
1362         uint32          segleft;
1363
1364         /*
1365          * These calculations are a bit heavy-weight to be done while holding a
1366          * spinlock, but since we're holding all the WAL insertion slots, there
1367          * are no other inserters competing for it. GetXLogInsertRecPtr() does
1368          * compete for it, but that's not called very frequently.
1369          */
1370         SpinLockAcquire(&Insert->insertpos_lck);
1371
1372         startbytepos = Insert->CurrBytePos;
1373
1374         ptr = XLogBytePosToEndRecPtr(startbytepos);
1375         if (ptr % XLOG_SEG_SIZE == 0)   /* insert position is exactly on a segment boundary */
1376         {
1377                 SpinLockRelease(&Insert->insertpos_lck);
1378                 *EndPos = *StartPos = ptr;      /* nothing reserved; *PrevPtr is left untouched */
1379                 return false;
1380         }
1381
1382         endbytepos = startbytepos + size;
1383         prevbytepos = Insert->PrevBytePos;
1384
1385         *StartPos = XLogBytePosToRecPtr(startbytepos);
1386         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1387
1388         segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
1389         if (segleft != XLOG_SEG_SIZE)   /* i.e. *EndPos is not already on a segment boundary */
1390         {
1391                 /* consume the rest of the segment */
1392                 *EndPos += segleft;
1393                 endbytepos = XLogRecPtrToBytePos(*EndPos);
1394         }
1395         Insert->CurrBytePos = endbytepos;
1396         Insert->PrevBytePos = startbytepos;
1397
1398         SpinLockRelease(&Insert->insertpos_lck);
1399
1400         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1401
1402         Assert((*EndPos) % XLOG_SEG_SIZE == 0);
1403         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1404         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1405         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1406
1407         return true;
1408 }
1409
1410 /*
1411  * Subroutine of XLogInsert.  Copies a WAL record to an already-reserved
1412  * area in the WAL.
1413  */
1414 static void
1415 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1416                                         XLogRecPtr StartPos, XLogRecPtr EndPos)
1417 {
1418         char       *currpos;
1419         int                     freespace;
1420         int                     written;
1421         XLogRecPtr      CurrPos;
1422         XLogPageHeader pagehdr;
1423
1424         /* The first chunk is the record header */
1425         Assert(rdata->len == SizeOfXLogRecord);
1426
1427         /*
1428          * Get a pointer to the right place in the right WAL buffer to start
1429          * inserting to.
1430          */
1431         CurrPos = StartPos;
1432         currpos = GetXLogBuffer(CurrPos);
1433         freespace = INSERT_FREESPACE(CurrPos);
1434
1435         /*
1436          * there should be enough space for at least the first field (xl_tot_len)
1437          * on this page.
1438          */
1439         Assert(freespace >= sizeof(uint32));
1440
1441         /* Copy record data */
1442         written = 0;
1443         while (rdata != NULL)
1444         {
1445                 char       *rdata_data = rdata->data;
1446                 int                     rdata_len = rdata->len;
1447
1448                 while (rdata_len > freespace)
1449                 {
1450                         /*
1451                          * Write what fits on this page, and continue on the next page.
1452                          */
1453                         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1454                         memcpy(currpos, rdata_data, freespace);
1455                         rdata_data += freespace;
1456                         rdata_len -= freespace;
1457                         written += freespace;
1458                         CurrPos += freespace;
1459
1460                         /*
1461                          * Get pointer to beginning of next page, and set the xlp_rem_len
1462                          * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1463                          *
1464                          * It's safe to set the contrecord flag and xlp_rem_len without a
1465                          * lock on the page. All the other flags were already set when the
1466                          * page was initialized, in AdvanceXLInsertBuffer, and we're the
1467                          * only backend that needs to set the contrecord flag.
1468                          */
1469                         currpos = GetXLogBuffer(CurrPos);
1470                         pagehdr = (XLogPageHeader) currpos;
1471                         pagehdr->xlp_rem_len = write_len - written;
1472                         pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1473
1474                         /* skip over the page header */
1475                         if (CurrPos % XLogSegSize == 0)
1476                         {
1477                                 CurrPos += SizeOfXLogLongPHD;
1478                                 currpos += SizeOfXLogLongPHD;
1479                         }
1480                         else
1481                         {
1482                                 CurrPos += SizeOfXLogShortPHD;
1483                                 currpos += SizeOfXLogShortPHD;
1484                         }
1485                         freespace = INSERT_FREESPACE(CurrPos);
1486                 }
1487
1488                 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1489                 memcpy(currpos, rdata_data, rdata_len);
1490                 currpos += rdata_len;
1491                 CurrPos += rdata_len;
1492                 freespace -= rdata_len;
1493                 written += rdata_len;
1494
1495                 rdata = rdata->next;
1496         }
1497         Assert(written == write_len);
1498
1499         /* Align the end position, so that the next record starts aligned */
1500         CurrPos = MAXALIGN64(CurrPos);
1501
1502         /*
1503          * If this was an xlog-switch, it's not enough to write the switch record,
1504          * we also have to consume all the remaining space in the WAL segment.
1505          * We have already reserved it for us, but we still need to make sure it's
1506          * allocated and zeroed in the WAL buffers so that when the caller (or
1507          * someone else) does XLogWrite(), it can really write out all the zeros.
1508          */
1509         if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
1510         {
1511                 /* An xlog-switch record doesn't contain any data besides the header */
1512                 Assert(write_len == SizeOfXLogRecord);
1513
1514                 /*
1515                  * We do this one page at a time, to make sure we don't deadlock
1516                  * against ourselves if wal_buffers < XLOG_SEG_SIZE.
1517                  */
1518                 Assert(EndPos % XLogSegSize == 0);
1519
1520                 /* Use up all the remaining space on the first page */
1521                 CurrPos += freespace;
1522
1523                 while (CurrPos < EndPos)
1524                 {
1525                         /* initialize the next page (if not initialized already) */
1526                         WakeupWaiters(CurrPos);
1527                         AdvanceXLInsertBuffer(CurrPos, false);
1528                         CurrPos += XLOG_BLCKSZ;
1529                 }
1530         }
1531
1532         if (CurrPos != EndPos)
1533                 elog(PANIC, "space reserved for WAL record does not match what was written");
1534 }
1535
1536 /*
1537  * Allocate a slot for insertion.
1538  *
1539  * In exclusive mode, all slots are reserved for the current process. That
1540  * blocks all concurrent insertions.
1541  */
1542 static void
1543 WALInsertSlotAcquire(bool exclusive)
1544 {
1545         int                     i;
1546
1547         if (exclusive)
1548         {
1549                 for (i = 0; i < num_xloginsert_slots; i++)
1550                         WALInsertSlotAcquireOne(i);
1551                 holdingAllSlots = true;
1552         }
1553         else
1554                 WALInsertSlotAcquireOne(-1);
1555 }
1556
1557 /*
1558  * Workhorse of WALInsertSlotAcquire. Acquires the given slot, or an arbitrary
1559  * one if slotno == -1. The index of the slot that was acquired is stored in
1560  * MySlotNo.
1561  *
1562  * This is more or less equivalent to LWLockAcquire().
1563  */
1564 static void
1565 WALInsertSlotAcquireOne(int slotno)
1566 {
1567         volatile XLogInsertSlot *slot;
1568         PGPROC     *proc = MyProc;
1569         bool            retry = false;
1570         int                     extraWaits = 0;
1571         static int      slotToTry = -1;
1572
1573         /*
1574          * Try to use the slot we used last time. If the system isn't particularly
1575          * busy, it's a good bet that it's available, and it's good to have some
1576          * affinity to a particular slot so that you don't unnecessarily bounce
1577          * cache lines between processes when there is no contention.
1578          *
1579          * If this is the first time through in this backend, pick a slot
1580          * (semi-)randomly. This allows the slots to be used evenly if you have a
1581          * lot of very short connections.
1582          */
1583         if (slotno != -1)
1584                 MySlotNo = slotno;
1585         else
1586         {
1587                 if (slotToTry == -1)
1588                         slotToTry = MyProc->pgprocno % num_xloginsert_slots;
1589                 MySlotNo = slotToTry;
1590         }
1591
1592         /*
1593          * We can't wait if we haven't got a PGPROC.  This should only occur
1594          * during bootstrap or shared memory initialization.  Put an Assert here
1595          * to catch unsafe coding practices.
1596          */
1597         Assert(MyProc != NULL);
1598
1599         /*
1600          * Lock out cancel/die interrupts until we exit the code section protected
1601          * by the slot.  This ensures that interrupts will not interfere with
1602          * manipulations of data structures in shared memory. There is no cleanup
1603          * mechanism to release the slot if the backend dies while holding one,
1604          * so make this a critical section.
1605          */
1606         START_CRIT_SECTION();
1607
1608         /*
1609          * Loop here to try to acquire slot after each time we are signaled by
1610          * WALInsertSlotRelease.
1611          */
1612         for (;;)
1613         {
1614                 bool            mustwait;
1615
1616                 slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
1617
1618                 /* Acquire mutex.  Time spent holding mutex should be short! */
1619                 SpinLockAcquire(&slot->mutex);
1620
1621                 /* If retrying, allow WALInsertSlotRelease to release waiters again */
1622                 if (retry)
1623                         slot->releaseOK = true;
1624
1625                 /* If I can get the slot, do so quickly. */
1626                 if (slot->exclusive == 0)
1627                 {
1628                         slot->exclusive++;
1629                         mustwait = false;
1630                 }
1631                 else
1632                         mustwait = true;
1633
1634                 if (!mustwait)
1635                         break;                          /* got the lock */
1636
1637                 Assert(slot->owner != MyProc);
1638
1639                 /*
1640                  * Add myself to wait queue.
1641                  */
1642                 proc->lwWaiting = true;
1643                 proc->lwWaitMode = LW_EXCLUSIVE;
1644                 proc->lwWaitLink = NULL;
1645                 if (slot->head == NULL)
1646                         slot->head = proc;
1647                 else
1648                         slot->tail->lwWaitLink = proc;
1649                 slot->tail = proc;
1650
1651                 /* Can release the mutex now */
1652                 SpinLockRelease(&slot->mutex);
1653
1654                 /*
1655                  * Wait until awakened.
1656                  *
1657                  * Since we share the process wait semaphore with the regular lock
1658                  * manager and ProcWaitForSignal, and we may need to acquire a slot
1659                  * while one of those is pending, it is possible that we get awakened
1660                  * for a reason other than being signaled by WALInsertSlotRelease. If
1661                  * so, loop back and wait again.  Once we've gotten the slot,
1662                  * re-increment the sema by the number of additional signals received,
1663                  * so that the lock manager or signal manager will see the received
1664                  * signal when it next waits.
1665                  */
1666                 for (;;)
1667                 {
1668                         /* "false" means cannot accept cancel/die interrupt here. */
1669                         PGSemaphoreLock(&proc->sem, false);
1670                         if (!proc->lwWaiting)
1671                                 break;
1672                         extraWaits++;
1673                 }
1674
1675                 /* Now loop back and try to acquire lock again. */
1676                 retry = true;
1677         }
1678
1679         slot->owner = proc;
1680
1681         /*
1682          * Normally, we initialize the xlogInsertingAt value of the slot to 1,
1683          * because we don't yet know where in the WAL we're going to insert. It's
1684          * not critical what it points to right now - leaving it to a too small
1685          * value just means that WaitXlogInsertionsToFinish() might wait on us
1686          * unnecessarily, until we update the value (when we finish the insert or
1687          * move to next page).
1688          *
1689          * If we're grabbing all the slots, however, stamp all but the last one
1690          * with InvalidXLogRecPtr, meaning there is no insert in progress. The last
1691          * slot is the one that we will update as we proceed with the insert, the
1692          * rest are held just to keep off other inserters.
1693          */
1694         if (slotno != -1 && slotno != num_xloginsert_slots - 1)
1695                 slot->xlogInsertingAt = InvalidXLogRecPtr;
1696         else
1697                 slot->xlogInsertingAt = 1;
1698
1699         /* We are done updating shared state of the slot itself. */
1700         SpinLockRelease(&slot->mutex);
1701
1702         /*
1703          * Fix the process wait semaphore's count for any absorbed wakeups.
1704          */
1705         while (extraWaits-- > 0)
1706                 PGSemaphoreUnlock(&proc->sem);
1707
1708         /*
1709          * If we couldn't get the slot immediately, try another slot next time.
1710          * On a system with more insertion slots than concurrent inserters, this
1711          * causes all the inserters to eventually migrate to a slot that no-one
1712          * else is using. On a system with more inserters than slots, it still
1713          * causes the inserters to be distributed quite evenly across the slots.
1714          */
1715         if (slotno != -1 && retry)
1716                 slotToTry = (slotToTry + 1) % num_xloginsert_slots;
1717 }
1718
1719 /*
1720  * Wait for the given slot to become free, or for its xlogInsertingAt location
1721  * to change to something else than 'waitptr'. In other words, wait for the
1722  * inserter using the given slot to finish its insertion, or to at least make
1723  * some progress.
1724  */
1725 static void
1726 WaitOnSlot(volatile XLogInsertSlot *slot, XLogRecPtr waitptr)
1727 {
1728         PGPROC     *proc = MyProc;
1729         int                     extraWaits = 0;
1730
1731         /*
1732          * Lock out cancel/die interrupts while we sleep on the slot. There is
1733          * no cleanup mechanism to remove us from the wait queue if we got
1734          * interrupted.
1735          */
1736         HOLD_INTERRUPTS();
1737
1738         /*
1739          * Loop here to try to acquire lock after each time we are signaled.
1740          */
1741         for (;;)
1742         {
1743                 bool            mustwait;
1744
1745                 /* Acquire mutex.  Time spent holding mutex should be short! */
1746                 SpinLockAcquire(&slot->mutex);
1747
1748                 /* If I can get the lock, do so quickly. */
1749                 if (slot->exclusive == 0 || slot->xlogInsertingAt != waitptr)
1750                         mustwait = false;
1751                 else
1752                         mustwait = true;
1753
1754                 if (!mustwait)
1755                         break;                          /* the lock was free */
1756
1757                 Assert(slot->owner != MyProc);
1758
1759                 /*
1760                  * Add myself to wait queue.
1761                  */
1762                 proc->lwWaiting = true;
1763                 proc->lwWaitMode = LW_WAIT_UNTIL_FREE;
1764                 proc->lwWaitLink = NULL;
1765
1766                 /* waiters are added to the front of the queue */
1767                 proc->lwWaitLink = slot->head;
1768                 if (slot->head == NULL)
1769                         slot->tail = proc;
1770                 slot->head = proc;
1771
1772                 /* Can release the mutex now */
1773                 SpinLockRelease(&slot->mutex);
1774
1775                 /*
1776                  * Wait until awakened.
1777                  *
1778                  * Since we share the process wait semaphore with other things, like
1779                  * the regular lock manager and ProcWaitForSignal, and we may need to
1780                  * acquire an LWLock while one of those is pending, it is possible that
1781                  * we get awakened for a reason other than being signaled by
1782                  * LWLockRelease. If so, loop back and wait again.  Once we've gotten
1783                  * the LWLock, re-increment the sema by the number of additional
1784                  * signals received, so that the lock manager or signal manager will
1785                  * see the received signal when it next waits.
1786                  */
1787                 for (;;)
1788                 {
1789                         /* "false" means cannot accept cancel/die interrupt here. */
1790                         PGSemaphoreLock(&proc->sem, false);
1791                         if (!proc->lwWaiting)
1792                                 break;
1793                         extraWaits++;
1794                 }
1795
1796                 /* Now loop back and try to acquire lock again. */
1797         }
1798
1799         /* We are done updating shared state of the lock itself. */
1800         SpinLockRelease(&slot->mutex);
1801
1802         /*
1803          * Fix the process wait semaphore's count for any absorbed wakeups.
1804          */
1805         while (extraWaits-- > 0)
1806                 PGSemaphoreUnlock(&proc->sem);
1807
1808         /*
1809          * Now okay to allow cancel/die interrupts.
1810          */
1811         RESUME_INTERRUPTS();
1812 }
1813
1814 /*
1815  * Wake up all processes waiting for us with WaitOnSlot(). Sets our
1816  * xlogInsertingAt value to EndPos, without releasing the slot.
1817  */
1818 static void
1819 WakeupWaiters(XLogRecPtr EndPos)
1820 {
1821         volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
1822         PGPROC     *head;
1823         PGPROC     *proc;
1824         PGPROC     *next;
1825
1826         /*
1827          * If we have already reported progress up to the same point, do nothing.
1828          * No other process can modify xlogInsertingAt, so we can check this before
1829          * grabbing the spinlock.
1830          */
1831         if (slot->xlogInsertingAt == EndPos)
1832                 return;
1833         /* xlogInsertingAt should not go backwards */
1834         Assert(slot->xlogInsertingAt < EndPos);
1835
1836         /* Acquire mutex.  Time spent holding mutex should be short! */
1837         SpinLockAcquire(&slot->mutex);
1838
1839         /* we should own the slot */
1840         Assert(slot->exclusive == 1 && slot->owner == MyProc);
1841
1842         slot->xlogInsertingAt = EndPos;
1843
1844         /*
1845          * See if there are any waiters that need to be woken up.
1846          */
1847         head = slot->head;
1848
1849         if (head != NULL)
1850         {
1851                 proc = head;
1852
1853                 /* LW_WAIT_UNTIL_FREE waiters are always in the front of the queue */
1854                 next = proc->lwWaitLink;
1855                 while (next && next->lwWaitMode == LW_WAIT_UNTIL_FREE)
1856                 {
1857                         proc = next;
1858                         next = next->lwWaitLink;
1859                 }
1860
1861                 /* proc is now the last PGPROC to be released */
1862                 slot->head = next;
1863                 proc->lwWaitLink = NULL;
1864         }
1865
1866         /* We are done updating shared state of the lock itself. */
1867         SpinLockRelease(&slot->mutex);
1868
1869         /*
1870          * Awaken any waiters I removed from the queue.
1871          */
1872         while (head != NULL)
1873         {
1874                 proc = head;
1875                 head = proc->lwWaitLink;
1876                 proc->lwWaitLink = NULL;
1877                 proc->lwWaiting = false;
1878                 PGSemaphoreUnlock(&proc->sem);
1879         }
1880 }
1881
1882 /*
1883  * Release our insertion slot (or slots, if we're holding them all).
1884  */
1885 static void
1886 WALInsertSlotRelease(void)
1887 {
1888         int                     i;
1889
1890         if (holdingAllSlots)
1891         {
1892                 for (i = 0; i < num_xloginsert_slots; i++)
1893                         WALInsertSlotReleaseOne(i);
1894                 holdingAllSlots = false;
1895         }
1896         else
1897                 WALInsertSlotReleaseOne(MySlotNo);
1898 }
1899
1900 static void
1901 WALInsertSlotReleaseOne(int slotno)
1902 {
1903         volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[slotno].slot;
1904         PGPROC     *head;
1905         PGPROC     *proc;
1906
1907         /* Acquire mutex.  Time spent holding mutex should be short! */
1908         SpinLockAcquire(&slot->mutex);
1909
1910         /* we must be holding it */
1911         Assert(slot->exclusive == 1 && slot->owner == MyProc);
1912
1913         slot->xlogInsertingAt = InvalidXLogRecPtr;
1914
1915         /* Release my hold on the slot */
1916         slot->exclusive = 0;
1917         slot->owner = NULL;
1918
1919         /*
1920          * See if I need to awaken any waiters..
1921          */
1922         head = slot->head;
1923         if (head != NULL)
1924         {
1925                 if (slot->releaseOK)
1926                 {
1927                         /*
1928                          * Remove the to-be-awakened PGPROCs from the queue.
1929                          */
1930                         bool            releaseOK = true;
1931
1932                         proc = head;
1933
1934                         /*
1935                          * First wake up any backends that want to be woken up without
1936                          * acquiring the lock. These are always in the front of the queue.
1937                          */
1938                         while (proc->lwWaitMode == LW_WAIT_UNTIL_FREE && proc->lwWaitLink)
1939                                 proc = proc->lwWaitLink;
1940
1941                         /*
1942                          * Awaken the first exclusive-waiter, if any.
1943                          */
1944                         if (proc->lwWaitLink)
1945                         {
1946                                 Assert(proc->lwWaitLink->lwWaitMode == LW_EXCLUSIVE);
1947                                 proc = proc->lwWaitLink;
1948                                 releaseOK = false;
1949                         }
1950                         /* proc is now the last PGPROC to be released */
1951                         slot->head = proc->lwWaitLink;
1952                         proc->lwWaitLink = NULL;
1953
1954                         slot->releaseOK = releaseOK;
1955                 }
1956                 else
1957                         head = NULL;
1958         }
1959
1960         /* We are done updating shared state of the slot itself. */
1961         SpinLockRelease(&slot->mutex);
1962
1963         /*
1964          * Awaken any waiters I removed from the queue.
1965          */
1966         while (head != NULL)
1967         {
1968                 proc = head;
1969                 head = proc->lwWaitLink;
1970                 proc->lwWaitLink = NULL;
1971                 proc->lwWaiting = false;
1972                 PGSemaphoreUnlock(&proc->sem);
1973         }
1974
1975         /*
1976          * Now okay to allow cancel/die interrupts.
1977          */
1978         END_CRIT_SECTION();
1979 }
1980
1981
1982 /*
1983  * Wait for any WAL insertions < upto to finish.
1984  *
1985  * Returns the location of the oldest insertion that is still in-progress.
1986  * Any WAL prior to that point has been fully copied into WAL buffers, and
1987  * can be flushed out to disk. Because this waits for any insertions older
1988  * than 'upto' to finish, the return value is always >= 'upto'.
1989  *
1990  * Note: When you are about to write out WAL, you must call this function
1991  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1992  * need to wait for an insertion to finish (or at least advance to next
1993  * uninitialized page), and the inserter might need to evict an old WAL buffer
1994  * to make room for a new one, which in turn requires WALWriteLock.
1995  */
1996 static XLogRecPtr
1997 WaitXLogInsertionsToFinish(XLogRecPtr upto)
1998 {
1999         uint64          bytepos;
2000         XLogRecPtr      reservedUpto;
2001         XLogRecPtr      finishedUpto;
2002         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
2003         int                     i;
2004
2005         if (MyProc == NULL)
2006                 elog(PANIC, "cannot wait without a PGPROC structure");
2007
2008         /* Read the current insert position */
2009         SpinLockAcquire(&Insert->insertpos_lck);
2010         bytepos = Insert->CurrBytePos;
2011         SpinLockRelease(&Insert->insertpos_lck);
2012         reservedUpto = XLogBytePosToEndRecPtr(bytepos);
2013
2014         /*
2015          * No-one should request to flush a piece of WAL that hasn't even been
2016          * reserved yet. However, it can happen if there is a block with a bogus
2017          * LSN on disk, for example. XLogFlush checks for that situation and
2018          * complains, but only after the flush. Here we just assume that to mean
2019          * that all WAL that has been reserved needs to be finished. In this
2020          * corner-case, the return value can be smaller than 'upto' argument.
2021          */
2022         if (upto > reservedUpto)
2023         {
2024                 elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
2025                          (uint32) (upto >> 32), (uint32) upto,
2026                          (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
2027                 upto = reservedUpto;
2028         }
2029
2030         /*
2031          * finishedUpto is our return value, indicating the point upto which
2032          * all the WAL insertions have been finished. Initialize it to the head
2033          * of reserved WAL, and as we iterate through the insertion slots, back it
2034          * out for any insertion that's still in progress.
2035          */
2036         finishedUpto = reservedUpto;
2037
2038         /*
2039          * Loop through all the slots, sleeping on any in-progress insert older
2040          * than 'upto'.
2041          */
2042         for (i = 0; i < num_xloginsert_slots; i++)
2043         {
2044                 volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
2045                 XLogRecPtr insertingat;
2046
2047         retry:
2048                 /*
2049                  * We can check if the slot is in use without grabbing the spinlock.
2050                  * The spinlock acquisition of insertpos_lck before this loop acts
2051                  * as a memory barrier. If someone acquires the slot after that, it
2052                  * can't possibly be inserting to anything < reservedUpto. If it was
2053                  * acquired before that, an unlocked test will return true.
2054                  */
2055                 if (!slot->exclusive)
2056                         continue;
2057
2058                 SpinLockAcquire(&slot->mutex);
2059                 /* re-check now that we have the lock */
2060                 if (!slot->exclusive)
2061                 {
2062                         SpinLockRelease(&slot->mutex);
2063                         continue;
2064                 }
2065                 insertingat = slot->xlogInsertingAt;
2066                 SpinLockRelease(&slot->mutex);
2067
2068                 if (insertingat == InvalidXLogRecPtr)
2069                 {
2070                         /*
2071                          * slot is reserved just to hold off other inserters, there is no
2072                          * actual insert in progress.
2073                          */
2074                         continue;
2075                 }
2076
2077                 /*
2078                  * This insertion is still in progress. Do we need to wait for it?
2079                  *
2080                  * When an inserter acquires a slot, it doesn't reset 'insertingat', so
2081                  * it will initially point to the old value of some already-finished
2082                  * insertion. The inserter will update the value as soon as it finishes
2083                  * the insertion, moves to the next page, or has to do I/O to flush an
2084                  * old dirty buffer. That means that when we see a slot with
2085                  * insertingat value < upto, we don't know if that insertion is still
2086                  * truly in progress, or if the slot is reused by a new inserter that
2087                  * hasn't updated the insertingat value yet. We have to assume it's the
2088                  * latter, and wait.
2089                  */
2090                 if (insertingat < upto)
2091                 {
2092                         WaitOnSlot(slot, insertingat);
2093                         goto retry;
2094                 }
2095                 else
2096                 {
2097                         /*
2098                          * We don't need to wait for this insertion, but update the
2099                          * return value.
2100                          */
2101                         if (insertingat < finishedUpto)
2102                                 finishedUpto = insertingat;
2103                 }
2104         }
2105         return finishedUpto;
2106 }
2107
2108 /*
2109  * Get a pointer to the right location in the WAL buffer containing the
2110  * given XLogRecPtr.
2111  *
2112  * If the page is not initialized yet, it is initialized. That might require
2113  * evicting an old dirty buffer from the buffer cache, which means I/O.
2114  *
2115  * The caller must ensure that the page containing the requested location
2116  * isn't evicted yet, and won't be evicted. The way to ensure that is to
2117  * hold onto an XLogInsertSlot with the xlogInsertingAt position set to
2118  * something <= ptr. GetXLogBuffer() will update xlogInsertingAt if it needs
2119  * to evict an old page from the buffer. (This means that once you call
2120  * GetXLogBuffer() with a given 'ptr', you must not access anything before
2121  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
2122  * later, because older buffers might be recycled already)
2123  */
2124 static char *
2125 GetXLogBuffer(XLogRecPtr ptr)
2126 {
2127         int                     idx;
2128         XLogRecPtr      endptr;
2129         static uint64 cachedPage = 0;
2130         static char *cachedPos = NULL;
2131         XLogRecPtr      expectedEndPtr;
2132
2133         /*
2134          * Fast path for the common case that we need to access again the same
2135          * page as last time.
2136          */
2137         if (ptr / XLOG_BLCKSZ == cachedPage)
2138         {
2139                 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
2140                 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
2141                 return cachedPos + ptr % XLOG_BLCKSZ;
2142         }
2143
2144         /*
2145          * The XLog buffer cache is organized so that a page is always loaded
2146          * to a particular buffer.  That way we can easily calculate the buffer
2147          * a given page must be loaded into, from the XLogRecPtr alone.
2148          */
2149         idx = XLogRecPtrToBufIdx(ptr);
2150
2151         /*
2152          * See what page is loaded in the buffer at the moment. It could be the
2153          * page we're looking for, or something older. It can't be anything newer
2154          * - that would imply the page we're looking for has already been written
2155          * out to disk and evicted, and the caller is responsible for making sure
2156          * that doesn't happen.
2157          *
2158          * However, we don't hold a lock while we read the value. If someone has
2159          * just initialized the page, it's possible that we get a "torn read" of
2160          * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
2161          * that case we will see a bogus value. That's ok, we'll grab the mapping
2162          * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
2163          * the page we're looking for. But it means that when we do this unlocked
2164          * read, we might see a value that appears to be ahead of the page we're
2165          * looking for. Don't PANIC on that, until we've verified the value while
2166          * holding the lock.
2167          */
2168         expectedEndPtr = ptr;
2169         expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
2170
2171         endptr = XLogCtl->xlblocks[idx];
2172         if (expectedEndPtr != endptr)
2173         {
2174                 /*
2175                  * Let others know that we're finished inserting the record up
2176                  * to the page boundary.
2177                  */
2178                 WakeupWaiters(expectedEndPtr - XLOG_BLCKSZ);
2179
2180                 AdvanceXLInsertBuffer(ptr, false);
2181                 endptr = XLogCtl->xlblocks[idx];
2182
2183                 if (expectedEndPtr != endptr)
2184                         elog(PANIC, "could not find WAL buffer for %X/%X",
2185                                  (uint32) (ptr >> 32) , (uint32) ptr);
2186         }
2187         else
2188         {
2189                 /*
2190                  * Make sure the initialization of the page is visible to us, and
2191                  * won't arrive later to overwrite the WAL data we write on the page.
2192                  */
2193                 pg_memory_barrier();
2194         }
2195
2196         /*
2197          * Found the buffer holding this page. Return a pointer to the right
2198          * offset within the page.
2199          */
2200         cachedPage = ptr / XLOG_BLCKSZ;
2201         cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
2202
2203         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
2204         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
2205
2206         return cachedPos + ptr % XLOG_BLCKSZ;
2207 }
2208
2209 /*
2210  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
2211  * is the position starting from the beginning of WAL, excluding all WAL
2212  * page headers.
2213  */
2214 static XLogRecPtr
2215 XLogBytePosToRecPtr(uint64 bytepos)
2216 {
2217         uint64          fullsegs;
2218         uint64          fullpages;
2219         uint64          bytesleft;
2220         uint32          seg_offset;
2221         XLogRecPtr      result;
2222
2223         fullsegs = bytepos / UsableBytesInSegment;
2224         bytesleft = bytepos % UsableBytesInSegment;
2225
2226         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2227         {
2228                 /* fits on first page of segment */
2229                 seg_offset = bytesleft + SizeOfXLogLongPHD;
2230         }
2231         else
2232         {
2233                 /* account for the first page on segment with long header */
2234                 seg_offset = XLOG_BLCKSZ;
2235                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2236
2237                 fullpages = bytesleft / UsableBytesInPage;
2238                 bytesleft = bytesleft % UsableBytesInPage;
2239
2240                 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2241         }
2242
2243         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
2244
2245         return result;
2246 }
2247
2248 /*
2249  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
2250  * returns a pointer to the beginning of the page (ie. before page header),
2251  * not to where the first xlog record on that page would go to. This is used
2252  * when converting a pointer to the end of a record.
2253  */
2254 static XLogRecPtr
2255 XLogBytePosToEndRecPtr(uint64 bytepos)
2256 {
2257         uint64          fullsegs;
2258         uint64          fullpages;
2259         uint64          bytesleft;
2260         uint32          seg_offset;
2261         XLogRecPtr      result;
2262
2263         fullsegs = bytepos / UsableBytesInSegment;
2264         bytesleft = bytepos % UsableBytesInSegment;
2265
2266         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2267         {
2268                 /* fits on first page of segment */
2269                 if (bytesleft == 0)
2270                         seg_offset = 0;
2271                 else
2272                         seg_offset = bytesleft + SizeOfXLogLongPHD;
2273         }
2274         else
2275         {
2276                 /* account for the first page on segment with long header */
2277                 seg_offset = XLOG_BLCKSZ;
2278                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2279
2280                 fullpages = bytesleft / UsableBytesInPage;
2281                 bytesleft = bytesleft % UsableBytesInPage;
2282
2283                 if (bytesleft == 0)
2284                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2285                 else
2286                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2287         }
2288
2289         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
2290
2291         return result;
2292 }
2293
2294 /*
2295  * Convert an XLogRecPtr to a "usable byte position".
2296  */
2297 static uint64
2298 XLogRecPtrToBytePos(XLogRecPtr ptr)
2299 {
2300         uint64          fullsegs;
2301         uint32          fullpages;
2302         uint32          offset;
2303         uint64          result;
2304
2305         XLByteToSeg(ptr, fullsegs);
2306
2307         fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
2308         offset = ptr % XLOG_BLCKSZ;
2309
2310         if (fullpages == 0)
2311         {
2312                 result = fullsegs * UsableBytesInSegment;
2313                 if (offset > 0)
2314                 {
2315                         Assert(offset >= SizeOfXLogLongPHD);
2316                         result += offset - SizeOfXLogLongPHD;
2317                 }
2318         }
2319         else
2320         {
2321                 result = fullsegs * UsableBytesInSegment +
2322                         (XLOG_BLCKSZ - SizeOfXLogLongPHD) +  /* account for first page */
2323                         (fullpages - 1) * UsableBytesInPage; /* full pages */
2324                 if (offset > 0)
2325                 {
2326                         Assert(offset >= SizeOfXLogShortPHD);
2327                         result += offset - SizeOfXLogShortPHD;
2328                 }
2329         }
2330
2331         return result;
2332 }
2333
2334 /*
2335  * Determine whether the buffer referenced by an XLogRecData item has to
2336  * be backed up, and if so fill a BkpBlock struct for it.  In any case
2337  * save the buffer's LSN at *lsn.
2338  */
2339 static bool
2340 XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
2341                                 XLogRecPtr *lsn, BkpBlock *bkpb)
2342 {
2343         Page            page;
2344
2345         page = BufferGetPage(rdata->buffer);
2346
2347         /*
2348          * We assume page LSN is first data on *every* page that can be passed to
2349          * XLogInsert, whether it has the standard page layout or not. We don't
2350          * need to take the buffer header lock for PageGetLSN if we hold an
2351          * exclusive lock on the page and/or the relation.
2352          */
2353         if (holdsExclusiveLock)
2354                 *lsn = PageGetLSN(page);
2355         else
2356                 *lsn = BufferGetLSNAtomic(rdata->buffer);
2357
2358         if (*lsn <= RedoRecPtr)
2359         {
2360                 /*
2361                  * The page needs to be backed up, so set up *bkpb
2362                  */
2363                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
2364
2365                 if (rdata->buffer_std)
2366                 {
2367                         /* Assume we can omit data between pd_lower and pd_upper */
2368                         uint16          lower = ((PageHeader) page)->pd_lower;
2369                         uint16          upper = ((PageHeader) page)->pd_upper;
2370
2371                         if (lower >= SizeOfPageHeaderData &&
2372                                 upper > lower &&
2373                                 upper <= BLCKSZ)
2374                         {
2375                                 bkpb->hole_offset = lower;
2376                                 bkpb->hole_length = upper - lower;
2377                         }
2378                         else
2379                         {
2380                                 /* No "hole" to compress out */
2381                                 bkpb->hole_offset = 0;
2382                                 bkpb->hole_length = 0;
2383                         }
2384                 }
2385                 else
2386                 {
2387                         /* Not a standard page header, don't try to eliminate "hole" */
2388                         bkpb->hole_offset = 0;
2389                         bkpb->hole_length = 0;
2390                 }
2391
2392                 return true;                    /* buffer requires backup */
2393         }
2394
2395         return false;                           /* buffer does not need to be backed up */
2396 }
2397
2398 /*
2399  * Initialize XLOG buffers, writing out old buffers if they still contain
2400  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2401  * true, initialize as many pages as we can without having to write out
2402  * unwritten data. Any new pages are initialized to zeros, with pages headers
2403  * initialized properly.
2404  */
2405 static void
2406 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2407 {
2408         XLogCtlInsert *Insert = &XLogCtl->Insert;
2409         int                     nextidx;
2410         XLogRecPtr      OldPageRqstPtr;
2411         XLogwrtRqst WriteRqst;
2412         XLogRecPtr      NewPageEndPtr = InvalidXLogRecPtr;
2413         XLogRecPtr      NewPageBeginPtr;
2414         XLogPageHeader NewPage;
2415         int                     npages = 0;
2416
2417         LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2418
2419         /*
2420          * Now that we have the lock, check if someone initialized the page
2421          * already.
2422          */
2423         while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2424         {
2425                 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2426
2427                 /*
2428                  * Get ending-offset of the buffer page we need to replace (this may
2429                  * be zero if the buffer hasn't been used yet).  Fall through if it's
2430                  * already written out.
2431                  */
2432                 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2433                 if (LogwrtResult.Write < OldPageRqstPtr)
2434                 {
2435                         /*
2436                          * Nope, got work to do. If we just want to pre-initialize as much
2437                          * as we can without flushing, give up now.
2438                          */
2439                         if (opportunistic)
2440                                 break;
2441
2442                         /* Before waiting, get info_lck and update LogwrtResult */
2443                         {
2444                                 /* use volatile pointer to prevent code rearrangement */
2445                                 volatile XLogCtlData *xlogctl = XLogCtl;
2446
2447                                 SpinLockAcquire(&xlogctl->info_lck);
2448                                 if (xlogctl->LogwrtRqst.Write < OldPageRqstPtr)
2449                                         xlogctl->LogwrtRqst.Write = OldPageRqstPtr;
2450                                 LogwrtResult = xlogctl->LogwrtResult;
2451                                 SpinLockRelease(&xlogctl->info_lck);
2452                         }
2453
2454                         /*
2455                          * Now that we have an up-to-date LogwrtResult value, see if we
2456                          * still need to write it or if someone else already did.
2457                          */
2458                         if (LogwrtResult.Write < OldPageRqstPtr)
2459                         {
2460                                 /*
2461                                  * Must acquire write lock. Release WALBufMappingLock first,
2462                                  * to make sure that all insertions that we need to wait for
2463                                  * can finish (up to this same position). Otherwise we risk
2464                                  * deadlock.
2465                                  */
2466                                 LWLockRelease(WALBufMappingLock);
2467
2468                                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2469
2470                                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2471
2472                                 LogwrtResult = XLogCtl->LogwrtResult;
2473                                 if (LogwrtResult.Write >= OldPageRqstPtr)
2474                                 {
2475                                         /* OK, someone wrote it already */
2476                                         LWLockRelease(WALWriteLock);
2477                                 }
2478                                 else
2479                                 {
2480                                         /* Have to write it ourselves */
2481                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2482                                         WriteRqst.Write = OldPageRqstPtr;
2483                                         WriteRqst.Flush = 0;
2484                                         XLogWrite(WriteRqst, false);
2485                                         LWLockRelease(WALWriteLock);
2486                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2487                                 }
2488                                 /* Re-acquire WALBufMappingLock and retry */
2489                                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2490                                 continue;
2491                         }
2492                 }
2493
2494                 /*
2495                  * Now the next buffer slot is free and we can set it up to be the next
2496                  * output page.
2497                  */
2498                 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2499                 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2500
2501                 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2502
2503                 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2504
2505                 /*
2506                  * Be sure to re-zero the buffer so that bytes beyond what we've
2507                  * written will look like zeroes and not valid XLOG records...
2508                  */
2509                 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2510
2511                 /*
2512                  * Fill the new page's header
2513                  */
2514                 NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
2515
2516                 /* NewPage->xlp_info = 0; */    /* done by memset */
2517                 NewPage   ->xlp_tli = ThisTimeLineID;
2518                 NewPage   ->xlp_pageaddr = NewPageBeginPtr;
2519                 /* NewPage->xlp_rem_len = 0; */         /* done by memset */
2520
2521                 /*
2522                  * If online backup is not in progress, mark the header to indicate
2523                  * that* WAL records beginning in this page have removable backup
2524                  * blocks.  This allows the WAL archiver to know whether it is safe to
2525                  * compress archived WAL data by transforming full-block records into
2526                  * the non-full-block format.  It is sufficient to record this at the
2527                  * page level because we force a page switch (in fact a segment switch)
2528                  * when starting a backup, so the flag will be off before any records
2529                  * can be written during the backup.  At the end of a backup, the last
2530                  * page will be marked as all unsafe when perhaps only part is unsafe,
2531                  * but at worst the archiver would miss the opportunity to compress a
2532                  * few records.
2533                  */
2534                 if (!Insert->forcePageWrites)
2535                         NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;
2536
2537                 /*
2538                  * If first page of an XLOG segment file, make it a long header.
2539                  */
2540                 if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
2541                 {
2542                         XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2543
2544                         NewLongPage->xlp_sysid = ControlFile->system_identifier;
2545                         NewLongPage->xlp_seg_size = XLogSegSize;
2546                         NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2547                         NewPage   ->xlp_info |= XLP_LONG_HEADER;
2548                 }
2549
2550                 /*
2551                  * Make sure the initialization of the page becomes visible to others
2552                  * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2553                  * holding a lock.
2554                  */
2555                 pg_write_barrier();
2556
2557                 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2558
2559                 XLogCtl->InitializedUpTo = NewPageEndPtr;
2560
2561                 npages++;
2562         }
2563         LWLockRelease(WALBufMappingLock);
2564
2565 #ifdef WAL_DEBUG
2566         if (npages > 0)
2567         {
2568                 elog(DEBUG1, "initialized %d pages, upto %X/%X",
2569                          npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
2570         }
2571 #endif
2572 }
2573
2574 /*
2575  * Check whether we've consumed enough xlog space that a checkpoint is needed.
2576  *
2577  * new_segno indicates a log file that has just been filled up (or read
2578  * during recovery). We measure the distance from RedoRecPtr to new_segno
2579  * and see if that exceeds CheckPointSegments.
2580  *
2581  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2582  */
2583 static bool
2584 XLogCheckpointNeeded(XLogSegNo new_segno)
2585 {
2586         XLogSegNo       old_segno;
2587
2588         XLByteToSeg(RedoRecPtr, old_segno);
2589
2590         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2591                 return true;
2592         return false;
2593 }
2594
2595 /*
2596  * Write and/or fsync the log at least as far as WriteRqst indicates.
2597  *
2598  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
2599  * may stop at any convenient boundary (such as a cache or logfile boundary).
2600  * This option allows us to avoid uselessly issuing multiple writes when a
2601  * single one would do.
2602  *
2603  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2604  * must be called before grabbing the lock, to make sure the data is ready to
2605  * write.
2606  */
2607 static void
2608 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2609 {
2610         bool            ispartialpage;
2611         bool            last_iteration;
2612         bool            finishing_seg;
2613         bool            use_existent;
2614         int                     curridx;
2615         int                     npages;
2616         int                     startidx;
2617         uint32          startoffset;
2618
2619         /* We should always be inside a critical section here */
2620         Assert(CritSectionCount > 0);
2621
2622         /*
2623          * Update local LogwrtResult (caller probably did this already, but...)
2624          */
2625         LogwrtResult = XLogCtl->LogwrtResult;
2626
2627         /*
2628          * Since successive pages in the xlog cache are consecutively allocated,
2629          * we can usually gather multiple pages together and issue just one
2630          * write() call.  npages is the number of pages we have determined can be
2631          * written together; startidx is the cache block index of the first one,
2632          * and startoffset is the file offset at which it should go. The latter
2633          * two variables are only valid when npages > 0, but we must initialize
2634          * all of them to keep the compiler quiet.
2635          */
2636         npages = 0;
2637         startidx = 0;
2638         startoffset = 0;
2639
2640         /*
2641          * Within the loop, curridx is the cache block index of the page to
2642          * consider writing.  Begin at the buffer containing the next unwritten
2643          * page, or last partially written page.
2644          */
2645         curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2646
2647         while (LogwrtResult.Write < WriteRqst.Write)
2648         {
2649                 /*
2650                  * Make sure we're not ahead of the insert process.  This could happen
2651                  * if we're passed a bogus WriteRqst.Write that is past the end of the
2652                  * last page that's been initialized by AdvanceXLInsertBuffer.
2653                  */
2654                 XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
2655                 if (LogwrtResult.Write >= EndPtr)
2656                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2657                                  (uint32) (LogwrtResult.Write >> 32),
2658                                  (uint32) LogwrtResult.Write,
2659                                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
2660
2661                 /* Advance LogwrtResult.Write to end of current buffer page */
2662                 LogwrtResult.Write = EndPtr;
2663                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2664
2665                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2666                 {
2667                         /*
2668                          * Switch to new logfile segment.  We cannot have any pending
2669                          * pages here (since we dump what we have at segment end).
2670                          */
2671                         Assert(npages == 0);
2672                         if (openLogFile >= 0)
2673                                 XLogFileClose();
2674                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2675
2676                         /* create/use new log file */
2677                         use_existent = true;
2678                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2679                         openLogOff = 0;
2680                 }
2681
2682                 /* Make sure we have the current logfile open */
2683                 if (openLogFile < 0)
2684                 {
2685                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2686                         openLogFile = XLogFileOpen(openLogSegNo);
2687                         openLogOff = 0;
2688                 }
2689
2690                 /* Add current page to the set of pending pages-to-dump */
2691                 if (npages == 0)
2692                 {
2693                         /* first of group */
2694                         startidx = curridx;
2695                         startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
2696                 }
2697                 npages++;
2698
2699                 /*
2700                  * Dump the set if this will be the last loop iteration, or if we are
2701                  * at the last page of the cache area (since the next page won't be
2702                  * contiguous in memory), or if we are at the end of the logfile
2703                  * segment.
2704                  */
2705                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2706
2707                 finishing_seg = !ispartialpage &&
2708                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
2709
2710                 if (last_iteration ||
2711                         curridx == XLogCtl->XLogCacheBlck ||
2712                         finishing_seg)
2713                 {
2714                         char       *from;
2715                         Size            nbytes;
2716                         Size            nleft;
2717                         int                     written;
2718
2719                         /* Need to seek in the file? */
2720                         if (openLogOff != startoffset)
2721                         {
2722                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
2723                                         ereport(PANIC,
2724                                                         (errcode_for_file_access(),
2725                                          errmsg("could not seek in log file %s to offset %u: %m",
2726                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2727                                                         startoffset)));
2728                                 openLogOff = startoffset;
2729                         }
2730
2731                         /* OK to write the page(s) */
2732                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2733                         nbytes = npages * (Size) XLOG_BLCKSZ;
2734                         nleft = nbytes;
2735                         do
2736                         {
2737                                 errno = 0;
2738                                 written  = write(openLogFile, from, nleft);
2739                                 if (written <= 0)
2740                                 {
2741                                         if (errno == EINTR)
2742                                                 continue;
2743                                         ereport(PANIC,
2744                                                         (errcode_for_file_access(),
2745                                                          errmsg("could not write to log file %s "
2746                                                                         "at offset %u, length %zu: %m",
2747                                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2748                                                                         openLogOff, nbytes)));
2749                                 }
2750                                 nleft -= written;
2751                                 from += written;
2752                         } while (nleft > 0);
2753
2754                         /* Update state for write */
2755                         openLogOff += nbytes;
2756                         npages = 0;
2757
2758                         /*
2759                          * If we just wrote the whole last page of a logfile segment,
2760                          * fsync the segment immediately.  This avoids having to go back
2761                          * and re-open prior segments when an fsync request comes along
2762                          * later. Doing it here ensures that one and only one backend will
2763                          * perform this fsync.
2764                          *
2765                          * This is also the right place to notify the Archiver that the
2766                          * segment is ready to copy to archival storage, and to update the
2767                          * timer for archive_timeout, and to signal for a checkpoint if
2768                          * too many logfile segments have been used since the last
2769                          * checkpoint.
2770                          */
2771                         if (finishing_seg)
2772                         {
2773                                 issue_xlog_fsync(openLogFile, openLogSegNo);
2774
2775                                 /* signal that we need to wakeup walsenders later */
2776                                 WalSndWakeupRequest();
2777
2778                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
2779
2780                                 if (XLogArchivingActive())
2781                                         XLogArchiveNotifySeg(openLogSegNo);
2782
2783                                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2784
2785                                 /*
2786                                  * Request a checkpoint if we've consumed too much xlog since
2787                                  * the last one.  For speed, we first check using the local
2788                                  * copy of RedoRecPtr, which might be out of date; if it looks
2789                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
2790                                  * recheck.
2791                                  */
2792                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2793                                 {
2794                                         (void) GetRedoRecPtr();
2795                                         if (XLogCheckpointNeeded(openLogSegNo))
2796                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2797                                 }
2798                         }
2799                 }
2800
2801                 if (ispartialpage)
2802                 {
2803                         /* Only asked to write a partial page */
2804                         LogwrtResult.Write = WriteRqst.Write;
2805                         break;
2806                 }
2807                 curridx = NextBufIdx(curridx);
2808
2809                 /* If flexible, break out of loop as soon as we wrote something */
2810                 if (flexible && npages == 0)
2811                         break;
2812         }
2813
2814         Assert(npages == 0);
2815
2816         /*
2817          * If asked to flush, do so
2818          */
2819         if (LogwrtResult.Flush < WriteRqst.Flush &&
2820                 LogwrtResult.Flush < LogwrtResult.Write)
2821
2822         {
2823                 /*
2824                  * Could get here without iterating above loop, in which case we might
2825                  * have no open file or the wrong one.  However, we do not need to
2826                  * fsync more than one file.
2827                  */
2828                 if (sync_method != SYNC_METHOD_OPEN &&
2829                         sync_method != SYNC_METHOD_OPEN_DSYNC)
2830                 {
2831                         if (openLogFile >= 0 &&
2832                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2833                                 XLogFileClose();
2834                         if (openLogFile < 0)
2835                         {
2836                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2837                                 openLogFile = XLogFileOpen(openLogSegNo);
2838                                 openLogOff = 0;
2839                         }
2840
2841                         issue_xlog_fsync(openLogFile, openLogSegNo);
2842                 }
2843
2844                 /* signal that we need to wakeup walsenders later */
2845                 WalSndWakeupRequest();
2846
2847                 LogwrtResult.Flush = LogwrtResult.Write;
2848         }
2849
2850         /*
2851          * Update shared-memory status
2852          *
2853          * We make sure that the shared 'request' values do not fall behind the
2854          * 'result' values.  This is not absolutely essential, but it saves some
2855          * code in a couple of places.
2856          */
2857         {
2858                 /* use volatile pointer to prevent code rearrangement */
2859                 volatile XLogCtlData *xlogctl = XLogCtl;
2860
2861                 SpinLockAcquire(&xlogctl->info_lck);
2862                 xlogctl->LogwrtResult = LogwrtResult;
2863                 if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
2864                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
2865                 if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
2866                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
2867                 SpinLockRelease(&xlogctl->info_lck);
2868         }
2869 }
2870
2871 /*
2872  * Record the LSN for an asynchronous transaction commit/abort
2873  * and nudge the WALWriter if there is work for it to do.
2874  * (This should not be called for synchronous commits.)
2875  */
2876 void
2877 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2878 {
2879         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
2880         bool            sleeping;
2881
2882         /* use volatile pointer to prevent code rearrangement */
2883         volatile XLogCtlData *xlogctl = XLogCtl;
2884
2885         SpinLockAcquire(&xlogctl->info_lck);
2886         LogwrtResult = xlogctl->LogwrtResult;
2887         sleeping = xlogctl->WalWriterSleeping;
2888         if (xlogctl->asyncXactLSN < asyncXactLSN)
2889                 xlogctl->asyncXactLSN = asyncXactLSN;
2890         SpinLockRelease(&xlogctl->info_lck);
2891
2892         /*
2893          * If the WALWriter is sleeping, we should kick it to make it come out of
2894          * low-power mode.      Otherwise, determine whether there's a full page of
2895          * WAL available to write.
2896          */
2897         if (!sleeping)
2898         {
2899                 /* back off to last completed page boundary */
2900                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2901
2902                 /* if we have already flushed that far, we're done */
2903                 if (WriteRqstPtr <= LogwrtResult.Flush)
2904                         return;
2905         }
2906
2907         /*
2908          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2909          * to come out of low-power mode so that this async commit will reach disk
2910          * within the expected amount of time.
2911          */
2912         if (ProcGlobal->walwriterLatch)
2913                 SetLatch(ProcGlobal->walwriterLatch);
2914 }
2915
2916 /*
2917  * Record the LSN up to which we can remove WAL because it's not required by
2918  * any replication slot.
2919  */
2920 void
2921 XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2922 {
2923         /* use volatile pointer to prevent code rearrangement */
2924         volatile XLogCtlData *xlogctl = XLogCtl;
2925
2926         SpinLockAcquire(&xlogctl->info_lck);
2927         xlogctl->replicationSlotMinLSN = lsn;
2928         SpinLockRelease(&xlogctl->info_lck);
2929 }
2930
2931
2932 /*
2933  * Return the oldest LSN we must retain to satisfy the needs of some
2934  * replication slot.
2935  */
2936 static XLogRecPtr
2937 XLogGetReplicationSlotMinimumLSN(void)
2938 {
2939         /* use volatile pointer to prevent code rearrangement */
2940         volatile XLogCtlData *xlogctl = XLogCtl;
2941         XLogRecPtr              retval;
2942         SpinLockAcquire(&xlogctl->info_lck);
2943         retval = xlogctl->replicationSlotMinLSN;
2944         SpinLockRelease(&xlogctl->info_lck);
2945
2946         return retval;
2947 }
2948
2949 /*
2950  * Advance minRecoveryPoint in control file.
2951  *
2952  * If we crash during recovery, we must reach this point again before the
2953  * database is consistent.
2954  *
2955  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2956  * is only updated if it's not already greater than or equal to 'lsn'.
2957  */
2958 static void
2959 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2960 {
2961         /* Quick check using our local copy of the variable */
2962         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2963                 return;
2964
2965         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2966
2967         /* update local copy */
2968         minRecoveryPoint = ControlFile->minRecoveryPoint;
2969         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2970
2971         /*
2972          * An invalid minRecoveryPoint means that we need to recover all the WAL,
2973          * i.e., we're doing crash recovery.  We never modify the control file's
2974          * value in that case, so we can short-circuit future checks here too.
2975          */
2976         if (minRecoveryPoint == 0)
2977                 updateMinRecoveryPoint = false;
2978         else if (force || minRecoveryPoint < lsn)
2979         {
2980                 /* use volatile pointer to prevent code rearrangement */
2981                 volatile XLogCtlData *xlogctl = XLogCtl;
2982                 XLogRecPtr      newMinRecoveryPoint;
2983                 TimeLineID      newMinRecoveryPointTLI;
2984
2985                 /*
2986                  * To avoid having to update the control file too often, we update it
2987                  * all the way to the last record being replayed, even though 'lsn'
2988                  * would suffice for correctness.  This also allows the 'force' case
2989                  * to not need a valid 'lsn' value.
2990                  *
2991                  * Another important reason for doing it this way is that the passed
2992                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2993                  * the caller got it from a corrupted heap page.  Accepting such a
2994                  * value as the min recovery point would prevent us from coming up at
2995                  * all.  Instead, we just log a warning and continue with recovery.
2996                  * (See also the comments about corrupt LSNs in XLogFlush.)
2997                  */
2998                 SpinLockAcquire(&xlogctl->info_lck);
2999                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
3000                 newMinRecoveryPointTLI = xlogctl->replayEndTLI;
3001                 SpinLockRelease(&xlogctl->info_lck);
3002
3003                 if (!force && newMinRecoveryPoint < lsn)
3004                         elog(WARNING,
3005                            "xlog min recovery request %X/%X is past current point %X/%X",
3006                                  (uint32) (lsn >> 32), (uint32) lsn,
3007                                  (uint32) (newMinRecoveryPoint >> 32),
3008                                  (uint32) newMinRecoveryPoint);
3009
3010                 /* update control file */
3011                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
3012                 {
3013                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
3014                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
3015                         UpdateControlFile();
3016                         minRecoveryPoint = newMinRecoveryPoint;
3017                         minRecoveryPointTLI = newMinRecoveryPointTLI;
3018
3019                         ereport(DEBUG2,
3020                                 (errmsg("updated min recovery point to %X/%X on timeline %u",
3021                                                 (uint32) (minRecoveryPoint >> 32),
3022                                                 (uint32) minRecoveryPoint,
3023                                                 newMinRecoveryPointTLI)));
3024                 }
3025         }
3026         LWLockRelease(ControlFileLock);
3027 }
3028
3029 /*
3030  * Ensure that all XLOG data through the given position is flushed to disk.
3031  *
3032  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
3033  * already held, and we try to avoid acquiring it if possible.
3034  */
3035 void
3036 XLogFlush(XLogRecPtr record)
3037 {
3038         XLogRecPtr      WriteRqstPtr;
3039         XLogwrtRqst WriteRqst;
3040
3041         /*
3042          * During REDO, we are reading not writing WAL.  Therefore, instead of
3043          * trying to flush the WAL, we should update minRecoveryPoint instead. We
3044          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
3045          * to act this way too, and because when it tries to write the
3046          * end-of-recovery checkpoint, it should indeed flush.
3047          */
3048         if (!XLogInsertAllowed())
3049         {
3050                 UpdateMinRecoveryPoint(record, false);
3051                 return;
3052         }
3053
3054         /* Quick exit if already known flushed */
3055         if (record <= LogwrtResult.Flush)
3056                 return;
3057
3058 #ifdef WAL_DEBUG
3059         if (XLOG_DEBUG)
3060                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
3061                          (uint32) (record >> 32), (uint32) record,
3062                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3063                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3064 #endif
3065
3066         START_CRIT_SECTION();
3067
3068         /*
3069          * Since fsync is usually a horribly expensive operation, we try to
3070          * piggyback as much data as we can on each fsync: if we see any more data
3071          * entered into the xlog buffer, we'll write and fsync that too, so that
3072          * the final value of LogwrtResult.Flush is as large as possible. This
3073          * gives us some chance of avoiding another fsync immediately after.
3074          */
3075
3076         /* initialize to given target; may increase below */
3077         WriteRqstPtr = record;
3078
3079         /*
3080          * Now wait until we get the write lock, or someone else does the flush
3081          * for us.
3082          */
3083         for (;;)
3084         {
3085                 /* use volatile pointer to prevent code rearrangement */
3086                 volatile XLogCtlData *xlogctl = XLogCtl;
3087                 XLogRecPtr      insertpos;
3088
3089                 /* read LogwrtResult and update local state */
3090                 SpinLockAcquire(&xlogctl->info_lck);
3091                 if (WriteRqstPtr < xlogctl->LogwrtRqst.Write)
3092                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
3093                 LogwrtResult = xlogctl->LogwrtResult;
3094                 SpinLockRelease(&xlogctl->info_lck);
3095
3096                 /* done already? */
3097                 if (record <= LogwrtResult.Flush)
3098                         break;
3099
3100                 /*
3101                  * Before actually performing the write, wait for all in-flight
3102                  * insertions to the pages we're about to write to finish.
3103                  */
3104                 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
3105
3106                 /*
3107                  * Try to get the write lock. If we can't get it immediately, wait
3108                  * until it's released, and recheck if we still need to do the flush
3109                  * or if the backend that held the lock did it for us already. This
3110                  * helps to maintain a good rate of group committing when the system
3111                  * is bottlenecked by the speed of fsyncing.
3112                  */
3113                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
3114                 {
3115                         /*
3116                          * The lock is now free, but we didn't acquire it yet. Before we
3117                          * do, loop back to check if someone else flushed the record for
3118                          * us already.
3119                          */
3120                         continue;
3121                 }
3122
3123                 /* Got the lock; recheck whether request is satisfied */
3124                 LogwrtResult = XLogCtl->LogwrtResult;
3125                 if (record <= LogwrtResult.Flush)
3126                 {
3127                         LWLockRelease(WALWriteLock);
3128                         break;
3129                 }
3130
3131                 /*
3132                  * Sleep before flush! By adding a delay here, we may give further
3133                  * backends the opportunity to join the backlog of group commit
3134                  * followers; this can significantly improve transaction throughput,
3135                  * at the risk of increasing transaction latency.
3136                  *
3137                  * We do not sleep if enableFsync is not turned on, nor if there are
3138                  * fewer than CommitSiblings other backends with active transactions.
3139                  */
3140                 if (CommitDelay > 0 && enableFsync &&
3141                         MinimumActiveBackends(CommitSiblings))
3142                 {
3143                         pg_usleep(CommitDelay);
3144
3145                         /*
3146                          * Re-check how far we can now flush the WAL. It's generally not
3147                          * safe to call WaitXLogInsetionsToFinish while holding
3148                          * WALWriteLock, because an in-progress insertion might need to
3149                          * also grab WALWriteLock to make progress. But we know that all
3150                          * the insertions up to insertpos have already finished, because
3151                          * that's what the earlier WaitXLogInsertionsToFinish() returned.
3152                          * We're only calling it again to allow insertpos to be moved
3153                          * further forward, not to actually wait for anyone.
3154                          */
3155                         insertpos = WaitXLogInsertionsToFinish(insertpos);
3156                 }
3157
3158                 /* try to write/flush later additions to XLOG as well */
3159                 WriteRqst.Write = insertpos;
3160                 WriteRqst.Flush = insertpos;
3161
3162                 XLogWrite(WriteRqst, false);
3163
3164                 LWLockRelease(WALWriteLock);
3165                 /* done */
3166                 break;
3167         }
3168
3169         END_CRIT_SECTION();
3170
3171         /* wake up walsenders now that we've released heavily contended locks */
3172         WalSndWakeupProcessRequests();
3173
3174         /*
3175          * If we still haven't flushed to the request point then we have a
3176          * problem; most likely, the requested flush point is past end of XLOG.
3177          * This has been seen to occur when a disk page has a corrupted LSN.
3178          *
3179          * Formerly we treated this as a PANIC condition, but that hurts the
3180          * system's robustness rather than helping it: we do not want to take down
3181          * the whole system due to corruption on one data page.  In particular, if
3182          * the bad page is encountered again during recovery then we would be
3183          * unable to restart the database at all!  (This scenario actually
3184          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
3185          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
3186          * the only time we can reach here during recovery is while flushing the
3187          * end-of-recovery checkpoint record, and we don't expect that to have a
3188          * bad LSN.
3189          *
3190          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
3191          * since xact.c calls this routine inside a critical section.  However,
3192          * calls from bufmgr.c are not within critical sections and so we will not
3193          * force a restart for a bad LSN on a data page.
3194          */
3195         if (LogwrtResult.Flush < record)
3196                 elog(ERROR,
3197                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
3198                          (uint32) (record >> 32), (uint32) record,
3199                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3200 }
3201
3202 /*
3203  * Flush xlog, but without specifying exactly where to flush to.
3204  *
3205  * We normally flush only completed blocks; but if there is nothing to do on
3206  * that basis, we check for unflushed async commits in the current incomplete
3207  * block, and flush through the latest one of those.  Thus, if async commits
3208  * are not being used, we will flush complete blocks only.      We can guarantee
3209  * that async commits reach disk after at most three cycles; normally only
3210  * one or two.  (When flushing complete blocks, we allow XLogWrite to write
3211  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
3212  * difference only with very high load or long wal_writer_delay, but imposes
3213  * one extra cycle for the worst case for async commits.)
3214  *
3215  * This routine is invoked periodically by the background walwriter process.
3216  *
3217  * Returns TRUE if we flushed anything.
3218  */
3219 bool
3220 XLogBackgroundFlush(void)
3221 {
3222         XLogRecPtr      WriteRqstPtr;
3223         bool            flexible = true;
3224         bool            wrote_something = false;
3225
3226         /* XLOG doesn't need flushing during recovery */
3227         if (RecoveryInProgress())
3228                 return false;
3229
3230         /* read LogwrtResult and update local state */
3231         {
3232                 /* use volatile pointer to prevent code rearrangement */
3233                 volatile XLogCtlData *xlogctl = XLogCtl;
3234
3235                 SpinLockAcquire(&xlogctl->info_lck);
3236                 LogwrtResult = xlogctl->LogwrtResult;
3237                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
3238                 SpinLockRelease(&xlogctl->info_lck);
3239         }
3240
3241         /* back off to last completed page boundary */
3242         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
3243
3244         /* if we have already flushed that far, consider async commit records */
3245         if (WriteRqstPtr <= LogwrtResult.Flush)
3246         {
3247                 /* use volatile pointer to prevent code rearrangement */
3248                 volatile XLogCtlData *xlogctl = XLogCtl;
3249
3250                 SpinLockAcquire(&xlogctl->info_lck);
3251                 WriteRqstPtr = xlogctl->asyncXactLSN;
3252                 SpinLockRelease(&xlogctl->info_lck);
3253                 flexible = false;               /* ensure it all gets written */
3254         }
3255
3256         /*
3257          * If already known flushed, we're done. Just need to check if we are
3258          * holding an open file handle to a logfile that's no longer in use,
3259          * preventing the file from being deleted.
3260          */
3261         if (WriteRqstPtr <= LogwrtResult.Flush)
3262         {
3263                 if (openLogFile >= 0)
3264                 {
3265                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
3266                         {
3267                                 XLogFileClose();
3268                         }
3269                 }
3270                 return false;
3271         }
3272
3273 #ifdef WAL_DEBUG
3274         if (XLOG_DEBUG)
3275                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
3276                          (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
3277                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3278                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3279 #endif
3280
3281         START_CRIT_SECTION();
3282
3283         /* now wait for any in-progress insertions to finish and get write lock */
3284         WaitXLogInsertionsToFinish(WriteRqstPtr);
3285         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3286         LogwrtResult = XLogCtl->LogwrtResult;
3287         if (WriteRqstPtr > LogwrtResult.Flush)
3288         {
3289                 XLogwrtRqst WriteRqst;
3290
3291                 WriteRqst.Write = WriteRqstPtr;
3292                 WriteRqst.Flush = WriteRqstPtr;
3293                 XLogWrite(WriteRqst, flexible);
3294                 wrote_something = true;
3295         }
3296         LWLockRelease(WALWriteLock);
3297
3298         END_CRIT_SECTION();
3299
3300         /* wake up walsenders now that we've released heavily contended locks */
3301         WalSndWakeupProcessRequests();
3302
3303         /*
3304          * Great, done. To take some work off the critical path, try to initialize
3305          * as many of the no-longer-needed WAL buffers for future use as we can.
3306          */
3307         AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3308
3309         return wrote_something;
3310 }
3311
3312 /*
3313  * Test whether XLOG data has been flushed up to (at least) the given position.
3314  *
3315  * Returns true if a flush is still needed.  (It may be that someone else
3316  * is already in process of flushing that far, however.)
3317  */
3318 bool
3319 XLogNeedsFlush(XLogRecPtr record)
3320 {
3321         /*
3322          * During recovery, we don't flush WAL but update minRecoveryPoint
3323          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3324          * would need to be updated.
3325          */
3326         if (RecoveryInProgress())
3327         {
3328                 /* Quick exit if already known updated */
3329                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3330                         return false;
3331
3332                 /*
3333                  * Update local copy of minRecoveryPoint. But if the lock is busy,
3334                  * just return a conservative guess.
3335                  */
3336                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3337                         return true;
3338                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3339                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3340                 LWLockRelease(ControlFileLock);
3341
3342                 /*
3343                  * An invalid minRecoveryPoint means that we need to recover all the
3344                  * WAL, i.e., we're doing crash recovery.  We never modify the control
3345                  * file's value in that case, so we can short-circuit future checks
3346                  * here too.
3347                  */
3348                 if (minRecoveryPoint == 0)
3349                         updateMinRecoveryPoint = false;
3350
3351                 /* check again */
3352                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3353                         return false;
3354                 else
3355                         return true;
3356         }
3357
3358         /* Quick exit if already known flushed */
3359         if (record <= LogwrtResult.Flush)
3360                 return false;
3361
3362         /* read LogwrtResult and update local state */
3363         {
3364                 /* use volatile pointer to prevent code rearrangement */
3365                 volatile XLogCtlData *xlogctl = XLogCtl;
3366
3367                 SpinLockAcquire(&xlogctl->info_lck);
3368                 LogwrtResult = xlogctl->LogwrtResult;
3369                 SpinLockRelease(&xlogctl->info_lck);
3370         }
3371
3372         /* check again */
3373         if (record <= LogwrtResult.Flush)
3374                 return false;
3375
3376         return true;
3377 }
3378
3379 /*
3380  * Create a new XLOG file segment, or open a pre-existing one.
3381  *
3382  * log, seg: identify segment to be created/opened.
3383  *
3384  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
3385  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
3386  * file was used.
3387  *
3388  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3389  * place.  This should be TRUE except during bootstrap log creation.  The
3390  * caller must *not* hold the lock at call.
3391  *
3392  * Returns FD of opened file.
3393  *
3394  * Note: errors here are ERROR not PANIC because we might or might not be
3395  * inside a critical section (eg, during checkpoint there is no reason to
3396  * take down the system on failure).  They will promote to PANIC if we are
3397  * in a critical section.
3398  */
3399 int
3400 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3401 {
3402         char            path[MAXPGPATH];
3403         char            tmppath[MAXPGPATH];
3404         char       *zbuffer;
3405         XLogSegNo       installed_segno;
3406         int                     max_advance;
3407         int                     fd;
3408         int                     nbytes;
3409
3410         XLogFilePath(path, ThisTimeLineID, logsegno);
3411
3412         /*
3413          * Try to use existent file (checkpoint maker may have created it already)
3414          */
3415         if (*use_existent)
3416         {
3417                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3418                                                    S_IRUSR | S_IWUSR);
3419                 if (fd < 0)
3420                 {
3421                         if (errno != ENOENT)
3422                                 ereport(ERROR,
3423                                                 (errcode_for_file_access(),
3424                                                  errmsg("could not open file \"%s\": %m", path)));
3425                 }
3426                 else
3427                         return fd;
3428         }
3429
3430         /*
3431          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
3432          * another process is doing the same thing.  If so, we will end up
3433          * pre-creating an extra log segment.  That seems OK, and better than
3434          * holding the lock throughout this lengthy process.
3435          */
3436         elog(DEBUG2, "creating and filling new WAL file");
3437
3438         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3439
3440         unlink(tmppath);
3441
3442         /*
3443          * Allocate a buffer full of zeros. This is done before opening the file
3444          * so that we don't leak the file descriptor if palloc fails.
3445          *
3446          * Note: palloc zbuffer, instead of just using a local char array, to
3447          * ensure it is reasonably well-aligned; this may save a few cycles
3448          * transferring data to the kernel.
3449          */
3450         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
3451
3452         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3453         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3454                                            S_IRUSR | S_IWUSR);
3455         if (fd < 0)
3456                 ereport(ERROR,
3457                                 (errcode_for_file_access(),
3458                                  errmsg("could not create file \"%s\": %m", tmppath)));
3459
3460         /*
3461          * Zero-fill the file.  We have to do this the hard way to ensure that all
3462          * the file space has really been allocated --- on platforms that allow
3463          * "holes" in files, just seeking to the end doesn't allocate intermediate
3464          * space.  This way, we know that we have all the space and (after the
3465          * fsync below) that all the indirect blocks are down on disk.  Therefore,
3466          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
3467          * log file.
3468          */
3469         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
3470         {
3471                 errno = 0;
3472                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
3473                 {
3474                         int                     save_errno = errno;
3475
3476                         /*
3477                          * If we fail to make the file, delete it to release disk space
3478                          */
3479                         unlink(tmppath);
3480
3481                         close(fd);
3482
3483                         /* if write didn't set errno, assume problem is no disk space */
3484                         errno = save_errno ? save_errno : ENOSPC;
3485
3486                         ereport(ERROR,
3487                                         (errcode_for_file_access(),
3488                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3489                 }
3490         }
3491         pfree(zbuffer);
3492
3493         if (pg_fsync(fd) != 0)
3494         {
3495                 close(fd);
3496                 ereport(ERROR,
3497                                 (errcode_for_file_access(),
3498                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3499         }
3500
3501         if (close(fd))
3502                 ereport(ERROR,
3503                                 (errcode_for_file_access(),
3504                                  errmsg("could not close file \"%s\": %m", tmppath)));
3505
3506         /*
3507          * Now move the segment into place with its final name.
3508          *
3509          * If caller didn't want to use a pre-existing file, get rid of any
3510          * pre-existing file.  Otherwise, cope with possibility that someone else
3511          * has created the file while we were filling ours: if so, use ours to
3512          * pre-create a future log segment.
3513          */
3514         installed_segno = logsegno;
3515         max_advance = XLOGfileslop;
3516         if (!InstallXLogFileSegment(&installed_segno, tmppath,
3517                                                                 *use_existent, &max_advance,
3518                                                                 use_lock))
3519         {
3520                 /*
3521                  * No need for any more future segments, or InstallXLogFileSegment()
3522                  * failed to rename the file into place. If the rename failed, opening
3523                  * the file below will fail.
3524                  */
3525                 unlink(tmppath);
3526         }
3527
3528         /* Set flag to tell caller there was no existent file */
3529         *use_existent = false;
3530
3531         /* Now open original target segment (might not be file I just made) */
3532         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3533                                            S_IRUSR | S_IWUSR);
3534         if (fd < 0)
3535                 ereport(ERROR,
3536                                 (errcode_for_file_access(),
3537                                  errmsg("could not open file \"%s\": %m", path)));
3538
3539         elog(DEBUG2, "done creating and filling new WAL file");
3540
3541         return fd;
3542 }
3543
3544 /*
3545  * Create a new XLOG file segment by copying a pre-existing one.
3546  *
3547  * destsegno: identify segment to be created.
3548  *
3549  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
3550  *              a different timeline)
3551  *
3552  * Currently this is only used during recovery, and so there are no locking
3553  * considerations.      But we should be just as tense as XLogFileInit to avoid
3554  * emplacing a bogus file.
3555  */
3556 static void
3557 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
3558 {
3559         char            path[MAXPGPATH];
3560         char            tmppath[MAXPGPATH];
3561         char            buffer[XLOG_BLCKSZ];
3562         int                     srcfd;
3563         int                     fd;
3564         int                     nbytes;
3565
3566         /*
3567          * Open the source file
3568          */
3569         XLogFilePath(path, srcTLI, srcsegno);
3570         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
3571         if (srcfd < 0)
3572                 ereport(ERROR,
3573                                 (errcode_for_file_access(),
3574                                  errmsg("could not open file \"%s\": %m", path)));
3575
3576         /*
3577          * Copy into a temp file name.
3578          */
3579         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3580
3581         unlink(tmppath);
3582
3583         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3584         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3585                                                    S_IRUSR | S_IWUSR);
3586         if (fd < 0)
3587                 ereport(ERROR,
3588                                 (errcode_for_file_access(),
3589                                  errmsg("could not create file \"%s\": %m", tmppath)));
3590
3591         /*
3592          * Do the data copying.
3593          */
3594         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
3595         {
3596                 errno = 0;
3597                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3598                 {
3599                         if (errno != 0)
3600                                 ereport(ERROR,
3601                                                 (errcode_for_file_access(),
3602                                                  errmsg("could not read file \"%s\": %m", path)));
3603                         else
3604                                 ereport(ERROR,
3605                                                 (errmsg("not enough data in file \"%s\"", path)));
3606                 }
3607                 errno = 0;
3608                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3609                 {
3610                         int                     save_errno = errno;
3611
3612                         /*
3613                          * If we fail to make the file, delete it to release disk space
3614                          */
3615                         unlink(tmppath);
3616                         /* if write didn't set errno, assume problem is no disk space */
3617                         errno = save_errno ? save_errno : ENOSPC;
3618
3619                         ereport(ERROR,
3620                                         (errcode_for_file_access(),
3621                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3622                 }
3623         }
3624
3625         if (pg_fsync(fd) != 0)
3626                 ereport(ERROR,
3627                                 (errcode_for_file_access(),
3628                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3629
3630         if (CloseTransientFile(fd))
3631                 ereport(ERROR,
3632                                 (errcode_for_file_access(),
3633                                  errmsg("could not close file \"%s\": %m", tmppath)));
3634
3635         CloseTransientFile(srcfd);
3636
3637         /*
3638          * Now move the segment into place with its final name.
3639          */
3640         if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
3641                 elog(ERROR, "InstallXLogFileSegment should not have failed");
3642 }
3643
3644 /*
3645  * Install a new XLOG segment file as a current or future log segment.
3646  *
3647  * This is used both to install a newly-created segment (which has a temp
3648  * filename while it's being created) and to recycle an old segment.
3649  *
3650  * *segno: identify segment to install as (or first possible target).
3651  * When find_free is TRUE, this is modified on return to indicate the
3652  * actual installation location or last segment searched.
3653  *
3654  * tmppath: initial name of file to install.  It will be renamed into place.
3655  *
3656  * find_free: if TRUE, install the new segment at the first empty segno
3657  * number at or after the passed numbers.  If FALSE, install the new segment
3658  * exactly where specified, deleting any existing segment file there.
3659  *
3660  * *max_advance: maximum number of segno slots to advance past the starting
3661  * point.  Fail if no free slot is found in this range.  On return, reduced
3662  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
3663  * when find_free is FALSE.)
3664  *
3665  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3666  * place.  This should be TRUE except during bootstrap log creation.  The
3667  * caller must *not* hold the lock at call.
3668  *
3669  * Returns TRUE if the file was installed successfully.  FALSE indicates that
3670  * max_advance limit was exceeded, or an error occurred while renaming the
3671  * file into place.
3672  */
3673 static bool
3674 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3675                                            bool find_free, int *max_advance,
3676                                            bool use_lock)
3677 {
3678         char            path[MAXPGPATH];
3679         struct stat stat_buf;
3680
3681         XLogFilePath(path, ThisTimeLineID, *segno);
3682
3683         /*
3684          * We want to be sure that only one process does this at a time.
3685          */
3686         if (use_lock)
3687                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3688
3689         if (!find_free)
3690         {
3691                 /* Force installation: get rid of any pre-existing segment file */
3692                 unlink(path);
3693         }
3694         else
3695         {
3696                 /* Find a free slot to put it in */
3697                 while (stat(path, &stat_buf) == 0)
3698                 {
3699                         if (*max_advance <= 0)
3700                         {
3701                                 /* Failed to find a free slot within specified range */
3702                                 if (use_lock)
3703                                         LWLockRelease(ControlFileLock);
3704                                 return false;
3705                         }
3706                         (*segno)++;
3707                         (*max_advance)--;
3708                         XLogFilePath(path, ThisTimeLineID, *segno);
3709                 }
3710         }
3711
3712         /*
3713          * Prefer link() to rename() here just to be really sure that we don't
3714          * overwrite an existing logfile.  However, there shouldn't be one, so
3715          * rename() is an acceptable substitute except for the truly paranoid.
3716          */
3717 #if HAVE_WORKING_LINK
3718         if (link(tmppath, path) < 0)
3719         {
3720                 if (use_lock)
3721                         LWLockRelease(ControlFileLock);
3722                 ereport(LOG,
3723                                 (errcode_for_file_access(),
3724                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
3725                                                 tmppath, path)));
3726                 return false;
3727         }
3728         unlink(tmppath);
3729 #else
3730         if (rename(tmppath, path) < 0)
3731         {
3732                 if (use_lock)
3733                         LWLockRelease(ControlFileLock);
3734                 ereport(LOG,
3735                                 (errcode_for_file_access(),
3736                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
3737                                                 tmppath, path)));
3738                 return false;
3739         }
3740 #endif
3741
3742         if (use_lock)
3743                 LWLockRelease(ControlFileLock);
3744
3745         return true;
3746 }
3747
3748 /*
3749  * Open a pre-existing logfile segment for writing.
3750  */
3751 int
3752 XLogFileOpen(XLogSegNo segno)
3753 {
3754         char            path[MAXPGPATH];
3755         int                     fd;
3756
3757         XLogFilePath(path, ThisTimeLineID, segno);
3758
3759         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3760                                            S_IRUSR | S_IWUSR);
3761         if (fd < 0)
3762                 ereport(PANIC,
3763                                 (errcode_for_file_access(),
3764                                  errmsg("could not open transaction log file \"%s\": %m", path)));
3765
3766         return fd;
3767 }
3768
3769 /*
3770  * Open a logfile segment for reading (during recovery).
3771  *
3772  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3773  * Otherwise, it's assumed to be already available in pg_xlog.
3774  */
3775 static int
3776 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3777                          int source, bool notfoundOk)
3778 {
3779         char            xlogfname[MAXFNAMELEN];
3780         char            activitymsg[MAXFNAMELEN + 16];
3781         char            path[MAXPGPATH];
3782         int                     fd;
3783
3784         XLogFileName(xlogfname, tli, segno);
3785
3786         switch (source)
3787         {
3788                 case XLOG_FROM_ARCHIVE:
3789                         /* Report recovery progress in PS display */
3790                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3791                                          xlogfname);
3792                         set_ps_display(activitymsg, false);
3793
3794                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3795                                                                                                           "RECOVERYXLOG",
3796                                                                                                           XLogSegSize,
3797                                                                                                           InRedo);
3798                         if (!restoredFromArchive)
3799                                 return -1;
3800                         break;
3801
3802                 case XLOG_FROM_PG_XLOG:
3803                 case XLOG_FROM_STREAM:
3804                         XLogFilePath(path, tli, segno);
3805                         restoredFromArchive = false;
3806                         break;
3807
3808                 default:
3809                         elog(ERROR, "invalid XLogFileRead source %d", source);
3810         }
3811
3812         /*
3813          * If the segment was fetched from archival storage, replace the existing
3814          * xlog segment (if any) with the archival version.
3815          */
3816         if (source == XLOG_FROM_ARCHIVE)
3817         {
3818                 KeepFileRestoredFromArchive(path, xlogfname);
3819
3820                 /*
3821                  * Set path to point at the new file in pg_xlog.
3822                  */
3823                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3824         }
3825
3826         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
3827         if (fd >= 0)
3828         {
3829                 /* Success! */
3830                 curFileTLI = tli;
3831
3832                 /* Report recovery progress in PS display */
3833                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3834                                  xlogfname);
3835                 set_ps_display(activitymsg, false);
3836
3837                 /* Track source of data in assorted state variables */
3838                 readSource = source;
3839                 XLogReceiptSource = source;
3840                 /* In FROM_STREAM case, caller tracks receipt time, not me */
3841                 if (source != XLOG_FROM_STREAM)
3842                         XLogReceiptTime = GetCurrentTimestamp();
3843
3844                 return fd;
3845         }
3846         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3847                 ereport(PANIC,
3848                                 (errcode_for_file_access(),
3849                                  errmsg("could not open file \"%s\": %m", path)));
3850         return -1;
3851 }
3852
3853 /*
3854  * Open a logfile segment for reading (during recovery).
3855  *
3856  * This version searches for the segment with any TLI listed in expectedTLEs.
3857  */
3858 static int
3859 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3860 {
3861         char            path[MAXPGPATH];
3862         ListCell   *cell;
3863         int                     fd;
3864         List       *tles;
3865
3866         /*
3867          * Loop looking for a suitable timeline ID: we might need to read any of
3868          * the timelines listed in expectedTLEs.
3869          *
3870          * We expect curFileTLI on entry to be the TLI of the preceding file in
3871          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
3872          * to go backwards; this prevents us from picking up the wrong file when a
3873          * parent timeline extends to higher segment numbers than the child we
3874          * want to read.
3875          *
3876          * If we haven't read the timeline history file yet, read it now, so that
3877          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
3878          * however, unless we actually find a valid segment.  That way if there is
3879          * neither a timeline history file nor a WAL segment in the archive, and
3880          * streaming replication is set up, we'll read the timeline history file
3881          * streamed from the master when we start streaming, instead of recovering
3882          * with a dummy history generated here.
3883          */
3884         if (expectedTLEs)
3885                 tles = expectedTLEs;
3886         else
3887                 tles = readTimeLineHistory(recoveryTargetTLI);
3888
3889         foreach(cell, tles)
3890         {
3891                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3892
3893                 if (tli < curFileTLI)
3894                         break;                          /* don't bother looking at too-old TLIs */
3895
3896                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3897                 {
3898                         fd = XLogFileRead(segno, emode, tli,
3899                                                           XLOG_FROM_ARCHIVE, true);
3900                         if (fd != -1)
3901                         {
3902                                 elog(DEBUG1, "got WAL segment from archive");
3903                                 if (!expectedTLEs)
3904                                         expectedTLEs = tles;
3905                                 return fd;
3906                         }
3907                 }
3908
3909                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
3910                 {
3911                         fd = XLogFileRead(segno, emode, tli,
3912                                                           XLOG_FROM_PG_XLOG, true);
3913                         if (fd != -1)
3914                         {
3915                                 if (!expectedTLEs)
3916                                         expectedTLEs = tles;
3917                                 return fd;
3918                         }
3919                 }
3920         }
3921
3922         /* Couldn't find it.  For simplicity, complain about front timeline */
3923         XLogFilePath(path, recoveryTargetTLI, segno);
3924         errno = ENOENT;
3925         ereport(emode,
3926                         (errcode_for_file_access(),
3927                          errmsg("could not open file \"%s\": %m", path)));
3928         return -1;
3929 }
3930
3931 /*
3932  * Close the current logfile segment for writing.
3933  */
3934 static void
3935 XLogFileClose(void)
3936 {
3937         Assert(openLogFile >= 0);
3938
3939         /*
3940          * WAL segment files will not be re-read in normal operation, so we advise
3941          * the OS to release any cached pages.  But do not do so if WAL archiving
3942          * or streaming is active, because archiver and walsender process could
3943          * use the cache to read the WAL segment.
3944          */
3945 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3946         if (!XLogIsNeeded())
3947                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3948 #endif
3949
3950         if (close(openLogFile))
3951                 ereport(PANIC,
3952                                 (errcode_for_file_access(),
3953                                  errmsg("could not close log file %s: %m",
3954                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3955         openLogFile = -1;
3956 }
3957
3958 /*
3959  * Preallocate log files beyond the specified log endpoint.
3960  *
3961  * XXX this is currently extremely conservative, since it forces only one
3962  * future log segment to exist, and even that only if we are 75% done with
3963  * the current one.  This is only appropriate for very low-WAL-volume systems.
3964  * High-volume systems will be OK once they've built up a sufficient set of
3965  * recycled log segments, but the startup transient is likely to include
3966  * a lot of segment creations by foreground processes, which is not so good.
3967  */
3968 static void
3969 PreallocXlogFiles(XLogRecPtr endptr)
3970 {
3971         XLogSegNo       _logSegNo;
3972         int                     lf;
3973         bool            use_existent;
3974
3975         XLByteToPrevSeg(endptr, _logSegNo);
3976         if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
3977         {
3978                 _logSegNo++;
3979                 use_existent = true;
3980                 lf = XLogFileInit(_logSegNo, &use_existent, true);
3981                 close(lf);
3982                 if (!use_existent)
3983                         CheckpointStats.ckpt_segs_added++;
3984         }
3985 }
3986
3987 /*
3988  * Throws an error if the given log segment has already been removed or
3989  * recycled. The caller should only pass a segment that it knows to have
3990  * existed while the server has been running, as this function always
3991  * succeeds if no WAL segments have been removed since startup.
3992  * 'tli' is only used in the error message.
3993  */
3994 void
3995 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3996 {
3997         /* use volatile pointer to prevent code rearrangement */
3998         volatile XLogCtlData *xlogctl = XLogCtl;
3999         XLogSegNo       lastRemovedSegNo;
4000
4001         SpinLockAcquire(&xlogctl->info_lck);
4002         lastRemovedSegNo = xlogctl->lastRemovedSegNo;
4003         SpinLockRelease(&xlogctl->info_lck);
4004
4005         if (segno <= lastRemovedSegNo)
4006         {
4007                 char            filename[MAXFNAMELEN];
4008
4009                 XLogFileName(filename, tli, segno);
4010                 ereport(ERROR,
4011                                 (errcode_for_file_access(),
4012                                  errmsg("requested WAL segment %s has already been removed",
4013                                                 filename)));
4014         }
4015 }
4016
4017 /*
4018  * Update the last removed segno pointer in shared memory, to reflect
4019  * that the given XLOG file has been removed.
4020  */
4021 static void
4022 UpdateLastRemovedPtr(char *filename)
4023 {
4024         /* use volatile pointer to prevent code rearrangement */
4025         volatile XLogCtlData *xlogctl = XLogCtl;
4026         uint32          tli;
4027         XLogSegNo       segno;
4028
4029         XLogFromFileName(filename, &tli, &segno);
4030
4031         SpinLockAcquire(&xlogctl->info_lck);
4032         if (segno > xlogctl->lastRemovedSegNo)
4033                 xlogctl->lastRemovedSegNo = segno;
4034         SpinLockRelease(&xlogctl->info_lck);
4035 }
4036
4037 /*
4038  * Recycle or remove all log files older or equal to passed segno
4039  *
4040  * endptr is current (or recent) end of xlog; this is used to determine
4041  * whether we want to recycle rather than delete no-longer-wanted log files.
4042  */
4043 static void
4044 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
4045 {
4046         XLogSegNo       endlogSegNo;
4047         int                     max_advance;
4048         DIR                *xldir;
4049         struct dirent *xlde;
4050         char            lastoff[MAXFNAMELEN];
4051         char            path[MAXPGPATH];
4052
4053 #ifdef WIN32
4054         char            newpath[MAXPGPATH];
4055 #endif
4056         struct stat statbuf;
4057
4058         /*
4059          * Initialize info about where to try to recycle to.  We allow recycling
4060          * segments up to XLOGfileslop segments beyond the current XLOG location.
4061          */
4062         XLByteToPrevSeg(endptr, endlogSegNo);
4063         max_advance = XLOGfileslop;
4064
4065         xldir = AllocateDir(XLOGDIR);
4066         if (xldir == NULL)
4067                 ereport(ERROR,
4068                                 (errcode_for_file_access(),
4069                                  errmsg("could not open transaction log directory \"%s\": %m",
4070                                                 XLOGDIR)));
4071
4072         /*
4073          * Construct a filename of the last segment to be kept. The timeline ID
4074          * doesn't matter, we ignore that in the comparison. (During recovery,
4075          * ThisTimeLineID isn't set, so we can't use that.)
4076          */
4077         XLogFileName(lastoff, 0, segno);
4078
4079         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
4080                  lastoff);
4081
4082         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4083         {
4084                 /*
4085                  * We ignore the timeline part of the XLOG segment identifiers in
4086                  * deciding whether a segment is still needed.  This ensures that we
4087                  * won't prematurely remove a segment from a parent timeline. We could
4088                  * probably be a little more proactive about removing segments of
4089                  * non-parent timelines, but that would be a whole lot more
4090                  * complicated.
4091                  *
4092                  * We use the alphanumeric sorting property of the filenames to decide
4093                  * which ones are earlier than the lastoff segment.
4094                  */
4095                 if (strlen(xlde->d_name) == 24 &&
4096                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
4097                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
4098                 {
4099                         if (XLogArchiveCheckDone(xlde->d_name))
4100                         {
4101                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4102
4103                                 /* Update the last removed location in shared memory first */
4104                                 UpdateLastRemovedPtr(xlde->d_name);
4105
4106                                 /*
4107                                  * Before deleting the file, see if it can be recycled as a
4108                                  * future log segment. Only recycle normal files, pg_standby
4109                                  * for example can create symbolic links pointing to a
4110                                  * separate archive directory.
4111                                  */
4112                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
4113                                         InstallXLogFileSegment(&endlogSegNo, path,
4114                                                                                    true, &max_advance, true))
4115                                 {
4116                                         ereport(DEBUG2,
4117                                                         (errmsg("recycled transaction log file \"%s\"",
4118                                                                         xlde->d_name)));
4119                                         CheckpointStats.ckpt_segs_recycled++;
4120                                         /* Needn't recheck that slot on future iterations */
4121                                         if (max_advance > 0)
4122                                         {
4123                                                 endlogSegNo++;
4124                                                 max_advance--;
4125                                         }
4126                                 }
4127                                 else
4128                                 {
4129                                         /* No need for any more future segments... */
4130                                         int                     rc;
4131
4132                                         ereport(DEBUG2,
4133                                                         (errmsg("removing transaction log file \"%s\"",
4134                                                                         xlde->d_name)));
4135
4136 #ifdef WIN32
4137
4138                                         /*
4139                                          * On Windows, if another process (e.g another backend)
4140                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
4141                                          * will succeed, but the file will still show up in
4142                                          * directory listing until the last handle is closed. To
4143                                          * avoid confusing the lingering deleted file for a live
4144                                          * WAL file that needs to be archived, rename it before
4145                                          * deleting it.
4146                                          *
4147                                          * If another process holds the file open without
4148                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
4149                                          * again at the next checkpoint.
4150                                          */
4151                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4152                                         if (rename(path, newpath) != 0)
4153                                         {
4154                                                 ereport(LOG,
4155                                                                 (errcode_for_file_access(),
4156                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
4157                                                                                 path)));
4158                                                 continue;
4159                                         }
4160                                         rc = unlink(newpath);
4161 #else
4162                                         rc = unlink(path);
4163 #endif
4164                                         if (rc != 0)
4165                                         {
4166                                                 ereport(LOG,
4167                                                                 (errcode_for_file_access(),
4168                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
4169                                                                                 path)));
4170                                                 continue;
4171                                         }
4172                                         CheckpointStats.ckpt_segs_removed++;
4173                                 }
4174
4175                                 XLogArchiveCleanup(xlde->d_name);
4176                         }
4177                 }
4178         }
4179
4180         FreeDir(xldir);
4181 }
4182
4183 /*
4184  * Verify whether pg_xlog and pg_xlog/archive_status exist.
4185  * If the latter does not exist, recreate it.
4186  *
4187  * It is not the goal of this function to verify the contents of these
4188  * directories, but to help in cases where someone has performed a cluster
4189  * copy for PITR purposes but omitted pg_xlog from the copy.
4190  *
4191  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
4192  * policy decision was made not to.  It is fairly common for pg_xlog to be
4193  * a symlink, and if that was the DBA's intent then automatically making a
4194  * plain directory would result in degraded performance with no notice.
4195  */
4196 static void
4197 ValidateXLOGDirectoryStructure(void)
4198 {
4199         char            path[MAXPGPATH];
4200         struct stat stat_buf;
4201
4202         /* Check for pg_xlog; if it doesn't exist, error out */
4203         if (stat(XLOGDIR, &stat_buf) != 0 ||
4204                 !S_ISDIR(stat_buf.st_mode))
4205                 ereport(FATAL,
4206                                 (errmsg("required WAL directory \"%s\" does not exist",
4207                                                 XLOGDIR)));
4208
4209         /* Check for archive_status */
4210         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4211         if (stat(path, &stat_buf) == 0)
4212         {
4213                 /* Check for weird cases where it exists but isn't a directory */
4214                 if (!S_ISDIR(stat_buf.st_mode))
4215                         ereport(FATAL,
4216                                         (errmsg("required WAL directory \"%s\" does not exist",
4217                                                         path)));
4218         }
4219         else
4220         {
4221                 ereport(LOG,
4222                                 (errmsg("creating missing WAL directory \"%s\"", path)));
4223                 if (mkdir(path, S_IRWXU) < 0)
4224                         ereport(FATAL,
4225                                         (errmsg("could not create missing directory \"%s\": %m",
4226                                                         path)));
4227         }
4228 }
4229
4230 /*
4231  * Remove previous backup history files.  This also retries creation of
4232  * .ready files for any backup history files for which XLogArchiveNotify
4233  * failed earlier.
4234  */
4235 static void
4236 CleanupBackupHistory(void)
4237 {
4238         DIR                *xldir;
4239         struct dirent *xlde;
4240         char            path[MAXPGPATH];
4241
4242         xldir = AllocateDir(XLOGDIR);
4243         if (xldir == NULL)
4244                 ereport(ERROR,
4245                                 (errcode_for_file_access(),
4246                                  errmsg("could not open transaction log directory \"%s\": %m",
4247                                                 XLOGDIR)));
4248
4249         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4250         {
4251                 if (strlen(xlde->d_name) > 24 &&
4252                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
4253                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
4254                                    ".backup") == 0)
4255                 {
4256                         if (XLogArchiveCheckDone(xlde->d_name))
4257                         {
4258                                 ereport(DEBUG2,
4259                                 (errmsg("removing transaction log backup history file \"%s\"",
4260                                                 xlde->d_name)));
4261                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4262                                 unlink(path);
4263                                 XLogArchiveCleanup(xlde->d_name);
4264                         }
4265                 }
4266         }
4267
4268         FreeDir(xldir);
4269 }
4270
4271 /*
4272  * Restore a full-page image from a backup block attached to an XLOG record.
4273  *
4274  * lsn: LSN of the XLOG record being replayed
4275  * record: the complete XLOG record
4276  * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1)
4277  * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock
4278  * keep_buffer: TRUE to return the buffer still locked and pinned
4279  *
4280  * Returns the buffer number containing the page.  Note this is not terribly
4281  * useful unless keep_buffer is specified as TRUE.
4282  *
4283  * Note: when a backup block is available in XLOG, we restore it
4284  * unconditionally, even if the page in the database appears newer.
4285  * This is to protect ourselves against database pages that were partially
4286  * or incorrectly written during a crash.  We assume that the XLOG data
4287  * must be good because it has passed a CRC check, while the database
4288  * page might not be.  This will force us to replay all subsequent
4289  * modifications of the page that appear in XLOG, rather than possibly
4290  * ignoring them as already applied, but that's not a huge drawback.
4291  *
4292  * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer,
4293  * else a normal exclusive lock is used.  During crash recovery, that's just
4294  * pro forma because there can't be any regular backends in the system, but
4295  * in hot standby mode the distinction is important.
4296  *
4297  * If 'keep_buffer' is true, return without releasing the buffer lock and pin;
4298  * then caller is responsible for doing UnlockReleaseBuffer() later.  This
4299  * is needed in some cases when replaying XLOG records that touch multiple
4300  * pages, to prevent inconsistent states from being visible to other backends.
4301  * (Again, that's only important in hot standby mode.)
4302  */
4303 Buffer
4304 RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
4305                                    bool get_cleanup_lock, bool keep_buffer)
4306 {
4307         BkpBlock        bkpb;
4308         char       *blk;
4309         int                     i;
4310
4311         /* Locate requested BkpBlock in the record */
4312         blk = (char *) XLogRecGetData(record) + record->xl_len;
4313         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
4314         {
4315                 if (!(record->xl_info & XLR_BKP_BLOCK(i)))
4316                         continue;
4317
4318                 memcpy(&bkpb, blk, sizeof(BkpBlock));
4319                 blk += sizeof(BkpBlock);
4320
4321                 if (i == block_index)
4322                 {
4323                         /* Found it, apply the update */
4324                         return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock,
4325                                                                                           keep_buffer);
4326                 }
4327
4328                 blk += BLCKSZ - bkpb.hole_length;
4329         }
4330
4331         /* Caller specified a bogus block_index */
4332         elog(ERROR, "failed to restore block_index %d", block_index);
4333         return InvalidBuffer;           /* keep compiler quiet */
4334 }
4335
4336 /*
4337  * Workhorse for RestoreBackupBlock usable without an xlog record
4338  *
4339  * Restores a full-page image from BkpBlock and a data pointer.
4340  */
4341 static Buffer
4342 RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
4343                                                    bool get_cleanup_lock, bool keep_buffer)
4344 {
4345         Buffer          buffer;
4346         Page            page;
4347
4348         buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
4349                                                                         RBM_ZERO);
4350         Assert(BufferIsValid(buffer));
4351         if (get_cleanup_lock)
4352                 LockBufferForCleanup(buffer);
4353         else
4354                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4355
4356         page = (Page) BufferGetPage(buffer);
4357
4358         if (bkpb.hole_length == 0)
4359         {
4360                 memcpy((char *) page, blk, BLCKSZ);
4361         }
4362         else
4363         {
4364                 memcpy((char *) page, blk, bkpb.hole_offset);
4365                 /* must zero-fill the hole */
4366                 MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
4367                 memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
4368                            blk + bkpb.hole_offset,
4369                            BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
4370         }
4371
4372         /*
4373          * The checksum value on this page is currently invalid. We don't need to
4374          * reset it here since it will be set before being written.
4375          */
4376
4377         PageSetLSN(page, lsn);
4378         MarkBufferDirty(buffer);
4379
4380         if (!keep_buffer)
4381                 UnlockReleaseBuffer(buffer);
4382
4383         return buffer;
4384 }
4385
4386 /*
4387  * Attempt to read an XLOG record.
4388  *
4389  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
4390  * try to read a record just after the last one previously read.
4391  *
4392  * If no valid record is available, returns NULL, or fails if emode is PANIC.
4393  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4394  * record is available.
4395  *
4396  * The record is copied into readRecordBuf, so that on successful return,
4397  * the returned record pointer always points there.
4398  */
4399 static XLogRecord *
4400 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
4401                    bool fetching_ckpt)
4402 {
4403         XLogRecord *record;
4404         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4405
4406         /* Pass through parameters to XLogPageRead */
4407         private->fetching_ckpt = fetching_ckpt;
4408         private->emode = emode;
4409         private->randAccess = (RecPtr != InvalidXLogRecPtr);
4410
4411         /* This is the first attempt to read this page. */
4412         lastSourceFailed = false;
4413
4414         for (;;)
4415         {
4416                 char       *errormsg;
4417
4418                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
4419                 ReadRecPtr = xlogreader->ReadRecPtr;
4420                 EndRecPtr = xlogreader->EndRecPtr;
4421                 if (record == NULL)
4422                 {
4423                         if (readFile >= 0)
4424                         {
4425                                 close(readFile);
4426                                 readFile = -1;
4427                         }
4428
4429                         /*
4430                          * We only end up here without a message when XLogPageRead()
4431                          * failed - in that case we already logged something. In
4432                          * StandbyMode that only happens if we have been triggered, so we
4433                          * shouldn't loop anymore in that case.
4434                          */
4435                         if (errormsg)
4436                                 ereport(emode_for_corrupt_record(emode,
4437                                                                                                  RecPtr ? RecPtr : EndRecPtr),
4438                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
4439                 }
4440
4441                 /*
4442                  * Check page TLI is one of the expected values.
4443                  */
4444                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4445                 {
4446                         char            fname[MAXFNAMELEN];
4447                         XLogSegNo       segno;
4448                         int32           offset;
4449
4450                         XLByteToSeg(xlogreader->latestPagePtr, segno);
4451                         offset = xlogreader->latestPagePtr % XLogSegSize;
4452                         XLogFileName(fname, xlogreader->readPageTLI, segno);
4453                         ereport(emode_for_corrupt_record(emode,
4454                                                                                          RecPtr ? RecPtr : EndRecPtr),
4455                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4456                                         xlogreader->latestPageTLI,
4457                                         fname,
4458                                         offset)));
4459                         record = NULL;
4460                 }
4461
4462                 if (record)
4463                 {
4464                         /* Great, got a record */
4465                         return record;
4466                 }
4467                 else
4468                 {
4469                         /* No valid record available from this source */
4470                         lastSourceFailed = true;
4471
4472                         /*
4473                          * If archive recovery was requested, but we were still doing
4474                          * crash recovery, switch to archive recovery and retry using the
4475                          * offline archive. We have now replayed all the valid WAL in
4476                          * pg_xlog, so we are presumably now consistent.
4477                          *
4478                          * We require that there's at least some valid WAL present in
4479                          * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
4480                          * from the archive, even if pg_xlog is completely empty, but we'd
4481                          * have no idea how far we'd have to replay to reach consistency.
4482                          * So err on the safe side and give up.
4483                          */
4484                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4485                                 !fetching_ckpt)
4486                         {
4487                                 ereport(DEBUG1,
4488                                                 (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
4489                                 InArchiveRecovery = true;
4490                                 if (StandbyModeRequested)
4491                                         StandbyMode = true;
4492
4493                                 /* initialize minRecoveryPoint to this record */
4494                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4495                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4496                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
4497                                 {
4498                                         ControlFile->minRecoveryPoint = EndRecPtr;
4499                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4500                                 }
4501                                 /* update local copy */
4502                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
4503                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4504
4505                                 UpdateControlFile();
4506                                 LWLockRelease(ControlFileLock);
4507
4508                                 CheckRecoveryConsistency();
4509
4510                                 /*
4511                                  * Before we retry, reset lastSourceFailed and currentSource
4512                                  * so that we will check the archive next.
4513                                  */
4514                                 lastSourceFailed = false;
4515                                 currentSource = 0;
4516
4517                                 continue;
4518                         }
4519
4520                         /* In standby mode, loop back to retry. Otherwise, give up. */
4521                         if (StandbyMode && !CheckForStandbyTrigger())
4522                                 continue;
4523                         else
4524                                 return NULL;
4525                 }
4526         }
4527 }
4528
4529 /*
4530  * Scan for new timelines that might have appeared in the archive since we
4531  * started recovery.
4532  *
4533  * If there are any, the function changes recovery target TLI to the latest
4534  * one and returns 'true'.
4535  */
4536 static bool
4537 rescanLatestTimeLine(void)
4538 {
4539         List       *newExpectedTLEs;
4540         bool            found;
4541         ListCell   *cell;
4542         TimeLineID      newtarget;
4543         TimeLineID      oldtarget = recoveryTargetTLI;
4544         TimeLineHistoryEntry *currentTle = NULL;
4545
4546         newtarget = findNewestTimeLine(recoveryTargetTLI);
4547         if (newtarget == recoveryTargetTLI)
4548         {
4549                 /* No new timelines found */
4550                 return false;
4551         }
4552
4553         /*
4554          * Determine the list of expected TLIs for the new TLI
4555          */
4556
4557         newExpectedTLEs = readTimeLineHistory(newtarget);
4558
4559         /*
4560          * If the current timeline is not part of the history of the new timeline,
4561          * we cannot proceed to it.
4562          */
4563         found = false;
4564         foreach(cell, newExpectedTLEs)
4565         {
4566                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4567
4568                 if (currentTle->tli == recoveryTargetTLI)
4569                 {
4570                         found = true;
4571                         break;
4572                 }
4573         }
4574         if (!found)
4575         {
4576                 ereport(LOG,
4577                                 (errmsg("new timeline %u is not a child of database system timeline %u",
4578                                                 newtarget,
4579                                                 ThisTimeLineID)));
4580                 return false;
4581         }
4582
4583         /*
4584          * The current timeline was found in the history file, but check that the
4585          * next timeline was forked off from it *after* the current recovery
4586          * location.
4587          */
4588         if (currentTle->end < EndRecPtr)
4589         {
4590                 ereport(LOG,
4591                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4592                                                 newtarget,
4593                                                 ThisTimeLineID,
4594                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4595                 return false;
4596         }
4597
4598         /* The new timeline history seems valid. Switch target */
4599         recoveryTargetTLI = newtarget;
4600         list_free_deep(expectedTLEs);
4601         expectedTLEs = newExpectedTLEs;
4602
4603         /*
4604          * As in StartupXLOG(), try to ensure we have all the history files
4605          * between the old target and new target in pg_xlog.
4606          */
4607         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4608
4609         ereport(LOG,
4610                         (errmsg("new target timeline is %u",
4611                                         recoveryTargetTLI)));
4612
4613         return true;
4614 }
4615
4616 /*
4617  * I/O routines for pg_control
4618  *
4619  * *ControlFile is a buffer in shared memory that holds an image of the
4620  * contents of pg_control.      WriteControlFile() initializes pg_control
4621  * given a preloaded buffer, ReadControlFile() loads the buffer from
4622  * the pg_control file (during postmaster or standalone-backend startup),
4623  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4624  *
4625  * For simplicity, WriteControlFile() initializes the fields of pg_control
4626  * that are related to checking backend/database compatibility, and
4627  * ReadControlFile() verifies they are correct.  We could split out the
4628  * I/O and compatibility-check functions, but there seems no need currently.
4629  */
4630 static void
4631 WriteControlFile(void)
4632 {
4633         int                     fd;
4634         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4635
4636         /*
4637          * Initialize version and compatibility-check fields
4638          */
4639         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4640         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4641
4642         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4643         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4644
4645         ControlFile->blcksz = BLCKSZ;
4646         ControlFile->relseg_size = RELSEG_SIZE;
4647         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4648         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4649
4650         ControlFile->nameDataLen = NAMEDATALEN;
4651         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4652
4653         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4654
4655 #ifdef HAVE_INT64_TIMESTAMP
4656         ControlFile->enableIntTimes = true;
4657 #else
4658         ControlFile->enableIntTimes = false;
4659 #endif
4660         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4661         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4662
4663         /* Contents are protected with a CRC */
4664         INIT_CRC32(ControlFile->crc);
4665         COMP_CRC32(ControlFile->crc,
4666                            (char *) ControlFile,
4667                            offsetof(ControlFileData, crc));
4668         FIN_CRC32(ControlFile->crc);
4669
4670         /*
4671          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4672          * excess over sizeof(ControlFileData).  This reduces the odds of
4673          * premature-EOF errors when reading pg_control.  We'll still fail when we
4674          * check the contents of the file, but hopefully with a more specific
4675          * error than "couldn't read pg_control".
4676          */
4677         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4678                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4679
4680         memset(buffer, 0, PG_CONTROL_SIZE);
4681         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4682
4683         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4684                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4685                                            S_IRUSR | S_IWUSR);
4686         if (fd < 0)
4687                 ereport(PANIC,
4688                                 (errcode_for_file_access(),
4689                                  errmsg("could not create control file \"%s\": %m",
4690                                                 XLOG_CONTROL_FILE)));
4691
4692         errno = 0;
4693         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4694         {
4695                 /* if write didn't set errno, assume problem is no disk space */
4696                 if (errno == 0)
4697                         errno = ENOSPC;
4698                 ereport(PANIC,
4699                                 (errcode_for_file_access(),
4700                                  errmsg("could not write to control file: %m")));
4701         }
4702
4703         if (pg_fsync(fd) != 0)
4704                 ereport(PANIC,
4705                                 (errcode_for_file_access(),
4706                                  errmsg("could not fsync control file: %m")));
4707
4708         if (close(fd))
4709                 ereport(PANIC,
4710                                 (errcode_for_file_access(),
4711                                  errmsg("could not close control file: %m")));
4712 }
4713
4714 static void
4715 ReadControlFile(void)
4716 {
4717         pg_crc32        crc;
4718         int                     fd;
4719
4720         /*
4721          * Read data...
4722          */
4723         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4724                                            O_RDWR | PG_BINARY,
4725                                            S_IRUSR | S_IWUSR);
4726         if (fd < 0)
4727                 ereport(PANIC,
4728                                 (errcode_for_file_access(),
4729                                  errmsg("could not open control file \"%s\": %m",
4730                                                 XLOG_CONTROL_FILE)));
4731
4732         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4733                 ereport(PANIC,
4734                                 (errcode_for_file_access(),
4735                                  errmsg("could not read from control file: %m")));
4736
4737         close(fd);
4738
4739         /*
4740          * Check for expected pg_control format version.  If this is wrong, the
4741          * CRC check will likely fail because we'll be checking the wrong number
4742          * of bytes.  Complaining about wrong version will probably be more
4743          * enlightening than complaining about wrong CRC.
4744          */
4745
4746         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4747                 ereport(FATAL,
4748                                 (errmsg("database files are incompatible with server"),
4749                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4750                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4751                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4752                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4753                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4754
4755         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4756                 ereport(FATAL,
4757                                 (errmsg("database files are incompatible with server"),
4758                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4759                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4760                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4761                                  errhint("It looks like you need to initdb.")));
4762
4763         /* Now check the CRC. */
4764         INIT_CRC32(crc);
4765         COMP_CRC32(crc,
4766                            (char *) ControlFile,
4767                            offsetof(ControlFileData, crc));
4768         FIN_CRC32(crc);
4769
4770         if (!EQ_CRC32(crc, ControlFile->crc))
4771                 ereport(FATAL,
4772                                 (errmsg("incorrect checksum in control file")));
4773
4774         /*
4775          * Do compatibility checking immediately.  If the database isn't
4776          * compatible with the backend executable, we want to abort before we can
4777          * possibly do any damage.
4778          */
4779         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4780                 ereport(FATAL,
4781                                 (errmsg("database files are incompatible with server"),
4782                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4783                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4784                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4785                                  errhint("It looks like you need to initdb.")));
4786         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4787                 ereport(FATAL,
4788                                 (errmsg("database files are incompatible with server"),
4789                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4790                                          " but the server was compiled with MAXALIGN %d.",
4791                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4792                                  errhint("It looks like you need to initdb.")));
4793         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4794                 ereport(FATAL,
4795                                 (errmsg("database files are incompatible with server"),
4796                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4797                                  errhint("It looks like you need to initdb.")));
4798         if (ControlFile->blcksz != BLCKSZ)
4799                 ereport(FATAL,
4800                                 (errmsg("database files are incompatible with server"),
4801                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4802                                            " but the server was compiled with BLCKSZ %d.",
4803                                            ControlFile->blcksz, BLCKSZ),
4804                                  errhint("It looks like you need to recompile or initdb.")));
4805         if (ControlFile->relseg_size != RELSEG_SIZE)
4806                 ereport(FATAL,
4807                                 (errmsg("database files are incompatible with server"),
4808                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4809                                   " but the server was compiled with RELSEG_SIZE %d.",
4810                                   ControlFile->relseg_size, RELSEG_SIZE),
4811                                  errhint("It looks like you need to recompile or initdb.")));
4812         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4813                 ereport(FATAL,
4814                                 (errmsg("database files are incompatible with server"),
4815                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4816                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4817                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4818                                  errhint("It looks like you need to recompile or initdb.")));
4819         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4820                 ereport(FATAL,
4821                                 (errmsg("database files are incompatible with server"),
4822                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4823                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4824                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4825                                  errhint("It looks like you need to recompile or initdb.")));
4826         if (ControlFile->nameDataLen != NAMEDATALEN)
4827                 ereport(FATAL,
4828                                 (errmsg("database files are incompatible with server"),
4829                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4830                                   " but the server was compiled with NAMEDATALEN %d.",
4831                                   ControlFile->nameDataLen, NAMEDATALEN),
4832                                  errhint("It looks like you need to recompile or initdb.")));
4833         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4834                 ereport(FATAL,
4835                                 (errmsg("database files are incompatible with server"),
4836                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4837                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4838                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4839                                  errhint("It looks like you need to recompile or initdb.")));
4840         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4841                 ereport(FATAL,
4842                                 (errmsg("database files are incompatible with server"),
4843                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4844                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4845                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4846                                  errhint("It looks like you need to recompile or initdb.")));
4847
4848 #ifdef HAVE_INT64_TIMESTAMP
4849         if (ControlFile->enableIntTimes != true)
4850                 ereport(FATAL,
4851                                 (errmsg("database files are incompatible with server"),
4852                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4853                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4854                                  errhint("It looks like you need to recompile or initdb.")));
4855 #else
4856         if (ControlFile->enableIntTimes != false)
4857                 ereport(FATAL,
4858                                 (errmsg("database files are incompatible with server"),
4859                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4860                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4861                                  errhint("It looks like you need to recompile or initdb.")));
4862 #endif
4863
4864 #ifdef USE_FLOAT4_BYVAL
4865         if (ControlFile->float4ByVal != true)
4866                 ereport(FATAL,
4867                                 (errmsg("database files are incompatible with server"),
4868                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4869                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4870                                  errhint("It looks like you need to recompile or initdb.")));
4871 #else
4872         if (ControlFile->float4ByVal != false)
4873                 ereport(FATAL,
4874                                 (errmsg("database files are incompatible with server"),
4875                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4876                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4877                                  errhint("It looks like you need to recompile or initdb.")));
4878 #endif
4879
4880 #ifdef USE_FLOAT8_BYVAL
4881         if (ControlFile->float8ByVal != true)
4882                 ereport(FATAL,
4883                                 (errmsg("database files are incompatible with server"),
4884                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4885                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4886                                  errhint("It looks like you need to recompile or initdb.")));
4887 #else
4888         if (ControlFile->float8ByVal != false)
4889                 ereport(FATAL,
4890                                 (errmsg("database files are incompatible with server"),
4891                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4892                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4893                                  errhint("It looks like you need to recompile or initdb.")));
4894 #endif
4895
4896         /* Make the fixed  settings visible as GUC variables, too */
4897         SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4898                                         PGC_INTERNAL, PGC_S_OVERRIDE);
4899 }
4900
4901 void
4902 UpdateControlFile(void)
4903 {
4904         int                     fd;
4905
4906         INIT_CRC32(ControlFile->crc);
4907         COMP_CRC32(ControlFile->crc,
4908                            (char *) ControlFile,
4909                            offsetof(ControlFileData, crc));
4910         FIN_CRC32(ControlFile->crc);
4911
4912         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4913                                            O_RDWR | PG_BINARY,
4914                                            S_IRUSR | S_IWUSR);
4915         if (fd < 0)
4916                 ereport(PANIC,
4917                                 (errcode_for_file_access(),
4918                                  errmsg("could not open control file \"%s\": %m",
4919                                                 XLOG_CONTROL_FILE)));
4920
4921         errno = 0;
4922         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4923         {
4924                 /* if write didn't set errno, assume problem is no disk space */
4925                 if (errno == 0)
4926                         errno = ENOSPC;
4927                 ereport(PANIC,
4928                                 (errcode_for_file_access(),
4929                                  errmsg("could not write to control file: %m")));
4930         }
4931
4932         if (pg_fsync(fd) != 0)
4933                 ereport(PANIC,
4934                                 (errcode_for_file_access(),
4935                                  errmsg("could not fsync control file: %m")));
4936
4937         if (close(fd))
4938                 ereport(PANIC,
4939                                 (errcode_for_file_access(),
4940                                  errmsg("could not close control file: %m")));
4941 }
4942
4943 /*
4944  * Returns the unique system identifier from control file.
4945  */
4946 uint64
4947 GetSystemIdentifier(void)
4948 {
4949         Assert(ControlFile != NULL);
4950         return ControlFile->system_identifier;
4951 }
4952
4953 /*
4954  * Are checksums enabled for data pages?
4955  */
4956 bool
4957 DataChecksumsEnabled(void)
4958 {
4959         Assert(ControlFile != NULL);
4960         return (ControlFile->data_checksum_version > 0);
4961 }
4962
4963 /*
4964  * Returns a fake LSN for unlogged relations.
4965  *
4966  * Each call generates an LSN that is greater than any previous value
4967  * returned. The current counter value is saved and restored across clean
4968  * shutdowns, but like unlogged relations, does not survive a crash. This can
4969  * be used in lieu of real LSN values returned by XLogInsert, if you need an
4970  * LSN-like increasing sequence of numbers without writing any WAL.
4971  */
4972 XLogRecPtr
4973 GetFakeLSNForUnloggedRel(void)
4974 {
4975         XLogRecPtr      nextUnloggedLSN;
4976
4977         /* use volatile pointer to prevent code rearrangement */
4978         volatile XLogCtlData *xlogctl = XLogCtl;
4979
4980         /* increment the unloggedLSN counter, need SpinLock */
4981         SpinLockAcquire(&xlogctl->ulsn_lck);
4982         nextUnloggedLSN = xlogctl->unloggedLSN++;
4983         SpinLockRelease(&xlogctl->ulsn_lck);
4984
4985         return nextUnloggedLSN;
4986 }
4987
4988 /*
4989  * Auto-tune the number of XLOG buffers.
4990  *
4991  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4992  * a maximum of one XLOG segment (there is little reason to think that more
4993  * is helpful, at least so long as we force an fsync when switching log files)
4994  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4995  * 9.1, when auto-tuning was added).
4996  *
4997  * This should not be called until NBuffers has received its final value.
4998  */
4999 static int
5000 XLOGChooseNumBuffers(void)
5001 {
5002         int                     xbuffers;
5003
5004         xbuffers = NBuffers / 32;
5005         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
5006                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
5007         if (xbuffers < 8)
5008                 xbuffers = 8;
5009         return xbuffers;
5010 }
5011
5012 /*
5013  * GUC check_hook for wal_buffers
5014  */
5015 bool
5016 check_wal_buffers(int *newval, void **extra, GucSource source)
5017 {
5018         /*
5019          * -1 indicates a request for auto-tune.
5020          */
5021         if (*newval == -1)
5022         {
5023                 /*
5024                  * If we haven't yet changed the boot_val default of -1, just let it
5025                  * be.  We'll fix it when XLOGShmemSize is called.
5026                  */
5027                 if (XLOGbuffers == -1)
5028                         return true;
5029
5030                 /* Otherwise, substitute the auto-tune value */
5031                 *newval = XLOGChooseNumBuffers();
5032         }
5033
5034         /*
5035          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
5036          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
5037          * the case, we just silently treat such values as a request for the
5038          * minimum.  (We could throw an error instead, but that doesn't seem very
5039          * helpful.)
5040          */
5041         if (*newval < 4)
5042                 *newval = 4;
5043
5044         return true;
5045 }
5046
5047 /*
5048  * Initialization of shared memory for XLOG
5049  */
5050 Size
5051 XLOGShmemSize(void)
5052 {
5053         Size            size;
5054
5055         /*
5056          * If the value of wal_buffers is -1, use the preferred auto-tune value.
5057          * This isn't an amazingly clean place to do this, but we must wait till
5058          * NBuffers has received its final value, and must do it before using the
5059          * value of XLOGbuffers to do anything important.
5060          */
5061         if (XLOGbuffers == -1)
5062         {
5063                 char            buf[32];
5064
5065                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
5066                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
5067         }
5068         Assert(XLOGbuffers > 0);
5069
5070         /* XLogCtl */
5071         size = sizeof(XLogCtlData);
5072
5073         /* xlog insertion slots, plus alignment */
5074         size = add_size(size, mul_size(sizeof(XLogInsertSlotPadded), num_xloginsert_slots + 1));
5075         /* xlblocks array */
5076         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
5077         /* extra alignment padding for XLOG I/O buffers */
5078         size = add_size(size, XLOG_BLCKSZ);
5079         /* and the buffers themselves */
5080         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
5081
5082         /*
5083          * Note: we don't count ControlFileData, it comes out of the "slop factor"
5084          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
5085          * routine again below to compute the actual allocation size.
5086          */
5087
5088         return size;
5089 }
5090
5091 void
5092 XLOGShmemInit(void)
5093 {
5094         bool            foundCFile,
5095                                 foundXLog;
5096         char       *allocptr;
5097         int                     i;
5098
5099         ControlFile = (ControlFileData *)
5100                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5101         XLogCtl = (XLogCtlData *)
5102                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
5103
5104         if (foundCFile || foundXLog)
5105         {
5106                 /* both should be present or neither */
5107                 Assert(foundCFile && foundXLog);
5108                 return;
5109         }
5110         memset(XLogCtl, 0, sizeof(XLogCtlData));
5111
5112         /*
5113          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5114          * multiple of the alignment for same, so no extra alignment padding is
5115          * needed here.
5116          */
5117         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5118         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
5119         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
5120         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
5121
5122         /* Xlog insertion slots. Ensure they're aligned to the full padded size */
5123         allocptr += sizeof(XLogInsertSlotPadded) -
5124                 ((uintptr_t) allocptr) % sizeof(XLogInsertSlotPadded);
5125         XLogCtl->Insert.insertSlots = (XLogInsertSlotPadded *) allocptr;
5126         allocptr += sizeof(XLogInsertSlotPadded) * num_xloginsert_slots;
5127
5128         /*
5129          * Align the start of the page buffers to a full xlog block size boundary.
5130          * This simplifies some calculations in XLOG insertion. It is also required
5131          * for O_DIRECT.
5132          */
5133         allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
5134         XLogCtl->pages = allocptr;
5135         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5136
5137         /*
5138          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5139          * in additional info.)
5140          */
5141         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5142         XLogCtl->SharedRecoveryInProgress = true;
5143         XLogCtl->SharedHotStandbyActive = false;
5144         XLogCtl->WalWriterSleeping = false;
5145
5146         for (i = 0; i < num_xloginsert_slots; i++)
5147         {
5148                 XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
5149                 SpinLockInit(&slot->mutex);
5150                 slot->xlogInsertingAt = InvalidXLogRecPtr;
5151                 slot->owner = NULL;
5152
5153                 slot->releaseOK = true;
5154                 slot->exclusive = 0;
5155                 slot->head = NULL;
5156                 slot->tail = NULL;
5157         }
5158
5159         SpinLockInit(&XLogCtl->Insert.insertpos_lck);
5160         SpinLockInit(&XLogCtl->info_lck);
5161         SpinLockInit(&XLogCtl->ulsn_lck);
5162         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
5163
5164         /*
5165          * If we are not in bootstrap mode, pg_control should already exist. Read
5166          * and validate it immediately (see comments in ReadControlFile() for the
5167          * reasons why).
5168          */
5169         if (!IsBootstrapProcessingMode())
5170                 ReadControlFile();
5171 }
5172
5173 /*
5174  * This func must be called ONCE on system install.  It creates pg_control
5175  * and the initial XLOG segment.
5176  */
5177 void
5178 BootStrapXLOG(void)
5179 {
5180         CheckPoint      checkPoint;
5181         char       *buffer;
5182         XLogPageHeader page;
5183         XLogLongPageHeader longpage;
5184         XLogRecord *record;
5185         bool            use_existent;
5186         uint64          sysidentifier;
5187         struct timeval tv;
5188         pg_crc32        crc;
5189
5190         /*
5191          * Select a hopefully-unique system identifier code for this installation.
5192          * We use the result of gettimeofday(), including the fractional seconds
5193          * field, as being about as unique as we can easily get.  (Think not to
5194          * use random(), since it hasn't been seeded and there's no portable way
5195          * to seed it other than the system clock value...)  The upper half of the
5196          * uint64 value is just the tv_sec part, while the lower half is the XOR
5197          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
5198          * unnecessarily if "uint64" is really only 32 bits wide.  A person
5199          * knowing this encoding can determine the initialization time of the
5200          * installation, which could perhaps be useful sometimes.
5201          */
5202         gettimeofday(&tv, NULL);
5203         sysidentifier = ((uint64) tv.tv_sec) << 32;
5204         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
5205
5206         /* First timeline ID is always 1 */
5207         ThisTimeLineID = 1;
5208
5209         /* page buffer must be aligned suitably for O_DIRECT */
5210         buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5211         page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5212         memset(page, 0, XLOG_BLCKSZ);
5213
5214         /*
5215          * Set up information for the initial checkpoint record
5216          *
5217          * The initial checkpoint record is written to the beginning of the WAL
5218          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5219          * used, so that we can use 0/0 to mean "before any valid WAL segment".
5220          */
5221         checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
5222         checkPoint.ThisTimeLineID = ThisTimeLineID;
5223         checkPoint.PrevTimeLineID = ThisTimeLineID;
5224         checkPoint.fullPageWrites = fullPageWrites;
5225         checkPoint.nextXidEpoch = 0;
5226         checkPoint.nextXid = FirstNormalTransactionId;
5227         checkPoint.nextOid = FirstBootstrapObjectId;
5228         checkPoint.nextMulti = FirstMultiXactId;
5229         checkPoint.nextMultiOffset = 0;
5230         checkPoint.oldestXid = FirstNormalTransactionId;
5231         checkPoint.oldestXidDB = TemplateDbOid;
5232         checkPoint.oldestMulti = FirstMultiXactId;
5233         checkPoint.oldestMultiDB = TemplateDbOid;
5234         checkPoint.time = (pg_time_t) time(NULL);
5235         checkPoint.oldestActiveXid = InvalidTransactionId;
5236
5237         ShmemVariableCache->nextXid = checkPoint.nextXid;
5238         ShmemVariableCache->nextOid = checkPoint.nextOid;
5239         ShmemVariableCache->oidCount = 0;
5240         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5241         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5242         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
5243
5244         /* Set up the XLOG page header */
5245         page->xlp_magic = XLOG_PAGE_MAGIC;
5246         page->xlp_info = XLP_LONG_HEADER;
5247         page->xlp_tli = ThisTimeLineID;
5248         page->xlp_pageaddr = XLogSegSize;
5249         longpage = (XLogLongPageHeader) page;
5250         longpage->xlp_sysid = sysidentifier;
5251         longpage->xlp_seg_size = XLogSegSize;
5252         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5253
5254         /* Insert the initial checkpoint record */
5255         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
5256         record->xl_prev = 0;
5257         record->xl_xid = InvalidTransactionId;
5258         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
5259         record->xl_len = sizeof(checkPoint);
5260         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5261         record->xl_rmid = RM_XLOG_ID;
5262         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
5263
5264         INIT_CRC32(crc);
5265         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
5266         COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5267         FIN_CRC32(crc);
5268         record->xl_crc = crc;
5269
5270         /* Create first XLOG segment file */
5271         use_existent = false;
5272         openLogFile = XLogFileInit(1, &use_existent, false);
5273
5274         /* Write the first page with the initial record */
5275         errno = 0;
5276         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5277         {
5278                 /* if write didn't set errno, assume problem is no disk space */
5279                 if (errno == 0)
5280                         errno = ENOSPC;
5281                 ereport(PANIC,
5282                                 (errcode_for_file_access(),
5283                           errmsg("could not write bootstrap transaction log file: %m")));
5284         }
5285
5286         if (pg_fsync(openLogFile) != 0)
5287                 ereport(PANIC,
5288                                 (errcode_for_file_access(),
5289                           errmsg("could not fsync bootstrap transaction log file: %m")));
5290
5291         if (close(openLogFile))
5292                 ereport(PANIC,
5293                                 (errcode_for_file_access(),
5294                           errmsg("could not close bootstrap transaction log file: %m")));
5295
5296         openLogFile = -1;
5297
5298         /* Now create pg_control */
5299
5300         memset(ControlFile, 0, sizeof(ControlFileData));
5301         /* Initialize pg_control status fields */
5302         ControlFile->system_identifier = sysidentifier;
5303         ControlFile->state = DB_SHUTDOWNED;
5304         ControlFile->time = checkPoint.time;
5305         ControlFile->checkPoint = checkPoint.redo;
5306         ControlFile->checkPointCopy = checkPoint;
5307         ControlFile->unloggedLSN = 1;
5308
5309         /* Set important parameter values for use when replaying WAL */
5310         ControlFile->MaxConnections = MaxConnections;
5311         ControlFile->max_worker_processes = max_worker_processes;
5312         ControlFile->max_prepared_xacts = max_prepared_xacts;
5313         ControlFile->max_locks_per_xact = max_locks_per_xact;
5314         ControlFile->wal_level = wal_level;
5315         ControlFile->wal_log_hints = wal_log_hints;
5316         ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5317
5318         /* some additional ControlFile fields are set in WriteControlFile() */
5319
5320         WriteControlFile();
5321
5322         /* Bootstrap the commit log, too */
5323         BootStrapCLOG();
5324         BootStrapSUBTRANS();
5325         BootStrapMultiXact();
5326
5327         pfree(buffer);
5328 }
5329
5330 static char *
5331 str_time(pg_time_t tnow)
5332 {
5333         static char buf[128];
5334
5335         pg_strftime(buf, sizeof(buf),
5336                                 "%Y-%m-%d %H:%M:%S %Z",
5337                                 pg_localtime(&tnow, log_timezone));
5338
5339         return buf;
5340 }
5341
5342 /*
5343  * See if there is a recovery command file (recovery.conf), and if so
5344  * read in parameters for archive recovery and XLOG streaming.
5345  *
5346  * The file is parsed using the main configuration parser.
5347  */
5348 static void
5349 readRecoveryCommandFile(void)
5350 {
5351         FILE       *fd;
5352         TimeLineID      rtli = 0;
5353         bool            rtliGiven = false;
5354         ConfigVariable *item,
5355                            *head = NULL,
5356                            *tail = NULL;
5357
5358         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5359         if (fd == NULL)
5360         {
5361                 if (errno == ENOENT)
5362                         return;                         /* not there, so no archive recovery */
5363                 ereport(FATAL,
5364                                 (errcode_for_file_access(),
5365                                  errmsg("could not open recovery command file \"%s\": %m",
5366                                                 RECOVERY_COMMAND_FILE)));
5367         }
5368
5369         /*
5370          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5371          * no need to check the return value.
5372          */
5373         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5374
5375         FreeFile(fd);
5376
5377         for (item = head; item; item = item->next)
5378         {
5379                 if (strcmp(item->name, "restore_command") == 0)
5380                 {
5381                         recoveryRestoreCommand = pstrdup(item->value);
5382                         ereport(DEBUG2,
5383                                         (errmsg_internal("restore_command = '%s'",
5384                                                                          recoveryRestoreCommand)));
5385                 }
5386                 else if (strcmp(item->name, "recovery_end_command") == 0)
5387                 {
5388                         recoveryEndCommand = pstrdup(item->value);
5389                         ereport(DEBUG2,
5390                                         (errmsg_internal("recovery_end_command = '%s'",
5391                                                                          recoveryEndCommand)));
5392                 }
5393                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5394                 {
5395                         archiveCleanupCommand = pstrdup(item->value);
5396                         ereport(DEBUG2,
5397                                         (errmsg_internal("archive_cleanup_command = '%s'",
5398                                                                          archiveCleanupCommand)));
5399                 }
5400                 else if (strcmp(item->name, "pause_at_recovery_target") == 0)
5401                 {
5402                         if (!parse_bool(item->value, &recoveryPauseAtTarget))
5403                                 ereport(ERROR,
5404                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5405                                                  errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
5406                         ereport(DEBUG2,
5407                                         (errmsg_internal("pause_at_recovery_target = '%s'",
5408                                                                          item->value)));
5409                 }
5410                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5411                 {
5412                         rtliGiven = true;
5413                         if (strcmp(item->value, "latest") == 0)
5414                                 rtli = 0;
5415                         else
5416                         {
5417                                 errno = 0;
5418                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5419                                 if (errno == EINVAL || errno == ERANGE)
5420                                         ereport(FATAL,
5421                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5422                                                                         item->value)));
5423                         }
5424                         if (rtli)
5425                                 ereport(DEBUG2,
5426                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
5427                         else
5428                                 ereport(DEBUG2,
5429                                          (errmsg_internal("recovery_target_timeline = latest")));
5430                 }
5431                 else if (strcmp(item->name, "recovery_target_xid") == 0)
5432                 {
5433                         errno = 0;
5434                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5435                         if (errno == EINVAL || errno == ERANGE)
5436                                 ereport(FATAL,
5437                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
5438                                                  item->value)));
5439                         ereport(DEBUG2,
5440                                         (errmsg_internal("recovery_target_xid = %u",
5441                                                                          recoveryTargetXid)));
5442                         recoveryTarget = RECOVERY_TARGET_XID;
5443                 }
5444                 else if (strcmp(item->name, "recovery_target_time") == 0)
5445                 {
5446                         recoveryTarget = RECOVERY_TARGET_TIME;
5447
5448                         /*
5449                          * Convert the time string given by the user to TimestampTz form.
5450                          */
5451                         recoveryTargetTime =
5452                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5453                                                                                                 CStringGetDatum(item->value),
5454                                                                                                 ObjectIdGetDatum(InvalidOid),
5455                                                                                                                 Int32GetDatum(-1)));
5456                         ereport(DEBUG2,
5457                                         (errmsg_internal("recovery_target_time = '%s'",
5458                                                                    timestamptz_to_str(recoveryTargetTime))));
5459                 }
5460                 else if (strcmp(item->name, "recovery_target_name") == 0)
5461                 {
5462                         recoveryTarget = RECOVERY_TARGET_NAME;
5463
5464                         recoveryTargetName = pstrdup(item->value);
5465                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5466                                 ereport(FATAL,
5467                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5468                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
5469                                                                 MAXFNAMELEN - 1)));
5470
5471                         ereport(DEBUG2,
5472                                         (errmsg_internal("recovery_target_name = '%s'",
5473                                                                          recoveryTargetName)));
5474                 }
5475                 else if (strcmp(item->name, "recovery_target") == 0)
5476                 {
5477                         if (strcmp(item->value, "immediate") == 0)
5478                                 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
5479                         else
5480                                 ereport(ERROR,
5481                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5482                                                  errmsg("invalid recovery_target parameter"),
5483                                                  errhint("The only allowed value is 'immediate'")));
5484                         ereport(DEBUG2,
5485                                         (errmsg_internal("recovery_target = '%s'",
5486                                                                          item->value)));
5487                 }
5488                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5489                 {
5490                         /*
5491                          * does nothing if a recovery_target is not also set
5492                          */
5493                         if (!parse_bool(item->value, &recoveryTargetInclusive))
5494                                 ereport(ERROR,
5495                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5496                                                  errmsg("parameter \"%s\" requires a Boolean value",
5497                                                                 "recovery_target_inclusive")));
5498                         ereport(DEBUG2,
5499                                         (errmsg_internal("recovery_target_inclusive = %s",
5500                                                                          item->value)));
5501                 }
5502                 else if (strcmp(item->name, "standby_mode") == 0)
5503                 {
5504                         if (!parse_bool(item->value, &StandbyModeRequested))
5505                                 ereport(ERROR,
5506                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5507                                                  errmsg("parameter \"%s\" requires a Boolean value",
5508                                                                 "standby_mode")));
5509                         ereport(DEBUG2,
5510                                         (errmsg_internal("standby_mode = '%s'", item->value)));
5511                 }
5512                 else if (strcmp(item->name, "primary_conninfo") == 0)
5513                 {
5514                         PrimaryConnInfo = pstrdup(item->value);
5515                         ereport(DEBUG2,
5516                                         (errmsg_internal("primary_conninfo = '%s'",
5517                                                                          PrimaryConnInfo)));
5518                 }
5519                 else if (strcmp(item->name, "primary_slotname") == 0)
5520                 {
5521                         ReplicationSlotValidateName(item->value, ERROR);
5522                         PrimarySlotName = pstrdup(item->value);
5523                         ereport(DEBUG2,
5524                                         (errmsg_internal("primary_slotname = '%s'",
5525                                                                          PrimarySlotName)));
5526                 }
5527                 else if (strcmp(item->name, "trigger_file") == 0)
5528                 {
5529                         TriggerFile = pstrdup(item->value);
5530                         ereport(DEBUG2,
5531                                         (errmsg_internal("trigger_file = '%s'",
5532                                                                          TriggerFile)));
5533                 }
5534                 else if (strcmp(item->name, "min_recovery_apply_delay") == 0)
5535                 {
5536                         const char *hintmsg;
5537
5538                         if (!parse_int(item->value, &min_recovery_apply_delay, GUC_UNIT_MS,
5539                                         &hintmsg))
5540                                 ereport(ERROR,
5541                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5542                                                  errmsg("parameter \"%s\" requires a temporal value", "min_recovery_apply_delay"),
5543                                                  hintmsg ? errhint("%s", _(hintmsg)) : 0));
5544                         ereport(DEBUG2,
5545                                         (errmsg("min_recovery_apply_delay = '%s'", item->value)));
5546                 }
5547                 else
5548                         ereport(FATAL,
5549                                         (errmsg("unrecognized recovery parameter \"%s\"",
5550                                                         item->name)));
5551         }
5552
5553         /*
5554          * Check for compulsory parameters
5555          */
5556         if (StandbyModeRequested)
5557         {
5558                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5559                         ereport(WARNING,
5560                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5561                                                         RECOVERY_COMMAND_FILE),
5562                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
5563         }
5564         else
5565         {
5566                 if (recoveryRestoreCommand == NULL)
5567                         ereport(FATAL,
5568                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5569                                                         RECOVERY_COMMAND_FILE)));
5570         }
5571
5572         /* Enable fetching from archive recovery area */
5573         ArchiveRecoveryRequested = true;
5574
5575         /*
5576          * If user specified recovery_target_timeline, validate it or compute the
5577          * "latest" value.      We can't do this until after we've gotten the restore
5578          * command and set InArchiveRecovery, because we need to fetch timeline
5579          * history files from the archive.
5580          */
5581         if (rtliGiven)
5582         {
5583                 if (rtli)
5584                 {
5585                         /* Timeline 1 does not have a history file, all else should */
5586                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5587                                 ereport(FATAL,
5588                                                 (errmsg("recovery target timeline %u does not exist",
5589                                                                 rtli)));
5590                         recoveryTargetTLI = rtli;
5591                         recoveryTargetIsLatest = false;
5592                 }
5593                 else
5594                 {
5595                         /* We start the "latest" search from pg_control's timeline */
5596                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5597                         recoveryTargetIsLatest = true;
5598                 }
5599         }
5600
5601         FreeConfigVariables(head);
5602 }
5603
5604 /*
5605  * Exit archive-recovery state
5606  */
5607 static void
5608 exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
5609 {
5610         char            recoveryPath[MAXPGPATH];
5611         char            xlogpath[MAXPGPATH];
5612
5613         /*
5614          * We are no longer in archive recovery state.
5615          */
5616         InArchiveRecovery = false;
5617
5618         /*
5619          * Update min recovery point one last time.
5620          */
5621         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5622
5623         /*
5624          * If the ending log segment is still open, close it (to avoid problems on
5625          * Windows with trying to rename or delete an open file).
5626          */
5627         if (readFile >= 0)
5628         {
5629                 close(readFile);
5630                 readFile = -1;
5631         }
5632
5633         /*
5634          * If we are establishing a new timeline, we have to copy data from the
5635          * last WAL segment of the old timeline to create a starting WAL segment
5636          * for the new timeline.
5637          *
5638          * Notify the archiver that the last WAL segment of the old timeline is
5639          * ready to copy to archival storage. Otherwise, it is not archived for a
5640          * while.
5641          */
5642         if (endTLI != ThisTimeLineID)
5643         {
5644                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo);
5645
5646                 if (XLogArchivingActive())
5647                 {
5648                         XLogFileName(xlogpath, endTLI, endLogSegNo);
5649                         XLogArchiveNotify(xlogpath);
5650                 }
5651         }
5652
5653         /*
5654          * Let's just make real sure there are not .ready or .done flags posted
5655          * for the new segment.
5656          */
5657         XLogFileName(xlogpath, ThisTimeLineID, endLogSegNo);
5658         XLogArchiveCleanup(xlogpath);
5659
5660         /*
5661          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5662          * of it.
5663          */
5664         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5665         unlink(recoveryPath);           /* ignore any error */
5666
5667         /* Get rid of any remaining recovered timeline-history file, too */
5668         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5669         unlink(recoveryPath);           /* ignore any error */
5670
5671         /*
5672          * Rename the config file out of the way, so that we don't accidentally
5673          * re-enter archive recovery mode in a subsequent crash.
5674          */
5675         unlink(RECOVERY_COMMAND_DONE);
5676         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5677                 ereport(FATAL,
5678                                 (errcode_for_file_access(),
5679                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5680                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5681
5682         ereport(LOG,
5683                         (errmsg("archive recovery complete")));
5684 }
5685
5686 /*
5687  * Extract timestamp from WAL record.
5688  *
5689  * If the record contains a timestamp, returns true, and saves the timestamp
5690  * in *recordXtime. If the record type has no timestamp, returns false.
5691  * Currently, only transaction commit/abort records and restore points contain
5692  * timestamps.
5693  */
5694 static bool
5695 getRecordTimestamp(XLogRecord *record, TimestampTz *recordXtime)
5696 {
5697         uint8           record_info = record->xl_info & ~XLR_INFO_MASK;
5698
5699         if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5700         {
5701                 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5702                 return true;
5703         }
5704         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
5705         {
5706                 *recordXtime = ((xl_xact_commit_compact *) XLogRecGetData(record))->xact_time;
5707                 return true;
5708         }
5709         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
5710         {
5711                 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5712                 return true;
5713         }
5714         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
5715         {
5716                 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5717                 return true;
5718         }
5719         return false;
5720 }
5721
5722 /*
5723  * For point-in-time recovery, this function decides whether we want to
5724  * stop applying the XLOG before the current record.
5725  *
5726  * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
5727  * information is saved in recoveryStopXid et al for use in annotating the
5728  * new timeline's history file.
5729  */
5730 static bool
5731 recoveryStopsBefore(XLogRecord *record)
5732 {
5733         bool            stopsHere = false;
5734         uint8           record_info;
5735         bool            isCommit;
5736         TimestampTz recordXtime = 0;
5737
5738         /* Check if we should stop as soon as reaching consistency */
5739         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5740         {
5741                 ereport(LOG,
5742                                 (errmsg("recovery stopping after reaching consistency")));
5743
5744                 recoveryStopAfter = false;
5745                 recoveryStopXid = InvalidTransactionId;
5746                 recoveryStopTime = 0;
5747                 recoveryStopName[0] = '\0';
5748                 return true;
5749         }
5750
5751         /* Otherwise we only consider stopping before COMMIT or ABORT records. */
5752         if (record->xl_rmid != RM_XACT_ID)
5753                 return false;
5754         record_info = record->xl_info & ~XLR_INFO_MASK;
5755         if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5756                 isCommit = true;
5757         else if (record_info == XLOG_XACT_ABORT)
5758                 isCommit = false;
5759         else
5760                 return false;
5761
5762         if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
5763         {
5764                 /*
5765                  * There can be only one transaction end record with this exact
5766                  * transactionid
5767                  *
5768                  * when testing for an xid, we MUST test for equality only, since
5769                  * transactions are numbered in the order they start, not the order
5770                  * they complete. A higher numbered xid will complete before you
5771                  * about 50% of the time...
5772                  */
5773                 stopsHere = (record->xl_xid == recoveryTargetXid);
5774         }
5775
5776         if (recoveryTarget == RECOVERY_TARGET_TIME &&
5777                 getRecordTimestamp(record, &recordXtime))
5778         {
5779                 /*
5780                  * There can be many transactions that share the same commit time, so
5781                  * we stop after the last one, if we are inclusive, or stop at the
5782                  * first one if we are exclusive
5783                  */
5784                 if (recoveryTargetInclusive)
5785                         stopsHere = (recordXtime > recoveryTargetTime);
5786                 else
5787                         stopsHere = (recordXtime >= recoveryTargetTime);
5788         }
5789
5790         if (stopsHere)
5791         {
5792                 recoveryStopAfter = false;
5793                 recoveryStopXid = record->xl_xid;
5794                 recoveryStopTime = recordXtime;
5795                 recoveryStopName[0] = '\0';
5796
5797                 if (isCommit)
5798                 {
5799                         ereport(LOG,
5800                                         (errmsg("recovery stopping before commit of transaction %u, time %s",
5801                                                         recoveryStopXid,
5802                                                         timestamptz_to_str(recoveryStopTime))));
5803                 }
5804                 else
5805                 {
5806                         ereport(LOG,
5807                                         (errmsg("recovery stopping before abort of transaction %u, time %s",
5808                                                         recoveryStopXid,
5809                                                         timestamptz_to_str(recoveryStopTime))));
5810                 }
5811         }
5812
5813         return stopsHere;
5814 }
5815
5816 /*
5817  * Same as recoveryStopsBefore, but called after applying the record.
5818  *
5819  * We also track the timestamp of the latest applied COMMIT/ABORT
5820  * record in XLogCtl->recoveryLastXTime.
5821  */
5822 static bool
5823 recoveryStopsAfter(XLogRecord *record)
5824 {
5825         uint8           record_info;
5826         TimestampTz recordXtime;
5827
5828         record_info = record->xl_info & ~XLR_INFO_MASK;
5829
5830         /*
5831          * There can be many restore points that share the same name; we stop
5832          * at the first one.
5833          */
5834         if (recoveryTarget == RECOVERY_TARGET_NAME &&
5835                 record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5836         {
5837                 xl_restore_point *recordRestorePointData;
5838
5839                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5840
5841                 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5842                 {
5843                         recoveryStopAfter = true;
5844                         recoveryStopXid = InvalidTransactionId;
5845                         (void) getRecordTimestamp(record, &recoveryStopTime);
5846                         strncpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5847
5848                         ereport(LOG,
5849                                         (errmsg("recovery stopping at restore point \"%s\", time %s",
5850                                                         recoveryStopName,
5851                                                         timestamptz_to_str(recoveryStopTime))));
5852                         return true;
5853                 }
5854         }
5855
5856         if (record->xl_rmid == RM_XACT_ID &&
5857                 (record_info == XLOG_XACT_COMMIT_COMPACT ||
5858                  record_info == XLOG_XACT_COMMIT ||
5859                  record_info == XLOG_XACT_ABORT))
5860         {
5861                 /* Update the last applied transaction timestamp */
5862                 if (getRecordTimestamp(record, &recordXtime))
5863                         SetLatestXTime(recordXtime);
5864
5865                 /*
5866                  * There can be only one transaction end record with this exact
5867                  * transactionid
5868                  *
5869                  * when testing for an xid, we MUST test for equality only, since
5870                  * transactions are numbered in the order they start, not the order
5871                  * they complete. A higher numbered xid will complete before you about
5872                  * 50% of the time...
5873                  */
5874                 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
5875                         record->xl_xid == recoveryTargetXid)
5876                 {
5877                         recoveryStopAfter = true;
5878                         recoveryStopXid = record->xl_xid;
5879                         recoveryStopTime = recordXtime;
5880                         recoveryStopName[0] = '\0';
5881
5882                         if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5883                         {
5884                                 ereport(LOG,
5885                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5886                                                                 recoveryStopXid,
5887                                                                 timestamptz_to_str(recoveryStopTime))));
5888                         }
5889                         else if (record_info == XLOG_XACT_ABORT)
5890                         {
5891                                 ereport(LOG,
5892                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5893                                                                 recoveryStopXid,
5894                                                                 timestamptz_to_str(recoveryStopTime))));
5895                         }
5896                         return true;
5897                 }
5898         }
5899
5900         /* Check if we should stop as soon as reaching consistency */
5901         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5902         {
5903                 ereport(LOG,
5904                                 (errmsg("recovery stopping after reaching consistency")));
5905
5906                 recoveryStopAfter = true;
5907                 recoveryStopXid = InvalidTransactionId;
5908                 recoveryStopTime = 0;
5909                 recoveryStopName[0] = '\0';
5910                 return true;
5911         }
5912
5913         return false;
5914 }
5915
5916 /*
5917  * Wait until shared recoveryPause flag is cleared.
5918  *
5919  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5920  * Probably not worth the trouble though.  This state shouldn't be one that
5921  * anyone cares about server power consumption in.
5922  */
5923 static void
5924 recoveryPausesHere(void)
5925 {
5926         /* Don't pause unless users can connect! */
5927         if (!LocalHotStandbyActive)
5928                 return;
5929
5930         ereport(LOG,
5931                         (errmsg("recovery has paused"),
5932                          errhint("Execute pg_xlog_replay_resume() to continue.")));
5933
5934         while (RecoveryIsPaused())
5935         {
5936                 pg_usleep(1000000L);    /* 1000 ms */
5937                 HandleStartupProcInterrupts();
5938         }
5939 }
5940
5941 bool
5942 RecoveryIsPaused(void)
5943 {
5944         /* use volatile pointer to prevent code rearrangement */
5945         volatile XLogCtlData *xlogctl = XLogCtl;
5946         bool            recoveryPause;
5947
5948         SpinLockAcquire(&xlogctl->info_lck);
5949         recoveryPause = xlogctl->recoveryPause;
5950         SpinLockRelease(&xlogctl->info_lck);
5951
5952         return recoveryPause;
5953 }
5954
5955 void
5956 SetRecoveryPause(bool recoveryPause)
5957 {
5958         /* use volatile pointer to prevent code rearrangement */
5959         volatile XLogCtlData *xlogctl = XLogCtl;
5960
5961         SpinLockAcquire(&xlogctl->info_lck);
5962         xlogctl->recoveryPause = recoveryPause;
5963         SpinLockRelease(&xlogctl->info_lck);
5964 }
5965
5966 /*
5967  * When min_recovery_apply_delay is set, we wait long enough to make sure
5968  * certain record types are applied at least that interval behind the master.
5969  *
5970  * Returns true if we waited.
5971  *
5972  * Note that the delay is calculated between the WAL record log time and
5973  * the current time on standby. We would prefer to keep track of when this
5974  * standby received each WAL record, which would allow a more consistent
5975  * approach and one not affected by time synchronisation issues, but that
5976  * is significantly more effort and complexity for little actual gain in
5977  * usability.
5978  */
5979 static bool
5980 recoveryApplyDelay(XLogRecord *record)
5981 {
5982         uint8           record_info;
5983         TimestampTz xtime;
5984         long            secs;
5985         int                     microsecs;
5986
5987         /* nothing to do if no delay configured */
5988         if (min_recovery_apply_delay == 0)
5989                 return false;
5990
5991         /*
5992          * Is it a COMMIT record?
5993          *
5994          * We deliberately choose not to delay aborts since they have no effect
5995          * on MVCC. We already allow replay of records that don't have a
5996          * timestamp, so there is already opportunity for issues caused by early
5997          * conflicts on standbys.
5998          */
5999         record_info = record->xl_info & ~XLR_INFO_MASK;
6000         if (!(record->xl_rmid == RM_XACT_ID &&
6001                   (record_info == XLOG_XACT_COMMIT_COMPACT ||
6002                    record_info == XLOG_XACT_COMMIT)))
6003                 return false;
6004
6005         if (!getRecordTimestamp(record, &xtime))
6006                 return false;
6007
6008         recoveryDelayUntilTime =
6009                 TimestampTzPlusMilliseconds(xtime, min_recovery_apply_delay);
6010
6011         /*
6012          * Exit without arming the latch if it's already past time to apply this
6013          * record
6014          */
6015         TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6016                                                 &secs, &microsecs);
6017         if (secs <= 0 && microsecs <=0)
6018                 return false;
6019
6020         while (true)
6021         {
6022                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
6023
6024                 /* might change the trigger file's location */
6025                 HandleStartupProcInterrupts();
6026
6027                 if (CheckForStandbyTrigger())
6028                         break;
6029
6030                 /*
6031                  * Wait for difference between GetCurrentTimestamp() and
6032                  * recoveryDelayUntilTime
6033                  */
6034                 TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6035                                                         &secs, &microsecs);
6036
6037                 if (secs <= 0 && microsecs <=0)
6038                         break;
6039
6040                 elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
6041                         secs, microsecs / 1000);
6042
6043                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
6044                                         WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
6045                                         secs * 1000L + microsecs / 1000);
6046         }
6047         return true;
6048 }
6049
6050 /*
6051  * Save timestamp of latest processed commit/abort record.
6052  *
6053  * We keep this in XLogCtl, not a simple static variable, so that it can be
6054  * seen by processes other than the startup process.  Note in particular
6055  * that CreateRestartPoint is executed in the checkpointer.
6056  */
6057 static void
6058 SetLatestXTime(TimestampTz xtime)
6059 {
6060         /* use volatile pointer to prevent code rearrangement */
6061         volatile XLogCtlData *xlogctl = XLogCtl;
6062
6063         SpinLockAcquire(&xlogctl->info_lck);
6064         xlogctl->recoveryLastXTime = xtime;
6065         SpinLockRelease(&xlogctl->info_lck);
6066 }
6067
6068 /*
6069  * Fetch timestamp of latest processed commit/abort record.
6070  */
6071 TimestampTz
6072 GetLatestXTime(void)
6073 {
6074         /* use volatile pointer to prevent code rearrangement */
6075         volatile XLogCtlData *xlogctl = XLogCtl;
6076         TimestampTz xtime;
6077
6078         SpinLockAcquire(&xlogctl->info_lck);
6079         xtime = xlogctl->recoveryLastXTime;
6080         SpinLockRelease(&xlogctl->info_lck);
6081
6082         return xtime;
6083 }
6084
6085 /*
6086  * Save timestamp of the next chunk of WAL records to apply.
6087  *
6088  * We keep this in XLogCtl, not a simple static variable, so that it can be
6089  * seen by all backends.
6090  */
6091 static void
6092 SetCurrentChunkStartTime(TimestampTz xtime)
6093 {
6094         /* use volatile pointer to prevent code rearrangement */
6095         volatile XLogCtlData *xlogctl = XLogCtl;
6096
6097         SpinLockAcquire(&xlogctl->info_lck);
6098         xlogctl->currentChunkStartTime = xtime;
6099         SpinLockRelease(&xlogctl->info_lck);
6100 }
6101
6102 /*
6103  * Fetch timestamp of latest processed commit/abort record.
6104  * Startup process maintains an accurate local copy in XLogReceiptTime
6105  */
6106 TimestampTz
6107 GetCurrentChunkReplayStartTime(void)
6108 {
6109         /* use volatile pointer to prevent code rearrangement */
6110         volatile XLogCtlData *xlogctl = XLogCtl;
6111         TimestampTz xtime;
6112
6113         SpinLockAcquire(&xlogctl->info_lck);
6114         xtime = xlogctl->currentChunkStartTime;
6115         SpinLockRelease(&xlogctl->info_lck);
6116
6117         return xtime;
6118 }
6119
6120 /*
6121  * Returns time of receipt of current chunk of XLOG data, as well as
6122  * whether it was received from streaming replication or from archives.
6123  */
6124 void
6125 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
6126 {
6127         /*
6128          * This must be executed in the startup process, since we don't export the
6129          * relevant state to shared memory.
6130          */
6131         Assert(InRecovery);
6132
6133         *rtime = XLogReceiptTime;
6134         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
6135 }
6136
/*
 * Raise an ERROR when currValue is lower than minValue, the value that was
 * in effect on the master server.
 *
 * param_name is a parameter name and therefore needs no translation.
 * Both value arguments are evaluated more than once.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((currValue) < (minValue)) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						(param_name), \
						(currValue), \
						(minValue)))); \
} while(0)
6153
6154 /*
6155  * Check to see if required parameters are set high enough on this server
6156  * for various aspects of recovery operation.
6157  */
6158 static void
6159 CheckRequiredParameterValues(void)
6160 {
6161         /*
6162          * For archive recovery, the WAL must be generated with at least 'archive'
6163          * wal_level.
6164          */
6165         if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
6166         {
6167                 ereport(WARNING,
6168                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
6169                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
6170         }
6171
6172         /*
6173          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
6174          * we must have at least as many backend slots as the primary.
6175          */
6176         if (InArchiveRecovery && EnableHotStandby)
6177         {
6178                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
6179                         ereport(ERROR,
6180                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" or higher on the master server"),
6181                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
6182
6183                 /* We ignore autovacuum_max_workers when we make this test. */
6184                 RecoveryRequiresIntParameter("max_connections",
6185                                                                          MaxConnections,
6186                                                                          ControlFile->MaxConnections);
6187                 RecoveryRequiresIntParameter("max_worker_processes",
6188                                                                          max_worker_processes,
6189                                                                          ControlFile->max_worker_processes);
6190                 RecoveryRequiresIntParameter("max_prepared_transactions",
6191                                                                          max_prepared_xacts,
6192                                                                          ControlFile->max_prepared_xacts);
6193                 RecoveryRequiresIntParameter("max_locks_per_transaction",
6194                                                                          max_locks_per_xact,
6195                                                                          ControlFile->max_locks_per_xact);
6196         }
6197 }
6198
6199 /*
6200  * This must be called ONCE during postmaster or standalone-backend startup
6201  */
6202 void
6203 StartupXLOG(void)
6204 {
6205         XLogCtlInsert *Insert;
6206         CheckPoint      checkPoint;
6207         bool            wasShutdown;
6208         bool            reachedStopPoint = false;
6209         bool            haveBackupLabel = false;
6210         XLogRecPtr      RecPtr,
6211                                 checkPointLoc,
6212                                 EndOfLog;
6213         XLogSegNo       endLogSegNo;
6214         TimeLineID      PrevTimeLineID;
6215         XLogRecord *record;
6216         TransactionId oldestActiveXID;
6217         bool            backupEndRequired = false;
6218         bool            backupFromStandby = false;
6219         DBState         dbstate_at_startup;
6220         XLogReaderState *xlogreader;
6221         XLogPageReadPrivate private;
6222         bool            fast_promoted = false;
6223
6224         /*
6225          * Read control file and check XLOG status looks valid.
6226          *
6227          * Note: in most control paths, *ControlFile is already valid and we need
6228          * not do ReadControlFile() here, but might as well do it to be sure.
6229          */
6230         ReadControlFile();
6231
6232         if (ControlFile->state < DB_SHUTDOWNED ||
6233                 ControlFile->state > DB_IN_PRODUCTION ||
6234                 !XRecOffIsValid(ControlFile->checkPoint))
6235                 ereport(FATAL,
6236                                 (errmsg("control file contains invalid data")));
6237
6238         if (ControlFile->state == DB_SHUTDOWNED)
6239         {
6240                 /* This is the expected case, so don't be chatty in standalone mode */
6241                 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6242                                 (errmsg("database system was shut down at %s",
6243                                                 str_time(ControlFile->time))));
6244         }
6245         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6246                 ereport(LOG,
6247                                 (errmsg("database system was shut down in recovery at %s",
6248                                                 str_time(ControlFile->time))));
6249         else if (ControlFile->state == DB_SHUTDOWNING)
6250                 ereport(LOG,
6251                                 (errmsg("database system shutdown was interrupted; last known up at %s",
6252                                                 str_time(ControlFile->time))));
6253         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6254                 ereport(LOG,
6255                    (errmsg("database system was interrupted while in recovery at %s",
6256                                    str_time(ControlFile->time)),
6257                         errhint("This probably means that some data is corrupted and"
6258                                         " you will have to use the last backup for recovery.")));
6259         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6260                 ereport(LOG,
6261                                 (errmsg("database system was interrupted while in recovery at log time %s",
6262                                                 str_time(ControlFile->checkPointCopy.time)),
6263                                  errhint("If this has occurred more than once some data might be corrupted"
6264                           " and you might need to choose an earlier recovery target.")));
6265         else if (ControlFile->state == DB_IN_PRODUCTION)
6266                 ereport(LOG,
6267                           (errmsg("database system was interrupted; last known up at %s",
6268                                           str_time(ControlFile->time))));
6269
6270         /* This is just to allow attaching to startup process with a debugger */
6271 #ifdef XLOG_REPLAY_DELAY
6272         if (ControlFile->state != DB_SHUTDOWNED)
6273                 pg_usleep(60000000L);
6274 #endif
6275
6276         /*
6277          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
6278          * someone has performed a copy for PITR, these directories may have been
6279          * excluded and need to be re-created.
6280          */
6281         ValidateXLOGDirectoryStructure();
6282
6283         /*
6284          * Clear out any old relcache cache files.      This is *necessary* if we do
6285          * any WAL replay, since that would probably result in the cache files
6286          * being out of sync with database reality.  In theory we could leave them
6287          * in place if the database had been cleanly shut down, but it seems
6288          * safest to just remove them always and let them be rebuilt during the
6289          * first backend startup.
6290          */
6291         RelationCacheInitFileRemove();
6292
6293         /*
6294          * Initialize on the assumption we want to recover to the latest timeline
6295          * that's active according to pg_control.
6296          */
6297         if (ControlFile->minRecoveryPointTLI >
6298                 ControlFile->checkPointCopy.ThisTimeLineID)
6299                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6300         else
6301                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6302
6303         /*
6304          * Check for recovery control file, and if so set up state for offline
6305          * recovery
6306          */
6307         readRecoveryCommandFile();
6308
6309         /*
6310          * Save archive_cleanup_command in shared memory so that other processes
6311          * can see it.
6312          */
6313         strncpy(XLogCtl->archiveCleanupCommand,
6314                         archiveCleanupCommand ? archiveCleanupCommand : "",
6315                         sizeof(XLogCtl->archiveCleanupCommand));
6316
6317         if (ArchiveRecoveryRequested)
6318         {
6319                 if (StandbyModeRequested)
6320                         ereport(LOG,
6321                                         (errmsg("entering standby mode")));
6322                 else if (recoveryTarget == RECOVERY_TARGET_XID)
6323                         ereport(LOG,
6324                                         (errmsg("starting point-in-time recovery to XID %u",
6325                                                         recoveryTargetXid)));
6326                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6327                         ereport(LOG,
6328                                         (errmsg("starting point-in-time recovery to %s",
6329                                                         timestamptz_to_str(recoveryTargetTime))));
6330                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6331                         ereport(LOG,
6332                                         (errmsg("starting point-in-time recovery to \"%s\"",
6333                                                         recoveryTargetName)));
6334                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6335                         ereport(LOG,
6336                                         (errmsg("starting point-in-time recovery to earliest consistent point")));
6337                 else
6338                         ereport(LOG,
6339                                         (errmsg("starting archive recovery")));
6340         }
6341
6342         /*
6343          * Take ownership of the wakeup latch if we're going to sleep during
6344          * recovery.
6345          */
6346         if (StandbyModeRequested)
6347                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6348
6349         /* Set up XLOG reader facility */
6350         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6351         xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
6352         if (!xlogreader)
6353                 ereport(ERROR,
6354                                 (errcode(ERRCODE_OUT_OF_MEMORY),
6355                                  errmsg("out of memory"),
6356                         errdetail("Failed while allocating an XLog reading processor.")));
6357         xlogreader->system_identifier = ControlFile->system_identifier;
6358
6359         if (read_backup_label(&checkPointLoc, &backupEndRequired,
6360                                                   &backupFromStandby))
6361         {
6362                 /*
6363                  * Archive recovery was requested, and thanks to the backup label
6364                  * file, we know how far we need to replay to reach consistency. Enter
6365                  * archive recovery directly.
6366                  */
6367                 InArchiveRecovery = true;
6368                 if (StandbyModeRequested)
6369                         StandbyMode = true;
6370
6371                 /*
6372                  * When a backup_label file is present, we want to roll forward from
6373                  * the checkpoint it identifies, rather than using pg_control.
6374                  */
6375                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6376                 if (record != NULL)
6377                 {
6378                         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6379                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6380                         ereport(DEBUG1,
6381                                         (errmsg("checkpoint record is at %X/%X",
6382                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6383                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
6384
6385                         /*
6386                          * Make sure that REDO location exists. This may not be the case
6387                          * if there was a crash during an online backup, which left a
6388                          * backup_label around that references a WAL segment that's
6389                          * already been archived.
6390                          */
6391                         if (checkPoint.redo < checkPointLoc)
6392                         {
6393                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6394                                         ereport(FATAL,
6395                                                         (errmsg("could not find redo location referenced by checkpoint record"),
6396                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6397                         }
6398                 }
6399                 else
6400                 {
6401                         ereport(FATAL,
6402                                         (errmsg("could not locate required checkpoint record"),
6403                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6404                         wasShutdown = false;    /* keep compiler quiet */
6405                 }
6406                 /* set flag to delete it later */
6407                 haveBackupLabel = true;
6408         }
6409         else
6410         {
6411                 /*
6412                  * It's possible that archive recovery was requested, but we don't
6413                  * know how far we need to replay the WAL before we reach consistency.
6414                  * This can happen for example if a base backup is taken from a
6415                  * running server using an atomic filesystem snapshot, without calling
6416                  * pg_start/stop_backup. Or if you just kill a running master server
6417                  * and put it into archive recovery by creating a recovery.conf file.
6418                  *
6419                  * Our strategy in that case is to perform crash recovery first,
6420                  * replaying all the WAL present in pg_xlog, and only enter archive
6421                  * recovery after that.
6422                  *
6423                  * But usually we already know how far we need to replay the WAL (up
6424                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
6425                  * end-of-backup record), and we can enter archive recovery directly.
6426                  */
6427                 if (ArchiveRecoveryRequested &&
6428                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6429                          ControlFile->backupEndRequired ||
6430                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6431                          ControlFile->state == DB_SHUTDOWNED))
6432                 {
6433                         InArchiveRecovery = true;
6434                         if (StandbyModeRequested)
6435                                 StandbyMode = true;
6436                 }
6437
6438                 /*
6439                  * Get the last valid checkpoint record.  If the latest one according
6440                  * to pg_control is broken, try the next-to-last one.
6441                  */
6442                 checkPointLoc = ControlFile->checkPoint;
6443                 RedoStartLSN = ControlFile->checkPointCopy.redo;
6444                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6445                 if (record != NULL)
6446                 {
6447                         ereport(DEBUG1,
6448                                         (errmsg("checkpoint record is at %X/%X",
6449                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6450                 }
6451                 else if (StandbyMode)
6452                 {
6453                         /*
6454                          * The last valid checkpoint record required for a streaming
6455                          * recovery exists in neither the standby nor the primary.
6456                          */
6457                         ereport(PANIC,
6458                                         (errmsg("could not locate a valid checkpoint record")));
6459                 }
6460                 else
6461                 {
6462                         checkPointLoc = ControlFile->prevCheckPoint;
6463                         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
6464                         if (record != NULL)
6465                         {
6466                                 ereport(LOG,
6467                                                 (errmsg("using previous checkpoint record at %X/%X",
6468                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6469                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
6470                         }
6471                         else
6472                                 ereport(PANIC,
6473                                          (errmsg("could not locate a valid checkpoint record")));
6474                 }
6475                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6476                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6477         }
6478
6479         /*
6480          * If the location of the checkpoint record is not on the expected
6481          * timeline in the history of the requested timeline, we cannot proceed:
6482          * the backup is not part of the history of the requested timeline.
6483          */
6484         Assert(expectedTLEs);           /* was initialized by reading checkpoint
6485                                                                  * record */
6486         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6487                 checkPoint.ThisTimeLineID)
6488         {
6489                 XLogRecPtr      switchpoint;
6490
6491                 /*
6492                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
6493                  * not in expectedTLEs at all.
6494                  */
6495                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6496                 ereport(FATAL,
6497                                 (errmsg("requested timeline %u is not a child of this server's history",
6498                                                 recoveryTargetTLI),
6499                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6500                                                    (uint32) (ControlFile->checkPoint >> 32),
6501                                                    (uint32) ControlFile->checkPoint,
6502                                                    ControlFile->checkPointCopy.ThisTimeLineID,
6503                                                    (uint32) (switchpoint >> 32),
6504                                                    (uint32) switchpoint)));
6505         }
6506
6507         /*
6508          * The min recovery point should be part of the requested timeline's
6509          * history, too.
6510          */
6511         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6512           tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6513                 ControlFile->minRecoveryPointTLI)
6514                 ereport(FATAL,
6515                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6516                                                 recoveryTargetTLI,
6517                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
6518                                                 (uint32) ControlFile->minRecoveryPoint,
6519                                                 ControlFile->minRecoveryPointTLI)));
6520
6521         LastRec = RecPtr = checkPointLoc;
6522
6523         ereport(DEBUG1,
6524                         (errmsg("redo record is at %X/%X; shutdown %s",
6525                                   (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6526                                         wasShutdown ? "TRUE" : "FALSE")));
6527         ereport(DEBUG1,
6528                         (errmsg("next transaction ID: %u/%u; next OID: %u",
6529                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
6530                                         checkPoint.nextOid)));
6531         ereport(DEBUG1,
6532                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
6533                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6534         ereport(DEBUG1,
6535                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
6536                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
6537         ereport(DEBUG1,
6538                         (errmsg("oldest MultiXactId: %u, in database %u",
6539                                         checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6540         if (!TransactionIdIsNormal(checkPoint.nextXid))
6541                 ereport(PANIC,
6542                                 (errmsg("invalid next transaction ID")));
6543
6544         /* initialize shared memory variables from the checkpoint record */
6545         ShmemVariableCache->nextXid = checkPoint.nextXid;
6546         ShmemVariableCache->nextOid = checkPoint.nextOid;
6547         ShmemVariableCache->oidCount = 0;
6548         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6549         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6550         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
6551         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6552         XLogCtl->ckptXid = checkPoint.nextXid;
6553
6554         /*
6555          * Initialize replication slots, before there's a chance to remove
6556          * required resources.
6557          */
6558         StartupReplicationSlots(checkPoint.redo);
6559
6560         /*
6561          * Startup MultiXact.  We need to do this early for two reasons: one
6562          * is that we might try to access multixacts when we do tuple freezing,
6563          * and the other is we need its state initialized because we attempt
6564          * truncation during restartpoints.
6565          */
6566         StartupMultiXact();
6567
6568         /*
6569          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6570          * control file. On recovery, all unlogged relations are blown away, so
6571          * the unlogged LSN counter can be reset too.
6572          */
6573         if (ControlFile->state == DB_SHUTDOWNED)
6574                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6575         else
6576                 XLogCtl->unloggedLSN = 1;
6577
6578         /*
6579          * We must replay WAL entries using the same TimeLineID they were created
6580          * under, so temporarily adopt the TLI indicated by the checkpoint (see
6581          * also xlog_redo()).
6582          */
6583         ThisTimeLineID = checkPoint.ThisTimeLineID;
6584
6585         /*
6586          * Copy any missing timeline history files between 'now' and the recovery
6587          * target timeline from archive to pg_xlog. While we don't need those
6588          * files ourselves - the history file of the recovery target timeline
6589          * covers all the previous timelines in the history too - a cascading
6590          * standby server might be interested in them. Or, if you archive the WAL
6591          * from this server to a different archive than the master, it'd be good
6592          * for all the history files to get archived there after failover, so that
6593          * you can use one of the old timelines as a PITR target. Timeline history
6594          * files are small, so it's better to copy them unnecessarily than not
6595          * copy them and regret later.
6596          */
6597         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6598
6599         lastFullPageWrites = checkPoint.fullPageWrites;
6600
6601         RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6602
6603         if (RecPtr < checkPoint.redo)
6604                 ereport(PANIC,
6605                                 (errmsg("invalid redo in checkpoint record")));
6606
6607         /*
6608          * Check whether we need to force recovery from WAL.  If it appears to
6609          * have been a clean shutdown and we did not have a recovery.conf file,
6610          * then assume no recovery needed.
6611          */
6612         if (checkPoint.redo < RecPtr)
6613         {
6614                 if (wasShutdown)
6615                         ereport(PANIC,
6616                                         (errmsg("invalid redo record in shutdown checkpoint")));
6617                 InRecovery = true;
6618         }
6619         else if (ControlFile->state != DB_SHUTDOWNED)
6620                 InRecovery = true;
6621         else if (ArchiveRecoveryRequested)
6622         {
6623                 /* force recovery due to presence of recovery.conf */
6624                 InRecovery = true;
6625         }
6626
6627         /* REDO */
6628         if (InRecovery)
6629         {
6630                 int                     rmid;
6631
6632                 /* use volatile pointer to prevent code rearrangement */
6633                 volatile XLogCtlData *xlogctl = XLogCtl;
6634
6635                 /*
6636                  * Update pg_control to show that we are recovering and to show the
6637                  * selected checkpoint as the place we are starting from. We also mark
6638                  * pg_control with any minimum recovery stop point obtained from a
6639                  * backup history file.
6640                  */
6641                 dbstate_at_startup = ControlFile->state;
6642                 if (InArchiveRecovery)
6643                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6644                 else
6645                 {
6646                         ereport(LOG,
6647                                         (errmsg("database system was not properly shut down; "
6648                                                         "automatic recovery in progress")));
6649                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6650                                 ereport(LOG,
6651                                                 (errmsg("crash recovery starts in timeline %u "
6652                                                                 "and has target timeline %u",
6653                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
6654                                                                 recoveryTargetTLI)));
6655                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6656                 }
6657                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6658                 ControlFile->checkPoint = checkPointLoc;
6659                 ControlFile->checkPointCopy = checkPoint;
6660                 if (InArchiveRecovery)
6661                 {
6662                         /* initialize minRecoveryPoint if not set yet */
6663                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
6664                         {
6665                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6666                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6667                         }
6668                 }
6669
6670                 /*
6671                  * Set backupStartPoint if we're starting recovery from a base backup.
6672                  *
6673                  * Set backupEndPoint and use minRecoveryPoint as the backup end
6674                  * location if we're starting recovery from a base backup which was
6675                  * taken from the standby. In this case, the database system status in
6676                  * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If it does not,
6677                  * the backup is corrupted, so we cancel recovery.
6678                  */
6679                 if (haveBackupLabel)
6680                 {
6681                         ControlFile->backupStartPoint = checkPoint.redo;
6682                         ControlFile->backupEndRequired = backupEndRequired;
6683
6684                         if (backupFromStandby)
6685                         {
6686                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
6687                                         ereport(FATAL,
6688                                                         (errmsg("backup_label contains data inconsistent with control file"),
6689                                                          errhint("This means that the backup is corrupted and you will "
6690                                                            "have to use another backup for recovery.")));
6691                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6692                         }
6693                 }
6694                 ControlFile->time = (pg_time_t) time(NULL);
6695                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6696                 UpdateControlFile();
6697
6698                 /* initialize our local copy of minRecoveryPoint */
6699                 minRecoveryPoint = ControlFile->minRecoveryPoint;
6700                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6701
6702                 /*
6703                  * Reset pgstat data, because it may be invalid after recovery.
6704                  */
6705                 pgstat_reset_all();
6706
6707                 /*
6708                  * If there was a backup label file, it's done its job and the info
6709                  * has now been propagated into pg_control.  We must get rid of the
6710                  * label file so that if we crash during recovery, we'll pick up at
6711                  * the latest recovery restartpoint instead of going all the way back
6712                  * to the backup start point.  It seems prudent though to just rename
6713                  * the file out of the way rather than delete it completely.
6714                  */
6715                 if (haveBackupLabel)
6716                 {
6717                         unlink(BACKUP_LABEL_OLD);
6718                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
6719                                 ereport(FATAL,
6720                                                 (errcode_for_file_access(),
6721                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
6722                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
6723                 }
6724
6725                 /* Check that the GUCs used to generate the WAL allow recovery */
6726                 CheckRequiredParameterValues();
6727
6728                 /*
6729                  * We're in recovery, so unlogged relations may be trashed and must be
6730                  * reset.  This should be done BEFORE allowing Hot Standby
6731                  * connections, so that read-only backends don't try to read whatever
6732                  * garbage is left over from before.
6733                  */
6734                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6735
6736                 /*
6737                  * Likewise, delete any saved transaction snapshot files that got left
6738                  * behind by crashed backends.
6739                  */
6740                 DeleteAllExportedSnapshotFiles();
6741
6742                 /*
6743                  * Initialize for Hot Standby, if enabled. We won't let backends in
6744                  * yet, not until we've reached the min recovery point specified in
6745                  * control file and we've established a recovery snapshot from a
6746                  * running-xacts WAL record.
6747                  */
6748                 if (ArchiveRecoveryRequested && EnableHotStandby)
6749                 {
6750                         TransactionId *xids;
6751                         int                     nxids;
6752
6753                         ereport(DEBUG1,
6754                                         (errmsg("initializing for hot standby")));
6755
6756                         InitRecoveryTransactionEnvironment();
6757
6758                         if (wasShutdown)
6759                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6760                         else
6761                                 oldestActiveXID = checkPoint.oldestActiveXid;
6762                         Assert(TransactionIdIsValid(oldestActiveXID));
6763
6764                         /* Tell procarray about the range of xids it has to deal with */
6765                         ProcArrayInitRecovery(ShmemVariableCache->nextXid);
6766
6767                         /*
6768                          * Startup commit log and subtrans only. MultiXact has already
6769                          * been started up and other SLRUs are not maintained during
6770                          * recovery and need not be started yet.
6771                          */
6772                         StartupCLOG();
6773                         StartupSUBTRANS(oldestActiveXID);
6774
6775                         /*
6776                          * If we're beginning at a shutdown checkpoint, we know that
6777                          * nothing was running on the master at this point. So fake-up an
6778                          * empty running-xacts record and use that here and now. Recover
6779                          * additional standby state for prepared transactions.
6780                          */
6781                         if (wasShutdown)
6782                         {
6783                                 RunningTransactionsData running;
6784                                 TransactionId latestCompletedXid;
6785
6786                                 /*
6787                                  * Construct a RunningTransactions snapshot representing a
6788                                  * shut down server, with only prepared transactions still
6789                                  * alive. We're never overflowed at this point because all
6790                                  * subxids are listed with their parent prepared transactions.
6791                                  */
6792                                 running.xcnt = nxids;
6793                                 running.subxcnt = 0;
6794                                 running.subxid_overflow = false;
6795                                 running.nextXid = checkPoint.nextXid;
6796                                 running.oldestRunningXid = oldestActiveXID;
6797                                 latestCompletedXid = checkPoint.nextXid;
6798                                 TransactionIdRetreat(latestCompletedXid);
6799                                 Assert(TransactionIdIsNormal(latestCompletedXid));
6800                                 running.latestCompletedXid = latestCompletedXid;
6801                                 running.xids = xids;
6802
6803                                 ProcArrayApplyRecoveryInfo(&running);
6804
6805                                 StandbyRecoverPreparedTransactions(false);
6806                         }
6807                 }
6808
6809                 /* Initialize resource managers */
6810                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6811                 {
6812                         if (RmgrTable[rmid].rm_startup != NULL)
6813                                 RmgrTable[rmid].rm_startup();
6814                 }
6815
6816                 /*
6817                  * Initialize shared variables for tracking progress of WAL replay,
6818                  * as if we had just replayed the record before the REDO location.
6819                  */
6820                 SpinLockAcquire(&xlogctl->info_lck);
6821                 xlogctl->replayEndRecPtr = checkPoint.redo;
6822                 xlogctl->replayEndTLI = ThisTimeLineID;
6823                 xlogctl->lastReplayedEndRecPtr = checkPoint.redo;
6824                 xlogctl->lastReplayedTLI = ThisTimeLineID;
6825                 xlogctl->recoveryLastXTime = 0;
6826                 xlogctl->currentChunkStartTime = 0;
6827                 xlogctl->recoveryPause = false;
6828                 SpinLockRelease(&xlogctl->info_lck);
6829
6830                 /* Also ensure XLogReceiptTime has a sane value */
6831                 XLogReceiptTime = GetCurrentTimestamp();
6832
6833                 /*
6834                  * Let postmaster know we've started redo now, so that it can launch
6835                  * checkpointer to perform restartpoints.  We don't bother during
6836                  * crash recovery as restartpoints can only be performed during
6837                  * archive recovery.  And we'd like to keep crash recovery simple, to
6838                  * avoid introducing bugs that could affect you when recovering after
6839                  * crash.
6840                  *
6841                  * After this point, we can no longer assume that we're the only
6842                  * process in addition to postmaster!  Also, fsync requests are
6843                  * subsequently to be handled by the checkpointer, not locally.
6844                  */
6845                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
6846                 {
6847                         PublishStartupProcessInformation();
6848                         SetForwardFsyncRequests();
6849                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6850                         bgwriterLaunched = true;
6851                 }
6852
6853                 /*
6854                  * Allow read-only connections immediately if we're consistent
6855                  * already.
6856                  */
6857                 CheckRecoveryConsistency();
6858
6859                 /*
6860                  * Find the first record that logically follows the checkpoint --- it
6861                  * might physically precede it, though.
6862                  */
6863                 if (checkPoint.redo < RecPtr)
6864                 {
6865                         /* back up to find the record */
6866                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
6867                 }
6868                 else
6869                 {
6870                         /* just have to read next record after CheckPoint */
6871                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6872                 }
6873
6874                 if (record != NULL)
6875                 {
6876                         ErrorContextCallback errcallback;
6877                         TimestampTz xtime;
6878
6879                         InRedo = true;
6880
6881                         ereport(LOG,
6882                                         (errmsg("redo starts at %X/%X",
6883                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6884
6885                         /*
6886                          * main redo apply loop
6887                          */
6888                         do
6889                         {
6890                                 bool            switchedTLI = false;
6891
6892 #ifdef WAL_DEBUG
6893                                 if (XLOG_DEBUG ||
6894                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6895                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6896                                 {
6897                                         StringInfoData buf;
6898
6899                                         initStringInfo(&buf);
6900                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6901                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
6902                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
6903                                         xlog_outrec(&buf, record);
6904                                         appendStringInfoString(&buf, " - ");
6905                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
6906                                                                                                            record->xl_info,
6907                                                                                                          XLogRecGetData(record));
6908                                         elog(LOG, "%s", buf.data);
6909                                         pfree(buf.data);
6910                                 }
6911 #endif
6912
6913                                 /* Handle interrupt signals of startup process */
6914                                 HandleStartupProcInterrupts();
6915
6916                                 /*
6917                                  * Pause WAL replay, if requested by a hot-standby session via
6918                                  * SetRecoveryPause().
6919                                  *
6920                                  * Note that we intentionally don't take the info_lck spinlock
6921                                  * here.  We might therefore read a slightly stale value of
6922                                  * the recoveryPause flag, but it can't be very stale (no
6923                                  * worse than the last spinlock we did acquire).  Since a
6924                                  * pause request is a pretty asynchronous thing anyway,
6925                                  * possibly responding to it one WAL record later than we
6926                                  * otherwise would is a minor issue, so it doesn't seem worth
6927                                  * adding another spinlock cycle to prevent that.
6928                                  */
6929                                 if (xlogctl->recoveryPause)
6930                                         recoveryPausesHere();
6931
6932                                 /*
6933                                  * Have we reached our recovery target?
6934                                  */
6935                                 if (recoveryStopsBefore(record))
6936                                 {
6937                                         reachedStopPoint = true;        /* see below */
6938                                         break;
6939                                 }
6940
6941                                 /*
6942                                  * If we've been asked to lag the master, wait on
6943                                  * latch until enough time has passed.
6944                                  */
6945                                 if (recoveryApplyDelay(record))
6946                                 {
6947                                         /*
6948                                          * We test for paused recovery again here. If
6949                                          * user sets delayed apply, it may be because
6950                                          * they expect to pause recovery in case of
6951                                          * problems, so we must test again here otherwise
6952                                          * pausing during the delay-wait wouldn't work.
6953                                          */
6954                                         if (xlogctl->recoveryPause)
6955                                                 recoveryPausesHere();
6956                                 }
6957
6958                                 /* Setup error traceback support for ereport() */
6959                                 errcallback.callback = rm_redo_error_callback;
6960                                 errcallback.arg = (void *) record;
6961                                 errcallback.previous = error_context_stack;
6962                                 error_context_stack = &errcallback;
6963
6964                                 /*
6965                                  * ShmemVariableCache->nextXid must be beyond record's xid.
6966                                  *
6967                                  * We don't expect anyone else to modify nextXid, hence we
6968                                  * don't need to hold a lock while examining it.  We still
6969                                  * acquire the lock to modify it, though.
6970                                  */
6971                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
6972                                                                                                  ShmemVariableCache->nextXid))
6973                                 {
6974                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
6975                                         ShmemVariableCache->nextXid = record->xl_xid;
6976                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
6977                                         LWLockRelease(XidGenLock);
6978                                 }
6979
6980                                 /*
6981                                  * Before replaying this record, check if this record causes
6982                                  * the current timeline to change. The record is already
6983                                  * considered to be part of the new timeline, so we update
6984                                  * ThisTimeLineID before replaying it. That's important so
6985                                  * that replayEndTLI, which is recorded as the minimum
6986                                  * recovery point's TLI if recovery stops after this record,
6987                                  * is set correctly.
6988                                  */
6989                                 if (record->xl_rmid == RM_XLOG_ID)
6990                                 {
6991                                         TimeLineID      newTLI = ThisTimeLineID;
6992                                         TimeLineID      prevTLI = ThisTimeLineID;
6993                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
6994
6995                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
6996                                         {
6997                                                 CheckPoint      checkPoint;
6998
6999                                                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
7000                                                 newTLI = checkPoint.ThisTimeLineID;
7001                                                 prevTLI = checkPoint.PrevTimeLineID;
7002                                         }
7003                                         else if (info == XLOG_END_OF_RECOVERY)
7004                                         {
7005                                                 xl_end_of_recovery xlrec;
7006
7007                                                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
7008                                                 newTLI = xlrec.ThisTimeLineID;
7009                                                 prevTLI = xlrec.PrevTimeLineID;
7010                                         }
7011
7012                                         if (newTLI != ThisTimeLineID)
7013                                         {
7014                                                 /* Check that it's OK to switch to this TLI */
7015                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7016
7017                                                 /* Following WAL records should be run with new TLI */
7018                                                 ThisTimeLineID = newTLI;
7019                                                 switchedTLI = true;
7020                                         }
7021                                 }
7022
7023                                 /*
7024                                  * Update shared replayEndRecPtr before replaying this record,
7025                                  * so that XLogFlush will update minRecoveryPoint correctly.
7026                                  */
7027                                 SpinLockAcquire(&xlogctl->info_lck);
7028                                 xlogctl->replayEndRecPtr = EndRecPtr;
7029                                 xlogctl->replayEndTLI = ThisTimeLineID;
7030                                 SpinLockRelease(&xlogctl->info_lck);
7031
7032                                 /*
7033                                  * If we are attempting to enter Hot Standby mode, process
7034                                  * XIDs we see
7035                                  */
7036                                 if (standbyState >= STANDBY_INITIALIZED &&
7037                                         TransactionIdIsValid(record->xl_xid))
7038                                         RecordKnownAssignedTransactionIds(record->xl_xid);
7039
7040                                 /* Now apply the WAL record itself */
7041                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
7042
7043                                 /* Pop the error context stack */
7044                                 error_context_stack = errcallback.previous;
7045
7046                                 /*
7047                                  * Update lastReplayedEndRecPtr after this record has been
7048                                  * successfully replayed.
7049                                  */
7050                                 SpinLockAcquire(&xlogctl->info_lck);
7051                                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
7052                                 xlogctl->lastReplayedTLI = ThisTimeLineID;
7053                                 SpinLockRelease(&xlogctl->info_lck);
7054
7055                                 /* Remember this record as the last-applied one */
7056                                 LastRec = ReadRecPtr;
7057
7058                                 /* Allow read-only connections if we're consistent now */
7059                                 CheckRecoveryConsistency();
7060
7061                                 /*
7062                                  * If this record was a timeline switch, wake up any
7063                                  * walsenders to notice that we are on a new timeline.
7064                                  */
7065                                 if (switchedTLI && AllowCascadeReplication())
7066                                         WalSndWakeup();
7067
7068                                 /* Exit loop if we reached inclusive recovery target */
7069                                 if (recoveryStopsAfter(record))
7070                                 {
7071                                         reachedStopPoint = true;
7072                                         break;
7073                                 }
7074
7075                                 /* Else, try to fetch the next WAL record */
7076                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7077                         } while (record != NULL);
7078
7079                         /*
7080                          * end of main redo apply loop
7081                          */
7082
7083                         if (recoveryPauseAtTarget && reachedStopPoint)
7084                         {
7085                                 SetRecoveryPause(true);
7086                                 recoveryPausesHere();
7087                         }
7088
7089                         ereport(LOG,
7090                                         (errmsg("redo done at %X/%X",
7091                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7092                         xtime = GetLatestXTime();
7093                         if (xtime)
7094                                 ereport(LOG,
7095                                          (errmsg("last completed transaction was at log time %s",
7096                                                          timestamptz_to_str(xtime))));
7097                         InRedo = false;
7098                 }
7099                 else
7100                 {
7101                         /* there are no WAL records following the checkpoint */
7102                         ereport(LOG,
7103                                         (errmsg("redo is not required")));
7104                 }
7105         }
7106
7107         /*
7108          * Kill WAL receiver, if it's still running, before we continue to write
7109          * the startup checkpoint record. It will trump over the checkpoint and
7110          * subsequent records if it's still alive when we start writing WAL.
7111          */
7112         ShutdownWalRcv();
7113
7114         /*
7115          * We don't need the latch anymore. It's not strictly necessary to disown
7116          * it, but let's do it for the sake of tidiness.
7117          */
7118         if (StandbyModeRequested)
7119                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
7120
7121         /*
7122          * We are now done reading the xlog from stream. Turn off streaming
7123          * recovery to force fetching the files (which would be required at end of
7124          * recovery, e.g., timeline history file) from archive or pg_xlog.
7125          */
7126         StandbyMode = false;
7127
7128         /*
7129          * Re-fetch the last valid or last applied record, so we can identify the
7130          * exact endpoint of what we consider the valid portion of WAL.
7131          */
7132         record = ReadRecord(xlogreader, LastRec, PANIC, false);
7133         EndOfLog = EndRecPtr;
7134         XLByteToPrevSeg(EndOfLog, endLogSegNo);
7135
7136         /*
7137          * Complain if we did not roll forward far enough to render the backup
7138          * dump consistent.  Note: it is indeed okay to look at the local variable
7139          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7140          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7141          * advanced beyond the WAL we processed.
7142          */
7143         if (InRecovery &&
7144                 (EndOfLog < minRecoveryPoint ||
7145                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7146         {
7147                 if (reachedStopPoint)
7148                 {
7149                         /* stopped because of stop request */
7150                         ereport(FATAL,
7151                                         (errmsg("requested recovery stop point is before consistent recovery point")));
7152                 }
7153
7154                 /*
7155                  * Ran off end of WAL before reaching end-of-backup WAL record, or
7156                  * minRecoveryPoint. That's usually a bad sign, indicating that you
7157                  * tried to recover from an online backup but never called
7158                  * pg_stop_backup(), or you didn't archive all the WAL up to that
7159                  * point. However, this also happens in crash recovery, if the system
7160                  * crashes while an online backup is in progress. We must not treat
7161                  * that as an error, or the database will refuse to start up.
7162                  */
7163                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7164                 {
7165                         if (ControlFile->backupEndRequired)
7166                                 ereport(FATAL,
7167                                                 (errmsg("WAL ends before end of online backup"),
7168                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
7169                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7170                                 ereport(FATAL,
7171                                                 (errmsg("WAL ends before end of online backup"),
7172                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7173                         else
7174                                 ereport(FATAL,
7175                                           (errmsg("WAL ends before consistent recovery point")));
7176                 }
7177         }
7178
7179         /*
7180          * Consider whether we need to assign a new timeline ID.
7181          *
7182          * If we are doing an archive recovery, we always assign a new ID.      This
7183          * handles a couple of issues.  If we stopped short of the end of WAL
7184          * during recovery, then we are clearly generating a new timeline and must
7185          * assign it a unique new ID.  Even if we ran to the end, modifying the
7186          * current last segment is problematic because it may result in trying to
7187          * overwrite an already-archived copy of that segment, and we encourage
7188          * DBAs to make their archive_commands reject that.  We can dodge the
7189          * problem by making the new active segment have a new timeline ID.
7190          *
7191          * In a normal crash recovery, we can just extend the timeline we were in.
7192          */
7193         PrevTimeLineID = ThisTimeLineID;
7194         if (ArchiveRecoveryRequested)
7195         {
7196                 char            reason[200];
7197
7198                 Assert(InArchiveRecovery);
7199
7200                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7201                 ereport(LOG,
7202                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7203
7204                 /*
7205                  * Create a comment for the history file to explain why and where
7206                  * timeline changed.
7207                  */
7208                 if (recoveryTarget == RECOVERY_TARGET_XID)
7209                         snprintf(reason, sizeof(reason),
7210                                          "%s transaction %u",
7211                                          recoveryStopAfter ? "after" : "before",
7212                                          recoveryStopXid);
7213                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
7214                         snprintf(reason, sizeof(reason),
7215                                          "%s %s\n",
7216                                          recoveryStopAfter ? "after" : "before",
7217                                          timestamptz_to_str(recoveryStopTime));
7218                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
7219                         snprintf(reason, sizeof(reason),
7220                                          "at restore point \"%s\"",
7221                                          recoveryStopName);
7222                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7223                         snprintf(reason, sizeof(reason), "reached consistency");
7224                 else
7225                         snprintf(reason, sizeof(reason), "no recovery target specified");
7226
7227                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7228                                                          EndRecPtr, reason);
7229         }
7230
7231         /* Save the selected TimeLineID in shared memory, too */
7232         XLogCtl->ThisTimeLineID = ThisTimeLineID;
7233         XLogCtl->PrevTimeLineID = PrevTimeLineID;
7234
7235         /*
7236          * We are now done reading the old WAL.  Turn off archive fetching if it
7237          * was active, and make a writable copy of the last WAL segment. (Note
7238          * that we also have a copy of the last block of the old WAL in readBuf;
7239          * we will use that below.)
7240          */
7241         if (ArchiveRecoveryRequested)
7242                 exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
7243
7244         /*
7245          * Prepare to write WAL starting at EndOfLog position, and init xlog
7246          * buffer cache using the block containing the last record from the
7247          * previous incarnation.
7248          */
7249         openLogSegNo = endLogSegNo;
7250         openLogFile = XLogFileOpen(openLogSegNo);
7251         openLogOff = 0;
7252         Insert = &XLogCtl->Insert;
7253         Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7254         Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7255
7256         /*
7257          * Tricky point here: readBuf contains the *last* block that the LastRec
7258          * record spans, not the one it starts in.      The last block is indeed the
7259          * one we want to use.
7260          */
7261         if (EndOfLog % XLOG_BLCKSZ != 0)
7262         {
7263                 char       *page;
7264                 int                     len;
7265                 int                     firstIdx;
7266                 XLogRecPtr      pageBeginPtr;
7267
7268                 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7269                 Assert(readOff == pageBeginPtr % XLogSegSize);
7270
7271                 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7272
7273                 /* Copy the valid part of the last block, and zero the rest */
7274                 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7275                 len = EndOfLog % XLOG_BLCKSZ;
7276                 memcpy(page, xlogreader->readBuf, len);
7277                 memset(page + len, 0, XLOG_BLCKSZ - len);
7278
7279                 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7280                 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7281         }
7282         else
7283         {
7284                 /*
7285                  * There is no partial block to copy. Just set InitializedUpTo,
7286                  * and let the first attempt to insert a log record initialize
7287                  * the next buffer.
7288                  */
7289                 XLogCtl->InitializedUpTo = EndOfLog;
7290         }
7291
7292         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7293
7294         XLogCtl->LogwrtResult = LogwrtResult;
7295
7296         XLogCtl->LogwrtRqst.Write = EndOfLog;
7297         XLogCtl->LogwrtRqst.Flush = EndOfLog;
7298
7299         /* Pre-scan prepared transactions to find out the range of XIDs present */
7300         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7301
7302         /*
7303          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7304          * record before resource manager writes cleanup WAL records or checkpoint
7305          * record is written.
7306          */
7307         Insert->fullPageWrites = lastFullPageWrites;
7308         LocalSetXLogInsertAllowed();
7309         UpdateFullPageWrites();
7310         LocalXLogInsertAllowed = -1;
7311
7312         if (InRecovery)
7313         {
7314                 int                     rmid;
7315
7316                 /*
7317                  * Resource managers might need to write WAL records, eg, to record
7318                  * index cleanup actions.  So temporarily enable XLogInsertAllowed in
7319                  * this process only.
7320                  */
7321                 LocalSetXLogInsertAllowed();
7322
7323                 /*
7324                  * Allow resource managers to do any required cleanup.
7325                  */
7326                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7327                 {
7328                         if (RmgrTable[rmid].rm_cleanup != NULL)
7329                                 RmgrTable[rmid].rm_cleanup();
7330                 }
7331
7332                 /* Disallow XLogInsert again */
7333                 LocalXLogInsertAllowed = -1;
7334
7335                 /*
7336                  * Perform a checkpoint to update all our recovery activity to disk.
7337                  *
7338                  * Note that we write a shutdown checkpoint rather than an on-line
7339                  * one. This is not particularly critical, but since we may be
7340                  * assigning a new TLI, using a shutdown checkpoint allows us to have
7341                  * the rule that TLI only changes in shutdown checkpoints, which
7342                  * allows some extra error checking in xlog_redo.
7343                  *
7344                  * In fast promotion, only create a lightweight end-of-recovery record
7345                  * instead of a full checkpoint. A checkpoint is requested later,
7346                  * after we're fully out of recovery mode and already accepting
7347                  * queries.
7348                  */
7349                 if (bgwriterLaunched)
7350                 {
7351                         if (fast_promote)
7352                         {
7353                                 checkPointLoc = ControlFile->prevCheckPoint;
7354
7355                                 /*
7356                                  * Confirm the last checkpoint is available for us to recover
7357                                  * from if we fail. Note that we don't check for the secondary
7358                                  * checkpoint since that isn't available in most base backups.
7359                                  */
7360                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7361                                 if (record != NULL)
7362                                 {
7363                                         fast_promoted = true;
7364
7365                                         /*
7366                                          * Insert a special WAL record to mark the end of
7367                                          * recovery, since we aren't doing a checkpoint. That
7368                                          * means that the checkpointer process may likely be in
7369                                          * the middle of a time-smoothed restartpoint and could
7370                                          * continue to be for minutes after this. That sounds
7371                                          * strange, but the effect is roughly the same and it
7372                                          * would be stranger to try to come out of the
7373                                          * restartpoint and then checkpoint. We request a
7374                                          * checkpoint later anyway, just for safety.
7375                                          */
7376                                         CreateEndOfRecoveryRecord();
7377                                 }
7378                         }
7379
7380                         if (!fast_promoted)
7381                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7382                                                                   CHECKPOINT_IMMEDIATE |
7383                                                                   CHECKPOINT_WAIT);
7384                 }
7385                 else
7386                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7387
7388                 /*
7389                  * And finally, execute the recovery_end_command, if any.
7390                  */
7391                 if (recoveryEndCommand)
7392                         ExecuteRecoveryCommand(recoveryEndCommand,
7393                                                                    "recovery_end_command",
7394                                                                    true);
7395         }
7396
7397         /*
7398          * Preallocate additional log files, if wanted.
7399          */
7400         PreallocXlogFiles(EndOfLog);
7401
7402         /*
7403          * Reset initial contents of unlogged relations.  This has to be done
7404          * AFTER recovery is complete so that any unlogged relations created
7405          * during recovery also get picked up.
7406          */
7407         if (InRecovery)
7408                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7409
7410         /*
7411          * Okay, we're officially UP.
7412          */
7413         InRecovery = false;
7414
7415         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7416         ControlFile->state = DB_IN_PRODUCTION;
7417         ControlFile->time = (pg_time_t) time(NULL);
7418         UpdateControlFile();
7419         LWLockRelease(ControlFileLock);
7420
7421         /* start the archive_timeout timer running */
7422         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7423
7424         /* also initialize latestCompletedXid, to nextXid - 1 */
7425         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7426         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7427         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7428         LWLockRelease(ProcArrayLock);
7429
7430         /*
7431          * Start up the commit log and subtrans, if not already done for hot
7432          * standby.
7433          */
7434         if (standbyState == STANDBY_DISABLED)
7435         {
7436                 StartupCLOG();
7437                 StartupSUBTRANS(oldestActiveXID);
7438         }
7439
7440         /*
7441          * Perform end of recovery actions for any SLRUs that need it.
7442          */
7443         TrimCLOG();
7444         TrimMultiXact();
7445
7446         /* Reload shared-memory state for prepared transactions */
7447         RecoverPreparedTransactions();
7448
7449         /*
7450          * Shutdown the recovery environment. This must occur after
7451          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
7452          */
7453         if (standbyState != STANDBY_DISABLED)
7454                 ShutdownRecoveryTransactionEnvironment();
7455
7456         /* Shut down xlogreader */
7457         if (readFile >= 0)
7458         {
7459                 close(readFile);
7460                 readFile = -1;
7461         }
7462         XLogReaderFree(xlogreader);
7463
7464         /*
7465          * If any of the critical GUCs have changed, log them before we allow
7466          * backends to write WAL.
7467          */
7468         LocalSetXLogInsertAllowed();
7469         XLogReportParameters();
7470
7471         /*
7472          * All done.  Allow backends to write WAL.      (Although the bool flag is
7473          * probably atomic in itself, we use the info_lck here to ensure that
7474          * there are no race conditions concerning visibility of other recent
7475          * updates to shared memory.)
7476          */
7477         {
7478                 /* use volatile pointer to prevent code rearrangement */
7479                 volatile XLogCtlData *xlogctl = XLogCtl;
7480
7481                 SpinLockAcquire(&xlogctl->info_lck);
7482                 xlogctl->SharedRecoveryInProgress = false;
7483                 SpinLockRelease(&xlogctl->info_lck);
7484         }
7485
7486         /*
7487          * If there were cascading standby servers connected to us, nudge any wal
7488          * sender processes to notice that we've been promoted.
7489          */
7490         WalSndWakeup();
7491
7492         /*
7493          * If this was a fast promotion, request an (online) checkpoint now. This
7494          * isn't required for consistency, but the last restartpoint might be far
7495          * back, and in case of a crash, recovering from it might take longer
7496          * than is appropriate now that we're not in standby mode anymore.
7497          */
7498         if (fast_promoted)
7499                 RequestCheckpoint(CHECKPOINT_FORCE);
7500 }
7501
7502 /*
7503  * Checks if recovery has reached a consistent state. When consistency is
7504  * reached and we have a valid starting standby snapshot, tell postmaster
7505  * that it can start accepting read-only connections.
7506  */
7507 static void
7508 CheckRecoveryConsistency(void)
7509 {
7510         XLogRecPtr lastReplayedEndRecPtr;
7511
7512         /*
7513          * During crash recovery, we don't reach a consistent state until we've
7514          * replayed all the WAL.
7515          */
7516         if (XLogRecPtrIsInvalid(minRecoveryPoint))
7517                 return;
7518
7519         /*
7520          * assume that we are called in the startup process, and hence don't need
7521          * a lock to read lastReplayedEndRecPtr
7522          */
7523         lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
7524
7525         /*
7526          * Have we reached the point where our base backup was completed?
7527          */
7528         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7529                 ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
7530         {
7531                 /*
7532                  * We have reached the end of base backup, as indicated by pg_control.
7533                  * The data on disk is now consistent. Reset backupStartPoint and
7534                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
7535                  * allow starting up at an earlier point even if recovery is stopped
7536                  * and restarted soon after this.
7537                  */
7538                 elog(DEBUG1, "end of backup reached");
7539
7540                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7541
7542                 if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
7543                         ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
7544
7545                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
7546                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
7547                 ControlFile->backupEndRequired = false;
7548                 UpdateControlFile();
7549
7550                 LWLockRelease(ControlFileLock);
7551         }
7552
7553         /*
7554          * Have we passed our safe starting point? Note that minRecoveryPoint is
7555          * known to be incorrectly set if ControlFile->backupEndRequired, until
7556          * the XLOG_BACKUP_RECORD arrives to advise us of the correct
7557          * minRecoveryPoint. All we know prior to that is that we're not
7558          * consistent yet.
7559          */
7560         if (!reachedConsistency && !ControlFile->backupEndRequired &&
7561                 minRecoveryPoint <= lastReplayedEndRecPtr &&
7562                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7563         {
7564                 /*
7565                  * Check to see if the XLOG sequence contained any unresolved
7566                  * references to uninitialized pages.
7567                  */
7568                 XLogCheckInvalidPages();
7569
7570                 reachedConsistency = true;
7571                 ereport(LOG,
7572                                 (errmsg("consistent recovery state reached at %X/%X",
7573                                                 (uint32) (lastReplayedEndRecPtr >> 32),
7574                                                 (uint32) lastReplayedEndRecPtr)));
7575         }
7576
7577         /*
7578          * Have we got a valid starting snapshot that will allow queries to be
7579          * run? If so, we can tell postmaster that the database is consistent now,
7580          * enabling connections.
7581          */
7582         if (standbyState == STANDBY_SNAPSHOT_READY &&
7583                 !LocalHotStandbyActive &&
7584                 reachedConsistency &&
7585                 IsUnderPostmaster)
7586         {
7587                 /* use volatile pointer to prevent code rearrangement */
7588                 volatile XLogCtlData *xlogctl = XLogCtl;
7589
7590                 SpinLockAcquire(&xlogctl->info_lck);
7591                 xlogctl->SharedHotStandbyActive = true;
7592                 SpinLockRelease(&xlogctl->info_lck);
7593
7594                 LocalHotStandbyActive = true;
7595
7596                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7597         }
7598 }
7599
7600 /*
7601  * Is the system still in recovery?
7602  *
7603  * Unlike testing InRecovery, this works in any process that's connected to
7604  * shared memory.
7605  *
7606  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7607  * variables the first time we see that recovery is finished.
7608  */
7609 bool
7610 RecoveryInProgress(void)
7611 {
7612         /*
7613          * We check shared state each time only until we leave recovery mode. We
7614          * can't re-enter recovery, so there's no need to keep checking after the
7615          * shared variable has once been seen false.
7616          */
7617         if (!LocalRecoveryInProgress)
7618                 return false;
7619         else
7620         {
7621                 /*
7622                  * use volatile pointer to make sure we make a fresh read of the
7623                  * shared variable.
7624                  */
7625                 volatile XLogCtlData *xlogctl = XLogCtl;
7626
7627                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7628
7629                 /*
7630                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
7631                  * is finished. InitPostgres() relies upon this behaviour to ensure
7632                  * that InitXLOGAccess() is called at backend startup.  (If you change
7633                  * this, see also LocalSetXLogInsertAllowed.)
7634                  */
7635                 if (!LocalRecoveryInProgress)
7636                 {
7637                         /*
7638                          * If we just exited recovery, make sure we read TimeLineID and
7639                          * RedoRecPtr after SharedRecoveryInProgress (for machines with
7640                          * weak memory ordering).
7641                          */
7642                         pg_memory_barrier();
7643                         InitXLOGAccess();
7644                 }
7645                 /*
7646                  * Note: We don't need a memory barrier when we're still in recovery.
7647                  * We might exit recovery immediately after return, so the caller
7648                  * can't rely on 'true' meaning that we're still in recovery anyway.
7649                  */
7650
7651                 return LocalRecoveryInProgress;
7652         }
7653 }
7654
7655 /*
7656  * Is HotStandby active yet? This is only important in special backends
7657  * since normal backends won't ever be able to connect until this returns
7658  * true. Postmaster knows this by way of signal, not via shared memory.
7659  *
7660  * Unlike testing standbyState, this works in any process that's connected to
7661  * shared memory.  (And note that standbyState alone doesn't tell the truth
7662  * anyway.)
7663  */
7664 bool
7665 HotStandbyActive(void)
7666 {
7667         /*
7668          * We check shared state each time only until Hot Standby is active. We
7669          * can't de-activate Hot Standby, so there's no need to keep checking
7670          * after the shared variable has once been seen true.
7671          */
7672         if (LocalHotStandbyActive)
7673                 return true;
7674         else
7675         {
7676                 /* use volatile pointer to prevent code rearrangement */
7677                 volatile XLogCtlData *xlogctl = XLogCtl;
7678
7679                 /* spinlock is essential on machines with weak memory ordering! */
7680                 SpinLockAcquire(&xlogctl->info_lck);
7681                 LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
7682                 SpinLockRelease(&xlogctl->info_lck);
7683
7684                 return LocalHotStandbyActive;
7685         }
7686 }
7687
7688 /*
7689  * Like HotStandbyActive(), but to be used only in WAL replay code,
7690  * where we don't need to ask any other process what the state is.
7691  */
7692 bool
7693 HotStandbyActiveInReplay(void)
7694 {
7695         Assert(AmStartupProcess());
7696         return LocalHotStandbyActive;
7697 }
7698
7699 /*
7700  * Is this process allowed to insert new WAL records?
7701  *
7702  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7703  * But we also have provisions for forcing the result "true" or "false"
7704  * within specific processes regardless of the global state.
7705  */
7706 bool
7707 XLogInsertAllowed(void)
7708 {
7709         /*
7710          * If value is "unconditionally true" or "unconditionally false", just
7711          * return it.  This provides the normal fast path once recovery is known
7712          * done.
7713          */
7714         if (LocalXLogInsertAllowed >= 0)
7715                 return (bool) LocalXLogInsertAllowed;
7716
7717         /*
7718          * Else, must check to see if we're still in recovery.
7719          */
7720         if (RecoveryInProgress())
7721                 return false;
7722
7723         /*
7724          * On exit from recovery, reset to "unconditionally true", since there is
7725          * no need to keep checking.
7726          */
7727         LocalXLogInsertAllowed = 1;
7728         return true;
7729 }
7730
7731 /*
7732  * Make XLogInsertAllowed() return true in the current process only.
7733  *
7734  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
7735  * and even call LocalSetXLogInsertAllowed() again after that.
7736  */
7737 static void
7738 LocalSetXLogInsertAllowed(void)
7739 {
7740         Assert(LocalXLogInsertAllowed == -1);
7741         LocalXLogInsertAllowed = 1;
7742
7743         /* Initialize as RecoveryInProgress() would do when switching state */
7744         InitXLOGAccess();
7745 }
7746
7747 /*
7748  * Subroutine to try to fetch and validate a prior checkpoint record.
7749  *
7750  * whichChkpt identifies the checkpoint (merely for reporting purposes).
7751  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7752  */
7753 static XLogRecord *
7754 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
7755                                          int whichChkpt, bool report)
7756 {
7757         XLogRecord *record;
7758
7759         if (!XRecOffIsValid(RecPtr))
7760         {
7761                 if (!report)
7762                         return NULL;
7763
7764                 switch (whichChkpt)
7765                 {
7766                         case 1:
7767                                 ereport(LOG,
7768                                 (errmsg("invalid primary checkpoint link in control file")));
7769                                 break;
7770                         case 2:
7771                                 ereport(LOG,
7772                                                 (errmsg("invalid secondary checkpoint link in control file")));
7773                                 break;
7774                         default:
7775                                 ereport(LOG,
7776                                    (errmsg("invalid checkpoint link in backup_label file")));
7777                                 break;
7778                 }
7779                 return NULL;
7780         }
7781
7782         record = ReadRecord(xlogreader, RecPtr, LOG, true);
7783
7784         if (record == NULL)
7785         {
7786                 if (!report)
7787                         return NULL;
7788
7789                 switch (whichChkpt)
7790                 {
7791                         case 1:
7792                                 ereport(LOG,
7793                                                 (errmsg("invalid primary checkpoint record")));
7794                                 break;
7795                         case 2:
7796                                 ereport(LOG,
7797                                                 (errmsg("invalid secondary checkpoint record")));
7798                                 break;
7799                         default:
7800                                 ereport(LOG,
7801                                                 (errmsg("invalid checkpoint record")));
7802                                 break;
7803                 }
7804                 return NULL;
7805         }
7806         if (record->xl_rmid != RM_XLOG_ID)
7807         {
7808                 switch (whichChkpt)
7809                 {
7810                         case 1:
7811                                 ereport(LOG,
7812                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
7813                                 break;
7814                         case 2:
7815                                 ereport(LOG,
7816                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
7817                                 break;
7818                         default:
7819                                 ereport(LOG,
7820                                 (errmsg("invalid resource manager ID in checkpoint record")));
7821                                 break;
7822                 }
7823                 return NULL;
7824         }
7825         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
7826                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
7827         {
7828                 switch (whichChkpt)
7829                 {
7830                         case 1:
7831                                 ereport(LOG,
7832                                    (errmsg("invalid xl_info in primary checkpoint record")));
7833                                 break;
7834                         case 2:
7835                                 ereport(LOG,
7836                                  (errmsg("invalid xl_info in secondary checkpoint record")));
7837                                 break;
7838                         default:
7839                                 ereport(LOG,
7840                                                 (errmsg("invalid xl_info in checkpoint record")));
7841                                 break;
7842                 }
7843                 return NULL;
7844         }
7845         if (record->xl_len != sizeof(CheckPoint) ||
7846                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
7847         {
7848                 switch (whichChkpt)
7849                 {
7850                         case 1:
7851                                 ereport(LOG,
7852                                         (errmsg("invalid length of primary checkpoint record")));
7853                                 break;
7854                         case 2:
7855                                 ereport(LOG,
7856                                   (errmsg("invalid length of secondary checkpoint record")));
7857                                 break;
7858                         default:
7859                                 ereport(LOG,
7860                                                 (errmsg("invalid length of checkpoint record")));
7861                                 break;
7862                 }
7863                 return NULL;
7864         }
7865         return record;
7866 }
7867
7868 /*
7869  * This must be called during startup of a backend process, except that
7870  * it need not be called in a standalone backend (which does StartupXLOG
7871  * instead).  We need to initialize the local copies of ThisTimeLineID and
7872  * RedoRecPtr.
7873  *
7874  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
7875  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
7876  * unnecessary however, since the postmaster itself never touches XLOG anyway.
7877  */
7878 void
7879 InitXLOGAccess(void)
7880 {
7881         /* ThisTimeLineID doesn't change so we need no lock to copy it */
7882         ThisTimeLineID = XLogCtl->ThisTimeLineID;
7883         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
7884
7885         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
7886         (void) GetRedoRecPtr();
7887 }
7888
7889 /*
7890  * Return the current Redo pointer from shared memory.
7891  *
7892  * As a side-effect, the local RedoRecPtr copy is updated.
7893  */
7894 XLogRecPtr
7895 GetRedoRecPtr(void)
7896 {
7897         /* use volatile pointer to prevent code rearrangement */
7898         volatile XLogCtlData *xlogctl = XLogCtl;
7899         XLogRecPtr ptr;
7900
7901         /*
7902          * The possibly not up-to-date copy in XlogCtl is enough. Even if we
7903          * grabbed a WAL insertion slot to read the master copy, someone might
7904          * update it just after we've released the lock.
7905          */
7906         SpinLockAcquire(&xlogctl->info_lck);
7907         ptr = xlogctl->RedoRecPtr;
7908         SpinLockRelease(&xlogctl->info_lck);
7909
7910         if (RedoRecPtr < ptr)
7911                 RedoRecPtr = ptr;
7912
7913         return RedoRecPtr;
7914 }
7915
7916 /*
7917  * GetInsertRecPtr -- Returns the current insert position.
7918  *
7919  * NOTE: The value *actually* returned is the position of the last full
7920  * xlog page. It lags behind the real insert position by at most 1 page.
7921  * For that, we don't need to scan through WAL insertion slots, and an
7922  * approximation is enough for the current usage of this function.
7923  */
7924 XLogRecPtr
7925 GetInsertRecPtr(void)
7926 {
7927         /* use volatile pointer to prevent code rearrangement */
7928         volatile XLogCtlData *xlogctl = XLogCtl;
7929         XLogRecPtr      recptr;
7930
7931         SpinLockAcquire(&xlogctl->info_lck);
7932         recptr = xlogctl->LogwrtRqst.Write;
7933         SpinLockRelease(&xlogctl->info_lck);
7934
7935         return recptr;
7936 }
7937
7938 /*
7939  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
7940  * position known to be fsync'd to disk.
7941  */
7942 XLogRecPtr
7943 GetFlushRecPtr(void)
7944 {
7945         /* use volatile pointer to prevent code rearrangement */
7946         volatile XLogCtlData *xlogctl = XLogCtl;
7947         XLogRecPtr      recptr;
7948
7949         SpinLockAcquire(&xlogctl->info_lck);
7950         recptr = xlogctl->LogwrtResult.Flush;
7951         SpinLockRelease(&xlogctl->info_lck);
7952
7953         return recptr;
7954 }
7955
7956 /*
7957  * Get the time of the last xlog segment switch
7958  */
7959 pg_time_t
7960 GetLastSegSwitchTime(void)
7961 {
7962         pg_time_t       result;
7963
7964         /* Need WALWriteLock, but shared lock is sufficient */
7965         LWLockAcquire(WALWriteLock, LW_SHARED);
7966         result = XLogCtl->lastSegSwitchTime;
7967         LWLockRelease(WALWriteLock);
7968
7969         return result;
7970 }
7971
7972 /*
7973  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
7974  *
7975  * This is exported for use by code that would like to have 64-bit XIDs.
7976  * We don't really support such things, but all XIDs within the system
7977  * can be presumed "close to" the result, and thus the epoch associated
7978  * with them can be determined.
7979  */
7980 void
7981 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
7982 {
7983         uint32          ckptXidEpoch;
7984         TransactionId ckptXid;
7985         TransactionId nextXid;
7986
7987         /* Must read checkpoint info first, else have race condition */
7988         {
7989                 /* use volatile pointer to prevent code rearrangement */
7990                 volatile XLogCtlData *xlogctl = XLogCtl;
7991
7992                 SpinLockAcquire(&xlogctl->info_lck);
7993                 ckptXidEpoch = xlogctl->ckptXidEpoch;
7994                 ckptXid = xlogctl->ckptXid;
7995                 SpinLockRelease(&xlogctl->info_lck);
7996         }
7997
7998         /* Now fetch current nextXid */
7999         nextXid = ReadNewTransactionId();
8000
8001         /*
8002          * nextXid is certainly logically later than ckptXid.  So if it's
8003          * numerically less, it must have wrapped into the next epoch.
8004          */
8005         if (nextXid < ckptXid)
8006                 ckptXidEpoch++;
8007
8008         *xid = nextXid;
8009         *epoch = ckptXidEpoch;
8010 }
8011
8012 /*
8013  * This must be called ONCE during postmaster or standalone-backend shutdown
8014  */
8015 void
8016 ShutdownXLOG(int code, Datum arg)
8017 {
8018         /* Don't be chatty in standalone mode */
8019         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
8020                         (errmsg("shutting down")));
8021
8022         if (RecoveryInProgress())
8023                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8024         else
8025         {
8026                 /*
8027                  * If archiving is enabled, rotate the last XLOG file so that all the
8028                  * remaining records are archived (postmaster wakes up the archiver
8029                  * process one more time at the end of shutdown). The checkpoint
8030                  * record will go to the next XLOG file and won't be archived (yet).
8031                  */
8032                 if (XLogArchivingActive() && XLogArchiveCommandSet())
8033                         RequestXLogSwitch();
8034
8035                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8036         }
8037         ShutdownCLOG();
8038         ShutdownSUBTRANS();
8039         ShutdownMultiXact();
8040
8041         /* Don't be chatty in standalone mode */
8042         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
8043                         (errmsg("database system is shut down")));
8044 }
8045
8046 /*
8047  * Log start of a checkpoint.
8048  */
8049 static void
8050 LogCheckpointStart(int flags, bool restartpoint)
8051 {
8052         const char *msg;
8053
8054         /*
8055          * XXX: This is hopelessly untranslatable. We could call gettext_noop for
8056          * the main message, but what about all the flags?
8057          */
8058         if (restartpoint)
8059                 msg = "restartpoint starting:%s%s%s%s%s%s%s";
8060         else
8061                 msg = "checkpoint starting:%s%s%s%s%s%s%s";
8062
8063         elog(LOG, msg,
8064                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
8065                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
8066                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
8067                  (flags & CHECKPOINT_FORCE) ? " force" : "",
8068                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
8069                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
8070                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
8071 }
8072
8073 /*
8074  * Log end of a checkpoint.
8075  */
8076 static void
8077 LogCheckpointEnd(bool restartpoint)
8078 {
8079         long            write_secs,
8080                                 sync_secs,
8081                                 total_secs,
8082                                 longest_secs,
8083                                 average_secs;
8084         int                     write_usecs,
8085                                 sync_usecs,
8086                                 total_usecs,
8087                                 longest_usecs,
8088                                 average_usecs;
8089         uint64          average_sync_time;
8090
8091         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
8092
8093         TimestampDifference(CheckpointStats.ckpt_write_t,
8094                                                 CheckpointStats.ckpt_sync_t,
8095                                                 &write_secs, &write_usecs);
8096
8097         TimestampDifference(CheckpointStats.ckpt_sync_t,
8098                                                 CheckpointStats.ckpt_sync_end_t,
8099                                                 &sync_secs, &sync_usecs);
8100
8101         /* Accumulate checkpoint timing summary data, in milliseconds. */
8102         BgWriterStats.m_checkpoint_write_time +=
8103                 write_secs * 1000 + write_usecs / 1000;
8104         BgWriterStats.m_checkpoint_sync_time +=
8105                 sync_secs * 1000 + sync_usecs / 1000;
8106
8107         /*
8108          * All of the published timing statistics are accounted for.  Only
8109          * continue if a log message is to be written.
8110          */
8111         if (!log_checkpoints)
8112                 return;
8113
8114         TimestampDifference(CheckpointStats.ckpt_start_t,
8115                                                 CheckpointStats.ckpt_end_t,
8116                                                 &total_secs, &total_usecs);
8117
8118         /*
8119          * Timing values returned from CheckpointStats are in microseconds.
8120          * Convert to the second plus microsecond form that TimestampDifference
8121          * returns for homogeneous printing.
8122          */
8123         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
8124         longest_usecs = CheckpointStats.ckpt_longest_sync -
8125                 (uint64) longest_secs *1000000;
8126
8127         average_sync_time = 0;
8128         if (CheckpointStats.ckpt_sync_rels > 0)
8129                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
8130                         CheckpointStats.ckpt_sync_rels;
8131         average_secs = (long) (average_sync_time / 1000000);
8132         average_usecs = average_sync_time - (uint64) average_secs *1000000;
8133
8134         if (restartpoint)
8135                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
8136                          "%d transaction log file(s) added, %d removed, %d recycled; "
8137                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8138                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
8139                          CheckpointStats.ckpt_bufs_written,
8140                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8141                          CheckpointStats.ckpt_segs_added,
8142                          CheckpointStats.ckpt_segs_removed,
8143                          CheckpointStats.ckpt_segs_recycled,
8144                          write_secs, write_usecs / 1000,
8145                          sync_secs, sync_usecs / 1000,
8146                          total_secs, total_usecs / 1000,
8147                          CheckpointStats.ckpt_sync_rels,
8148                          longest_secs, longest_usecs / 1000,
8149                          average_secs, average_usecs / 1000);
8150         else
8151                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
8152                          "%d transaction log file(s) added, %d removed, %d recycled; "
8153                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8154                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
8155                          CheckpointStats.ckpt_bufs_written,
8156                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8157                          CheckpointStats.ckpt_segs_added,
8158                          CheckpointStats.ckpt_segs_removed,
8159                          CheckpointStats.ckpt_segs_recycled,
8160                          write_secs, write_usecs / 1000,
8161                          sync_secs, sync_usecs / 1000,
8162                          total_secs, total_usecs / 1000,
8163                          CheckpointStats.ckpt_sync_rels,
8164                          longest_secs, longest_usecs / 1000,
8165                          average_secs, average_usecs / 1000);
8166 }
8167
8168 /*
8169  * Perform a checkpoint --- either during shutdown, or on-the-fly
8170  *
8171  * flags is a bitwise OR of the following:
8172  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8173  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8174  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8175  *              ignoring checkpoint_completion_target parameter.
8176  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8177  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8178  *              CHECKPOINT_END_OF_RECOVERY).
8179  *
8180  * Note: flags contains other bits, of interest here only for logging purposes.
8181  * In particular note that this routine is synchronous and does not pay
8182  * attention to CHECKPOINT_WAIT.
8183  *
8184  * If !shutdown then we are writing an online checkpoint. This is a very special
8185  * kind of operation and WAL record because the checkpoint action occurs over
8186  * a period of time yet logically occurs at just a single LSN. The logical
8187  * position of the WAL record (redo ptr) is the same or earlier than the
8188  * physical position. When we replay WAL we locate the checkpoint via its
8189  * physical position then read the redo ptr and actually start replay at the
8190  * earlier logical position. Note that we don't write *anything* to WAL at
8191  * the logical position, so that location could be any other kind of WAL record.
8192  * All of this mechanism allows us to continue working while we checkpoint.
8193  * As a result, timing of actions is critical here; bear in mind that this
8194  * function will likely take minutes to execute on a busy system.
8195  */
8196 void
8197 CreateCheckPoint(int flags)
8198 {
8199         /* use volatile pointer to prevent code rearrangement */
8200         volatile XLogCtlData *xlogctl = XLogCtl;
8201         bool            shutdown;       /* shutdown or end-of-recovery checkpoint? */
8202         CheckPoint      checkPoint;     /* payload of the checkpoint WAL record */
8203         XLogRecPtr      recptr;         /* end of the inserted checkpoint record */
8204         XLogCtlInsert *Insert = &XLogCtl->Insert;
8205         XLogRecData rdata;
8206         uint32          freespace;
8207         XLogSegNo       _logSegNo;      /* starts as segment of prior checkpoint's redo */
8208         XLogRecPtr      curInsert;      /* insert position; becomes the REDO pointer */
8209         VirtualTransactionId *vxids;    /* xacts with delayChkpt set that we wait for */
8210         int                     nvxids;
8211
8212         /*
8213          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
8214          * issued at a different time.
8215          */
8216         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
8217                 shutdown = true;
8218         else
8219                 shutdown = false;
8220
8221         /* sanity check */
8222         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
8223                 elog(ERROR, "can't create a checkpoint during recovery");
8224
8225         /*
8226          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
8227          * (This is just pro forma, since in the present system structure there is
8228          * only one process that is allowed to issue checkpoints at any given
8229          * time.)
8230          */
8231         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8232
8233         /*
8234          * Prepare to accumulate statistics.
8235          *
8236          * Note: because it is possible for log_checkpoints to change while a
8237          * checkpoint proceeds, we always accumulate stats, even if
8238          * log_checkpoints is currently off.
8239          */
8240         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8241         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8242
8243         /*
8244          * Use a critical section to force system panic if we have trouble.
8245          */
8246         START_CRIT_SECTION();
8247
8248         if (shutdown)
8249         {
8250                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8251                 ControlFile->state = DB_SHUTDOWNING;
8252                 ControlFile->time = (pg_time_t) time(NULL);
8253                 UpdateControlFile();
8254                 LWLockRelease(ControlFileLock);
8255         }
8256
8257         /*
8258          * Let smgr prepare for checkpoint; this has to happen before we determine
8259          * the REDO pointer.  Note that smgr must not do anything that'd have to
8260          * be undone if we decide no checkpoint is needed.
8261          */
8262         smgrpreckpt();
8263
8264         /* Begin filling in the checkpoint WAL record */
8265         MemSet(&checkPoint, 0, sizeof(checkPoint));
8266         checkPoint.time = (pg_time_t) time(NULL);
8267
8268         /*
8269          * For Hot Standby, derive the oldestActiveXid before we fix the redo
8270          * pointer. This allows us to begin accumulating changes to assemble our
8271          * starting snapshot of locks and transactions.
8272          */
8273         if (!shutdown && XLogStandbyInfoActive())
8274                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8275         else
8276                 checkPoint.oldestActiveXid = InvalidTransactionId;
8277
8278         /*
8279          * We must block concurrent insertions while examining insert state to
8280          * determine the checkpoint REDO pointer.
8281          */
8282         WALInsertSlotAcquire(true);
8283         curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8284
8285         /*
8286          * If this isn't a shutdown or forced checkpoint, and we have not inserted
8287          * any XLOG records since the start of the last checkpoint, skip the
8288          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
8289          * when the system is idle. That wastes log space, and more importantly it
8290          * exposes us to possible loss of both current and previous checkpoint
8291          * records if the machine crashes just as we're writing the update.
8292          * (Perhaps it'd make even more sense to checkpoint only when the previous
8293          * checkpoint record is in a different xlog page?)
8294          *
8295          * We have to make two tests to determine that nothing has happened since
8296          * the start of the last checkpoint: current insertion point must match
8297          * the end of the last checkpoint record, and its redo pointer must point
8298          * to itself.
8299          */
8300         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8301                                   CHECKPOINT_FORCE)) == 0)
8302         {
8303                 if (curInsert == ControlFile->checkPoint +
8304                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
8305                         ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
8306                 {
8307                         WALInsertSlotRelease();
8308                         LWLockRelease(CheckpointLock);
8309                         END_CRIT_SECTION();
8310                         return;         /* idle since last checkpoint: write nothing */
8311                 }
8312         }
8313
8314         /*
8315          * An end-of-recovery checkpoint is created before anyone is allowed to
8316          * write WAL. To allow us to write the checkpoint record, temporarily
8317          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
8318          * initialized, which we need here and in AdvanceXLInsertBuffer.)
8319          */
8320         if (flags & CHECKPOINT_END_OF_RECOVERY)
8321                 LocalSetXLogInsertAllowed();
8322
8323         checkPoint.ThisTimeLineID = ThisTimeLineID;
8324         if (flags & CHECKPOINT_END_OF_RECOVERY)
8325                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8326         else
8327                 checkPoint.PrevTimeLineID = ThisTimeLineID;
8328
8329         checkPoint.fullPageWrites = Insert->fullPageWrites;
8330
8331         /*
8332          * Compute new REDO record ptr = location of next XLOG record.
8333          *
8334          * NB: this is NOT necessarily where the checkpoint record itself will be,
8335          * since other backends may insert more XLOG records while we're off doing
8336          * the buffer flush work.  Those XLOG records are logically after the
8337          * checkpoint, even though physically before it.  Got that?
8338          */
8339         freespace = INSERT_FREESPACE(curInsert);
8340         if (freespace == 0)
8341         {
8342                 if (curInsert % XLogSegSize == 0)
8343                         curInsert += SizeOfXLogLongPHD;
8344                 else
8345                         curInsert += SizeOfXLogShortPHD;
8346         }
8347         checkPoint.redo = curInsert;
8348
8349         /*
8350          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8351          * must be done while holding the insertion slots.
8352          *
8353          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8354          * pointing past where it really needs to point.  This is okay; the only
8355          * consequence is that XLogInsert might back up whole buffers that it
8356          * didn't really need to.  We can't postpone advancing RedoRecPtr because
8357          * XLogInserts that happen while we are dumping buffers must assume that
8358          * their buffer changes are not included in the checkpoint.
8359          */
8360         RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
8361
8362         /*
8363          * Now we can release the WAL insertion slots, allowing other xacts to
8364          * proceed while we are flushing disk buffers.
8365          */
8366         WALInsertSlotRelease();
8367
8368         /* Update the info_lck-protected copy of RedoRecPtr as well */
8369         SpinLockAcquire(&xlogctl->info_lck);
8370         xlogctl->RedoRecPtr = checkPoint.redo;
8371         SpinLockRelease(&xlogctl->info_lck);
8372
8373         /*
8374          * If enabled, log checkpoint start.  We postpone this until now so as not
8375          * to log anything if we decided to skip the checkpoint.
8376          */
8377         if (log_checkpoints)
8378                 LogCheckpointStart(flags, false);
8379
8380         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8381
8382         /*
8383          * In some cases there are groups of actions that must all occur on one
8384          * side or the other of a checkpoint record. Before flushing the
8385          * checkpoint record we must explicitly wait for any backend currently
8386          * performing those groups of actions.
8387          *
8388          * One example is end of transaction, so we must wait for any transactions
8389          * that are currently in commit critical sections.  If an xact inserted
8390          * its commit record into XLOG just before the REDO point, then a crash
8391          * restart from the REDO point would not replay that record, which means
8392          * that our flushing had better include the xact's update of pg_clog.  So
8393          * we wait till it's out of its commit critical section before proceeding.
8394          * See notes in RecordTransactionCommit().
8395          *
8396          * Because we've already released the insertion slots, this test is a bit
8397          * fuzzy: it is possible that we will wait for xacts we didn't really need
8398          * to wait for.  But the delay should be short and it seems better to make
8399          * checkpoint take a bit longer than to hold off insertions longer than
8400          * necessary.
8401          * (In fact, the whole reason we have this issue is that xact.c does
8402          * commit record XLOG insertion and clog update as two separate steps
8403          * protected by different locks, but again that seems best on grounds of
8404          * minimizing lock contention.)
8405          *
8406          * A transaction that has not yet set delayChkpt when we look cannot be at
8407          * risk, since it has not inserted its commit record yet; and one that's
8408          * already cleared it is not at risk either, since it is done fixing clog
8409          * and we will correctly flush the update below.  So we cannot miss any
8410          * xacts we need to wait for.
8411          */
8412         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8413         if (nvxids > 0)
8414         {
8415                 do
8416                 {
8417                         pg_usleep(10000L);      /* wait for 10 msec */
8418                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8419         }
8420         pfree(vxids);
8421
8422         /*
8423          * Get the other info we need for the checkpoint record.
8424          */
8425         LWLockAcquire(XidGenLock, LW_SHARED);
8426         checkPoint.nextXid = ShmemVariableCache->nextXid;
8427         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8428         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8429         LWLockRelease(XidGenLock);
8430
8431         /* Increase XID epoch if we've wrapped around since last checkpoint */
8432         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8433         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8434                 checkPoint.nextXidEpoch++;
8435
8436         LWLockAcquire(OidGenLock, LW_SHARED);
8437         checkPoint.nextOid = ShmemVariableCache->nextOid;
8438         if (!shutdown)
8439                 checkPoint.nextOid += ShmemVariableCache->oidCount;
8440         LWLockRelease(OidGenLock);
8441
8442         MultiXactGetCheckptMulti(shutdown,
8443                                                          &checkPoint.nextMulti,
8444                                                          &checkPoint.nextMultiOffset,
8445                                                          &checkPoint.oldestMulti,
8446                                                          &checkPoint.oldestMultiDB);
8447
8448         /*
8449          * Having constructed the checkpoint record, ensure all shmem disk buffers
8450          * and commit-log buffers are flushed to disk.
8451          *
8452          * This I/O could fail for various reasons.  If so, we will fail to
8453          * complete the checkpoint, but there is no reason to force a system
8454          * panic. Accordingly, exit critical section while doing it.
8455          */
8456         END_CRIT_SECTION();
8457
8458         CheckPointGuts(checkPoint.redo, flags);
8459
8460         /*
8461          * Take a snapshot of running transactions and write this to WAL. This
8462          * allows us to reconstruct the state of running transactions during
8463          * archive recovery, if required. Skip, if this info disabled.
8464          *
8465          * If we are shutting down, or Startup process is completing crash
8466          * recovery we don't need to write running xact data.
8467          */
8468         if (!shutdown && XLogStandbyInfoActive())
8469                 LogStandbySnapshot();
8470
8471         START_CRIT_SECTION();
8472
8473         /*
8474          * Now insert the checkpoint record into XLOG.
8475          */
8476         rdata.data = (char *) (&checkPoint);
8477         rdata.len = sizeof(checkPoint);
8478         rdata.buffer = InvalidBuffer;
8479         rdata.next = NULL;
8480
8481         recptr = XLogInsert(RM_XLOG_ID,
8482                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
8483                                                 XLOG_CHECKPOINT_ONLINE,
8484                                                 &rdata);
8485
8486         XLogFlush(recptr);
8487
8488         /*
8489          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
8490          * overwritten at next startup.  No-one should even try, this just allows
8491          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
8492          * to just temporarily disable writing until the system has exited
8493          * recovery.
8494          */
8495         if (shutdown)
8496         {
8497                 if (flags & CHECKPOINT_END_OF_RECOVERY)
8498                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
8499                 else
8500                         LocalXLogInsertAllowed = 0; /* never again write WAL */
8501         }
8502
8503         /*
8504          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
8505          * = end of actual checkpoint record.
8506          */
8507         if (shutdown && checkPoint.redo != ProcLastRecPtr)
8508                 ereport(PANIC,
8509                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
8510
8511         /*
8512          * Select point at which we can truncate the log, which we base on the
8513          * prior checkpoint's earliest info.
8514          */
8515         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8516
8517         /*
8518          * Update the control file.
8519          */
8520         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8521         if (shutdown)
8522                 ControlFile->state = DB_SHUTDOWNED;
8523         ControlFile->prevCheckPoint = ControlFile->checkPoint;
8524         ControlFile->checkPoint = ProcLastRecPtr;
8525         ControlFile->checkPointCopy = checkPoint;
8526         ControlFile->time = (pg_time_t) time(NULL);
8527         /* crash recovery should always recover to the end of WAL */
8528         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
8529         ControlFile->minRecoveryPointTLI = 0;
8530
8531         /*
8532          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
8533          * unused on non-shutdown checkpoints, but seems useful to store it always
8534          * for debugging purposes.
8535          */
8536         SpinLockAcquire(&XLogCtl->ulsn_lck);
8537         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
8538         SpinLockRelease(&XLogCtl->ulsn_lck);
8539
8540         UpdateControlFile();
8541         LWLockRelease(ControlFileLock);
8542
8543         /* Update shared-memory copy of checkpoint XID/epoch */
8544         {
8545                 /* NOTE(review): shadows the identical function-scope xlogctl; redundant */
8546                 volatile XLogCtlData *xlogctl = XLogCtl;
8547
8548                 SpinLockAcquire(&xlogctl->info_lck);
8549                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
8550                 xlogctl->ckptXid = checkPoint.nextXid;
8551                 SpinLockRelease(&xlogctl->info_lck);
8552         }
8553
8554         /*
8555          * We are now done with critical updates; no need for system panic if we
8556          * have trouble while fooling with old log segments.
8557          */
8558         END_CRIT_SECTION();
8559
8560         /*
8561          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
8562          */
8563         smgrpostckpt();
8564
8565         /*
8566          * Delete old log files (those no longer needed even for previous
8567          * checkpoint or the standbys in XLOG streaming).
8568          */
8569         if (_logSegNo)
8570         {
8571                 KeepLogSeg(recptr, &_logSegNo);
8572                 _logSegNo--;    /* NOTE(review): presumably keeps that segment itself; confirm */
8573                 RemoveOldXlogFiles(_logSegNo, recptr);
8574         }
8575
8576         /*
8577          * Make more log segments if needed.  (Do this after recycling old log
8578          * segments, since that may supply some of the needed files.)
8579          */
8580         if (!shutdown)
8581                 PreallocXlogFiles(recptr);
8582
8583         /*
8584          * Truncate pg_subtrans if possible.  We can throw away all data before
8585          * the oldest XMIN of any running transaction.  No future transaction will
8586          * attempt to reference any pg_subtrans entry older than that (see Asserts
8587          * in subtrans.c).  During recovery, though, we mustn't do this because
8588          * StartupSUBTRANS hasn't been called yet.
8589          */
8590         if (!RecoveryInProgress())
8591                 TruncateSUBTRANS(GetOldestXmin(true, false));
8592
8593         /* Real work is done, but log and update stats before releasing lock. */
8594         LogCheckpointEnd(false);
8595
8596         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
8597                                                                          NBuffers,
8598                                                                          CheckpointStats.ckpt_segs_added,
8599                                                                          CheckpointStats.ckpt_segs_removed,
8600                                                                          CheckpointStats.ckpt_segs_recycled);
8601
8602         LWLockRelease(CheckpointLock);
8603 }
8604
8605 /*
8606  * Mark the end of recovery in WAL, without running a full checkpoint.
8607  * Inserts and flushes an XLOG_END_OF_RECOVERY record, then records its end
8608  * as pg_control's minRecoveryPoint.  Raises ERROR if not in recovery.
8609  *
8610  * A restartpoint is likely to be in progress as we do this, and we are
8611  * unwilling to wait for it, so avoid taking CheckpointLock anywhere here.
8612  * CreateRestartPoint() copes with recovery ending before it completes.
8613  */
8614 void
8615 CreateEndOfRecoveryRecord(void)
8616 {
8617         xl_end_of_recovery xlrec;
8618         XLogRecData rdata;
8619         XLogRecPtr      recptr;
8620
8621         /* sanity check */
8622         if (!RecoveryInProgress())
8623                 elog(ERROR, "can only be used to end recovery");
8624
8625         xlrec.end_time = GetCurrentTimestamp(); /* TimestampTz field, not a time_t */
8626
8627         WALInsertSlotAcquire(true);
8628         xlrec.ThisTimeLineID = ThisTimeLineID;
8629         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8630         WALInsertSlotRelease();
8631
8632         LocalSetXLogInsertAllowed();    /* undone at the bottom of this function */
8633
8634         START_CRIT_SECTION();
8635
8636         rdata.data = (char *) &xlrec;
8637         rdata.len = sizeof(xl_end_of_recovery);
8638         rdata.buffer = InvalidBuffer;
8639         rdata.next = NULL;
8640
8641         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
8642
8643         XLogFlush(recptr);
8644
8645         /*
8646          * Update the control file so that crash recovery can follow the timeline
8647          * changes to this point.  ControlFile->time is a pg_time_t: use time().
8648          */
8649         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8650         ControlFile->time = (pg_time_t) time(NULL);
8651         ControlFile->minRecoveryPoint = recptr;
8652         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8653         UpdateControlFile();
8654         LWLockRelease(ControlFileLock);
8655
8656         END_CRIT_SECTION();
8657
8658         LocalXLogInsertAllowed = -1;    /* return to "check" state */
8659 }
8660
8661 /*
8662  * Flush all data in shared memory to disk, and fsync
8663  *
8664  * Common code for regular checkpoints (CreateCheckPoint) and recovery
8665  * restartpoints (CreateRestartPoint).  flags is handed to CheckPointBuffers;
8666  * checkPointRedo only to CheckPointTwoPhase. */
8667 static void
8668 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
8669 {
8670         CheckPointCLOG();
8671         CheckPointSUBTRANS();
8672         CheckPointMultiXact();
8673         CheckPointPredicate();
8674         CheckPointRelationMap();
8675         CheckPointReplicationSlots();
8676         CheckPointBuffers(flags);       /* performs all required fsyncs */
8677         /* We deliberately delay 2PC checkpointing as long as possible */
8678         CheckPointTwoPhase(checkPointRedo);
8679 }
8680
8681 /*
8682  * Save a checkpoint for recovery restart if appropriate
8683  *
8684  * Invoked for every checkpoint record read from XLOG.  Decides whether that
8685  * record is a safe place to restart recovery from; when it is, the record
8686  * is copied into shared memory for CreateRestartPoint to pick up later.
8687  * (CreateRestartPoint runs in the checkpointer, whereas this function runs
8688  * in the startup process.)
8689  */
8690 static void
8691 RecoveryRestartPoint(const CheckPoint *checkPoint)
8692 {
8693         /* use volatile pointer to prevent code rearrangement */
8694         volatile XLogCtlData *shared = XLogCtl;
8695         const XLogRecPtr redo = checkPoint->redo;
8696         int                     rm;
8697
8698         /*
8699          * Poll every resource manager that provides a safety callback.  One
8700          * refusal means that rmgr holds partial state which would prevent a
8701          * correct restart from here, so skip this record; the next checkpoint
8702          * record gives us another chance.
8703          */
8704         for (rm = 0; rm <= RM_MAX_ID; rm++)
8705         {
8706                 if (RmgrTable[rm].rm_safe_restartpoint == NULL)
8707                         continue;               /* no opinion */
8708                 if (RmgrTable[rm].rm_safe_restartpoint())
8709                         continue;               /* agrees it is safe */
8710
8711                 elog(trace_recovery(DEBUG2),
8712                          "RM %d not safe to record restart point at %X/%X",
8713                          rm,
8714                          (uint32) (redo >> 32),
8715                          (uint32) redo);
8716                 return;
8717         }
8718
8719         /*
8720          * Outstanding references to non-existent pages also rule out a
8721          * restartpoint.  Recovery restarted from this point would never see
8722          * those references, so the cross-check that the pages belonged to a
8723          * relation dropped later on would be lost.
8724          */
8725         if (XLogHaveInvalidPages())
8726         {
8727                 elog(trace_recovery(DEBUG2),
8728                          "could not record restart point at %X/%X because there "
8729                          "are unresolved references to invalid pages",
8730                          (uint32) (redo >> 32),
8731                          (uint32) redo);
8732                 return;
8733         }
8734
8735         /*
8736          * Stash the record and its location in shared memory, under info_lck,
8737          * so the checkpointer can work out when to perform the next
8738          * restartpoint.
8739          */
8740         SpinLockAcquire(&shared->info_lck);
8741         shared->lastCheckPointRecPtr = ReadRecPtr;
8742         shared->lastCheckPoint = *checkPoint;
8743         SpinLockRelease(&shared->info_lck);
8744 }
8745
8746 /*
8747  * Establish a restartpoint if possible.
8748  *
8749  * This is similar to CreateCheckPoint, but is used during WAL recovery
8750  * to establish a point from which recovery can roll forward without
8751  * replaying the entire recovery log.
8752  *
8753  * Returns true if a new restartpoint was established. We can only establish
8754  * a restartpoint if we have replayed a safe checkpoint record since last
8755  * restartpoint.
8756  */
8757 bool
8758 CreateRestartPoint(int flags)
8759 {
8760         XLogRecPtr      lastCheckPointRecPtr;
8761         CheckPoint      lastCheckPoint;
8762         XLogSegNo       _logSegNo;
8763         TimestampTz xtime;
8764
8765         /* use volatile pointer to prevent code rearrangement */
8766         volatile XLogCtlData *xlogctl = XLogCtl;
8767
8768         /*
8769          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
8770          * happens at a time.
8771          */
8772         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8773
8774         /* Get a local copy of the last safe checkpoint record. */
8775         SpinLockAcquire(&xlogctl->info_lck);
8776         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
8777         lastCheckPoint = xlogctl->lastCheckPoint;
8778         SpinLockRelease(&xlogctl->info_lck);
8779
8780         /*
8781          * Check that we're still in recovery mode. It's ok if we exit recovery
8782          * mode after this check, the restart point is valid anyway.
8783          */
8784         if (!RecoveryInProgress())
8785         {
8786                 ereport(DEBUG2,
8787                           (errmsg("skipping restartpoint, recovery has already ended")));
8788                 LWLockRelease(CheckpointLock);
8789                 return false;
8790         }
8791
8792         /*
8793          * If the last checkpoint record we've replayed is already our last
8794          * restartpoint, we can't perform a new restart point. We still update
8795          * minRecoveryPoint in that case, so that if this is a shutdown restart
8796          * point, we won't start up earlier than before. That's not strictly
8797          * necessary, but when hot standby is enabled, it would be rather weird if
8798          * the database opened up for read-only connections at a point-in-time
8799          * before the last shutdown. Such time travel is still possible in case of
8800          * immediate shutdown, though.
8801          *
8802          * We don't explicitly advance minRecoveryPoint when we do create a
8803          * restartpoint. It's assumed that flushing the buffers will do that as a
8804          * side-effect.
8805          */
8806         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
8807                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
8808         {
8809                 ereport(DEBUG2,
8810                                 (errmsg("skipping restartpoint, already performed at %X/%X",
8811                                                 (uint32) (lastCheckPoint.redo >> 32),
8812                                                 (uint32) lastCheckPoint.redo)));
8813
8814                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
8815                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8816                 {
8817                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8818                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8819                         ControlFile->time = (pg_time_t) time(NULL);
8820                         UpdateControlFile();
8821                         LWLockRelease(ControlFileLock);
8822                 }
8823                 LWLockRelease(CheckpointLock);
8824                 return false;
8825         }
8826
8827         /*
8828          * Update the shared RedoRecPtr so that the startup process can calculate
8829          * the number of segments replayed since last restartpoint, and request a
8830          * restartpoint if it exceeds checkpoint_segments.
8831          *
8832          * Like in CreateCheckPoint(), hold off insertions to update it, although
8833          * during recovery this is just pro forma, because no WAL insertions are
8834          * happening.
8835          */
8836         WALInsertSlotAcquire(true);
8837         xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
8838         WALInsertSlotRelease();
8839
8840         /* Also update the info_lck-protected copy */
8841         SpinLockAcquire(&xlogctl->info_lck);
8842         xlogctl->RedoRecPtr = lastCheckPoint.redo;
8843         SpinLockRelease(&xlogctl->info_lck);
8844
8845         /*
8846          * Prepare to accumulate statistics.
8847          *
8848          * Note: because it is possible for log_checkpoints to change while a
8849          * checkpoint proceeds, we always accumulate stats, even if
8850          * log_checkpoints is currently off.
8851          */
8852         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8853         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8854
8855         if (log_checkpoints)
8856                 LogCheckpointStart(flags, true);
8857
8858         CheckPointGuts(lastCheckPoint.redo, flags);
8859
8860         /*
8861          * Select point at which we can truncate the xlog, which we base on the
8862          * prior checkpoint's earliest info.
8863          */
8864         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8865
8866         /*
8867          * Update pg_control, using current time.  Check that it still shows
8868          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
8869          * this is a quick hack to make sure nothing really bad happens if somehow
8870          * we get here after the end-of-recovery checkpoint.
8871          */
8872         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8873         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
8874                 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
8875         {
8876                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
8877                 ControlFile->checkPoint = lastCheckPointRecPtr;
8878                 ControlFile->checkPointCopy = lastCheckPoint;
8879                 ControlFile->time = (pg_time_t) time(NULL);
8880                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8881                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8882                 UpdateControlFile();
8883         }
8884         LWLockRelease(ControlFileLock);
8885
8886         /*
8887          * Due to an historical accident multixact truncations are not WAL-logged,
8888          * but just performed every time the mxact horizon is increased. So, unless
8889          * we explicitly execute truncations on a standby it will never clean out
8890          * /pg_multixact which obviously is bad, both because it uses space and
8891          * because we can wrap around into pre-existing data...
8892          *
8893          * We can only do the truncation here, after the UpdateControlFile()
8894          * above, because we've now safely established a restart point, that
8895          * guarantees we will not need to access those multis.
8896          *
8897          * It's probably worth improving this.
8898          */
8899         TruncateMultiXact(lastCheckPoint.oldestMulti);
8900
8901         /*
8902          * Delete old log files (those no longer needed even for previous
8903          * checkpoint/restartpoint) to prevent the disk holding the xlog from
8904          * growing full.
8905          */
8906         if (_logSegNo)
8907         {
8908                 XLogRecPtr      receivePtr;
8909                 XLogRecPtr      replayPtr;
8910                 TimeLineID      replayTLI;
8911                 XLogRecPtr      endptr;
8912
8913                 /*
8914                  * Get the current end of xlog replayed or received, whichever is
8915                  * later.
8916                  */
8917                 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
8918                 replayPtr = GetXLogReplayRecPtr(&replayTLI);
8919                 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
8920
8921                 KeepLogSeg(endptr, &_logSegNo);
8922                 _logSegNo--;
8923
8924                 /*
8925                  * Try to recycle segments on a useful timeline. If we've been promoted
8926                  * since the beginning of this restartpoint, use the new timeline
8927                  * chosen at end of recovery (RecoveryInProgress() sets ThisTimeLineID
8928                  * in that case). If we're still in recovery, use the timeline we're
8929                  * currently replaying.
8930                  *
8931                  * There is no guarantee that the WAL segments will be useful on the
8932                  * current timeline; if recovery proceeds to a new timeline right
8933                  * after this, the pre-allocated WAL segments on this timeline will
8934                  * not be used, and will go wasted until recycled on the next
8935                  * restartpoint. We'll live with that.
8936                  */
8937                 if (RecoveryInProgress())
8938                         ThisTimeLineID = replayTLI;
8939
8940                 RemoveOldXlogFiles(_logSegNo, endptr);
8941
8942                 /*
8943                  * Make more log segments if needed.  (Do this after recycling old log
8944                  * segments, since that may supply some of the needed files.)
8945                  */
8946                 PreallocXlogFiles(endptr);
8947
8948                 /*
8949                  * ThisTimeLineID is normally not set when we're still in recovery.
8950                  * However, recycling/preallocating segments above needed
8951                  * ThisTimeLineID to determine which timeline to install the segments
8952                  * on. Reset it now, to restore the normal state of affairs for
8953                  * debugging purposes.
8954                  */
8955                 if (RecoveryInProgress())
8956                         ThisTimeLineID = 0;
8957         }
8958
8959         /*
8960          * Truncate pg_subtrans if possible.  We can throw away all data before
8961          * the oldest XMIN of any running transaction.  No future transaction will
8962          * attempt to reference any pg_subtrans entry older than that (see Asserts
8963          * in subtrans.c).      When hot standby is disabled, though, we mustn't do
8964          * this because StartupSUBTRANS hasn't been called yet.
8965          */
8966         if (EnableHotStandby)
8967                 TruncateSUBTRANS(GetOldestXmin(true, false));
8968
8969         /* Real work is done, but log and update before releasing lock. */
8970         LogCheckpointEnd(true);
8971
8972         xtime = GetLatestXTime();
8973         ereport((log_checkpoints ? LOG : DEBUG2),
8974                         (errmsg("recovery restart point at %X/%X",
8975                  (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
8976                    xtime ? errdetail("last completed transaction was at log time %s",
8977                                                          timestamptz_to_str(xtime)) : 0));
8978
8979         LWLockRelease(CheckpointLock);
8980
8981         /*
8982          * Finally, execute archive_cleanup_command, if any.
8983          */
8984         if (XLogCtl->archiveCleanupCommand[0])
8985                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
8986                                                            "archive_cleanup_command",
8987                                                            false);
8988
8989         return true;
8990 }
8991
8992 /*
8993  * Retreat *logSegNo to the last segment that we need to retain because of
8994  * either wal_keep_segments or replication slots.
8995  *
8996  * This is calculated by subtracting wal_keep_segments from the given xlog
8997  * location, recptr and by making sure that that result is below the
8998  * requirement of replication slots.
8999  */
9000 static void
9001 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9002 {
9003         XLogSegNo       segno;
9004         XLogRecPtr      keep;
9005
9006         XLByteToSeg(recptr, segno);
9007         keep = XLogGetReplicationSlotMinimumLSN();
9008
9009         /* compute limit for wal_keep_segments first */
9010         if (wal_keep_segments > 0)
9011         {
9012                 /* avoid underflow, don't go below 1 */
9013                 if (segno <= wal_keep_segments)
9014                         segno = 1;
9015                 else
9016                         segno = segno - wal_keep_segments;
9017         }
9018
9019         /* then check whether slots limit removal further */
9020         if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
9021         {
9022                 XLogRecPtr slotSegNo;
9023
9024                 XLByteToSeg(keep, slotSegNo);
9025
9026                 if (slotSegNo <= 0)
9027                         segno = 1;
9028                 else if (slotSegNo < segno)
9029                         segno = slotSegNo;
9030         }
9031
9032         /* don't delete WAL segments newer than the calculated segment */
9033         if (segno < *logSegNo)
9034                 *logSegNo = segno;
9035 }
9036
9037 /*
9038  * Write a NEXTOID log record
9039  */
9040 void
9041 XLogPutNextOid(Oid nextOid)
9042 {
9043         XLogRecData rdata;
9044
9045         rdata.data = (char *) (&nextOid);
9046         rdata.len = sizeof(Oid);
9047         rdata.buffer = InvalidBuffer;
9048         rdata.next = NULL;
9049         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
9050
9051         /*
9052          * We need not flush the NEXTOID record immediately, because any of the
9053          * just-allocated OIDs could only reach disk as part of a tuple insert or
9054          * update that would have its own XLOG record that must follow the NEXTOID
9055          * record.      Therefore, the standard buffer LSN interlock applied to those
9056          * records will ensure no such OID reaches disk before the NEXTOID record
9057          * does.
9058          *
9059          * Note, however, that the above statement only covers state "within" the
9060          * database.  When we use a generated OID as a file or directory name, we
9061          * are in a sense violating the basic WAL rule, because that filesystem
9062          * change may reach disk before the NEXTOID WAL record does.  The impact
9063          * of this is that if a database crash occurs immediately afterward, we
9064          * might after restart re-generate the same OID and find that it conflicts
9065          * with the leftover file or directory.  But since for safety's sake we
9066          * always loop until finding a nonconflicting filename, this poses no real
9067          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9068          */
9069 }
9070
9071 /*
9072  * Write an XLOG SWITCH record.
9073  *
9074  * Here we just blindly issue an XLogInsert request for the record.
9075  * All the magic happens inside XLogInsert.
9076  *
9077  * The return value is either the end+1 address of the switch record,
9078  * or the end+1 address of the prior segment if we did not need to
9079  * write a switch record because we are already at segment start.
9080  */
9081 XLogRecPtr
9082 RequestXLogSwitch(void)
9083 {
9084         XLogRecPtr      RecPtr;
9085         XLogRecData rdata;
9086
9087         /* XLOG SWITCH, alone among xlog record types, has no data */
9088         rdata.buffer = InvalidBuffer;
9089         rdata.data = NULL;
9090         rdata.len = 0;
9091         rdata.next = NULL;
9092
9093         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
9094
9095         return RecPtr;
9096 }
9097
9098 /*
9099  * Write a RESTORE POINT record
9100  */
9101 XLogRecPtr
9102 XLogRestorePoint(const char *rpName)
9103 {
9104         XLogRecPtr      RecPtr;
9105         XLogRecData rdata;
9106         xl_restore_point xlrec;
9107
9108         xlrec.rp_time = GetCurrentTimestamp();
9109         strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9110
9111         rdata.buffer = InvalidBuffer;
9112         rdata.data = (char *) &xlrec;
9113         rdata.len = sizeof(xl_restore_point);
9114         rdata.next = NULL;
9115
9116         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
9117
9118         ereport(LOG,
9119                         (errmsg("restore point \"%s\" created at %X/%X",
9120                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
9121
9122         return RecPtr;
9123 }
9124
9125 /*
9126  * Write a backup block if needed when we are setting a hint. Note that
9127  * this may be called for a variety of page types, not just heaps.
9128  *
9129  * Callable while holding just share lock on the buffer content.
9130  *
9131  * We can't use the plain backup block mechanism since that relies on the
9132  * Buffer being exclusively locked. Since some modifications (setting LSN, hint
9133  * bits) are allowed in a sharelocked buffer that can lead to wal checksum
9134  * failures. So instead we copy the page and insert the copied data as normal
9135  * record data.
9136  *
9137  * We only need to do something if page has not yet been full page written in
9138  * this checkpoint round. The LSN of the inserted wal record is returned if we
9139  * had to write, InvalidXLogRecPtr otherwise.
9140  *
9141  * It is possible that multiple concurrent backends could attempt to write WAL
9142  * records. In that case, multiple copies of the same block would be recorded
9143  * in separate WAL records by different backends, though that is still OK from
9144  * a correctness perspective.
9145  */
9146 XLogRecPtr
9147 XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
9148 {
9149         XLogRecPtr      recptr = InvalidXLogRecPtr;
9150         XLogRecPtr      lsn;
9151         XLogRecData rdata[2];
9152         BkpBlock        bkpb;
9153
9154         /*
9155          * Ensure no checkpoint can change our view of RedoRecPtr.
9156          */
9157         Assert(MyPgXact->delayChkpt);
9158
9159         /*
9160          * Update RedoRecPtr so XLogCheckBuffer can make the right decision
9161          */
9162         GetRedoRecPtr();
9163
9164         /*
9165          * Setup phony rdata element for use within XLogCheckBuffer only. We reuse
9166          * and reset rdata for any actual WAL record insert.
9167          */
9168         rdata[0].buffer = buffer;
9169         rdata[0].buffer_std = buffer_std;
9170
9171         /*
9172          * Check buffer while not holding an exclusive lock.
9173          */
9174         if (XLogCheckBuffer(rdata, false, &lsn, &bkpb))
9175         {
9176                 char            copied_buffer[BLCKSZ];
9177                 char       *origdata = (char *) BufferGetBlock(buffer);
9178
9179                 /*
9180                  * Copy buffer so we don't have to worry about concurrent hint bit or
9181                  * lsn updates. We assume pd_lower/upper cannot be changed without an
9182                  * exclusive lock, so the contents bkp are not racy.
9183                  *
9184                  * With buffer_std set to false, XLogCheckBuffer() sets hole_length and
9185                  * hole_offset to 0; so the following code is safe for either case.
9186                  */
9187                 memcpy(copied_buffer, origdata, bkpb.hole_offset);
9188                 memcpy(copied_buffer + bkpb.hole_offset,
9189                            origdata + bkpb.hole_offset + bkpb.hole_length,
9190                            BLCKSZ - bkpb.hole_offset - bkpb.hole_length);
9191
9192                 /*
9193                  * Header for backup block.
9194                  */
9195                 rdata[0].data = (char *) &bkpb;
9196                 rdata[0].len = sizeof(BkpBlock);
9197                 rdata[0].buffer = InvalidBuffer;
9198                 rdata[0].next = &(rdata[1]);
9199
9200                 /*
9201                  * Save copy of the buffer.
9202                  */
9203                 rdata[1].data = copied_buffer;
9204                 rdata[1].len = BLCKSZ - bkpb.hole_length;
9205                 rdata[1].buffer = InvalidBuffer;
9206                 rdata[1].next = NULL;
9207
9208                 recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI, rdata);
9209         }
9210
9211         return recptr;
9212 }
9213
9214 /*
9215  * Check if any of the GUC parameters that are critical for hot standby
9216  * have changed, and update the value in pg_control file if necessary.
9217  */
9218 static void
9219 XLogReportParameters(void)
9220 {
9221         if (wal_level != ControlFile->wal_level ||
9222                 wal_log_hints != ControlFile->wal_log_hints ||
9223                 MaxConnections != ControlFile->MaxConnections ||
9224                 max_worker_processes != ControlFile->max_worker_processes ||
9225                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
9226                 max_locks_per_xact != ControlFile->max_locks_per_xact)
9227         {
9228                 /*
9229                  * The change in number of backend slots doesn't need to be WAL-logged
9230                  * if archiving is not enabled, as you can't start archive recovery
9231                  * with wal_level=minimal anyway. We don't really care about the
9232                  * values in pg_control either if wal_level=minimal, but seems better
9233                  * to keep them up-to-date to avoid confusion.
9234                  */
9235                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
9236                 {
9237                         XLogRecData rdata;
9238                         xl_parameter_change xlrec;
9239
9240                         xlrec.MaxConnections = MaxConnections;
9241                         xlrec.max_worker_processes = max_worker_processes;
9242                         xlrec.max_prepared_xacts = max_prepared_xacts;
9243                         xlrec.max_locks_per_xact = max_locks_per_xact;
9244                         xlrec.wal_level = wal_level;
9245                         xlrec.wal_log_hints = wal_log_hints;
9246
9247                         rdata.buffer = InvalidBuffer;
9248                         rdata.data = (char *) &xlrec;
9249                         rdata.len = sizeof(xlrec);
9250                         rdata.next = NULL;
9251
9252                         XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
9253                 }
9254
9255                 ControlFile->MaxConnections = MaxConnections;
9256                 ControlFile->max_worker_processes = max_worker_processes;
9257                 ControlFile->max_prepared_xacts = max_prepared_xacts;
9258                 ControlFile->max_locks_per_xact = max_locks_per_xact;
9259                 ControlFile->wal_level = wal_level;
9260                 ControlFile->wal_log_hints = wal_log_hints;
9261                 UpdateControlFile();
9262         }
9263 }
9264
9265 /*
9266  * Update full_page_writes in shared memory, and write an
9267  * XLOG_FPW_CHANGE record if necessary.
9268  *
9269  * Note: this function assumes there is no other process running
9270  * concurrently that could update it.
9271  */
9272 void
9273 UpdateFullPageWrites(void)
9274 {
9275         XLogCtlInsert *Insert = &XLogCtl->Insert;
9276
9277         /*
9278          * Do nothing if full_page_writes has not been changed.
9279          *
9280          * It's safe to check the shared full_page_writes without the lock,
9281          * because we assume that there is no concurrently running process which
9282          * can update it.
9283          */
9284         if (fullPageWrites == Insert->fullPageWrites)
9285                 return;
9286
9287         START_CRIT_SECTION();
9288
9289         /*
9290          * It's always safe to take full page images, even when not strictly
9291          * required, but not the other round. So if we're setting full_page_writes
9292          * to true, first set it true and then write the WAL record. If we're
9293          * setting it to false, first write the WAL record and then set the global
9294          * flag.
9295          */
9296         if (fullPageWrites)
9297         {
9298                 WALInsertSlotAcquire(true);
9299                 Insert->fullPageWrites = true;
9300                 WALInsertSlotRelease();
9301         }
9302
9303         /*
9304          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
9305          * full_page_writes during archive recovery, if required.
9306          */
9307         if (XLogStandbyInfoActive() && !RecoveryInProgress())
9308         {
9309                 XLogRecData rdata;
9310
9311                 rdata.data = (char *) (&fullPageWrites);
9312                 rdata.len = sizeof(bool);
9313                 rdata.buffer = InvalidBuffer;
9314                 rdata.next = NULL;
9315
9316                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
9317         }
9318
9319         if (!fullPageWrites)
9320         {
9321                 WALInsertSlotAcquire(true);
9322                 Insert->fullPageWrites = false;
9323                 WALInsertSlotRelease();
9324         }
9325         END_CRIT_SECTION();
9326 }
9327
9328 /*
9329  * Check that it's OK to switch to new timeline during recovery.
9330  *
9331  * 'lsn' is the address of the shutdown checkpoint record we're about to
9332  * replay. (Currently, timeline can only change at a shutdown checkpoint).
9333  */
9334 static void
9335 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9336 {
9337         /* Check that the record agrees on what the current (old) timeline is */
9338         if (prevTLI != ThisTimeLineID)
9339                 ereport(PANIC,
9340                                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9341                                                 prevTLI, ThisTimeLineID)));
9342
9343         /*
9344          * The new timeline better be in the list of timelines we expect to see,
9345          * according to the timeline history. It should also not decrease.
9346          */
9347         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9348                 ereport(PANIC,
9349                  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9350                                  newTLI, ThisTimeLineID)));
9351
9352         /*
9353          * If we have not yet reached min recovery point, and we're about to
9354          * switch to a timeline greater than the timeline of the min recovery
9355          * point: trouble. After switching to the new timeline, we could not
9356          * possibly visit the min recovery point on the correct timeline anymore.
9357          * This can happen if there is a newer timeline in the archive that
9358          * branched before the timeline the min recovery point is on, and you
9359          * attempt to do PITR to the new timeline.
9360          */
9361         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9362                 lsn < minRecoveryPoint &&
9363                 newTLI > minRecoveryPointTLI)
9364                 ereport(PANIC,
9365                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9366                                                 newTLI,
9367                                                 (uint32) (minRecoveryPoint >> 32),
9368                                                 (uint32) minRecoveryPoint,
9369                                                 minRecoveryPointTLI)));
9370
9371         /* Looks good */
9372 }
9373
9374 /*
9375  * XLOG resource manager's routines
9376  *
9377  * Definitions of info values are in include/catalog/pg_control.h, though
9378  * not all record types are related to control file updates.
9379  */
9380 void
9381 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
9382 {
9383         uint8           info = record->xl_info & ~XLR_INFO_MASK;
9384
9385         /* Backup blocks are not used by XLOG rmgr */
9386         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
9387
9388         if (info == XLOG_NEXTOID)
9389         {
9390                 Oid                     nextOid;
9391
9392                 /*
9393                  * We used to try to take the maximum of ShmemVariableCache->nextOid
9394                  * and the recorded nextOid, but that fails if the OID counter wraps
9395                  * around.      Since no OID allocation should be happening during replay
9396                  * anyway, better to just believe the record exactly.  We still take
9397                  * OidGenLock while setting the variable, just in case.
9398                  */
9399                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9400                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9401                 ShmemVariableCache->nextOid = nextOid;
9402                 ShmemVariableCache->oidCount = 0;
9403                 LWLockRelease(OidGenLock);
9404         }
9405         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9406         {
9407                 CheckPoint      checkPoint;
9408
9409                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9410                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
9411                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9412                 ShmemVariableCache->nextXid = checkPoint.nextXid;
9413                 LWLockRelease(XidGenLock);
9414                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9415                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9416                 ShmemVariableCache->oidCount = 0;
9417                 LWLockRelease(OidGenLock);
9418                 MultiXactSetNextMXact(checkPoint.nextMulti,
9419                                                           checkPoint.nextMultiOffset);
9420                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9421                 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
9422
9423                 /*
9424                  * If we see a shutdown checkpoint while waiting for an end-of-backup
9425                  * record, the backup was canceled and the end-of-backup record will
9426                  * never arrive.
9427                  */
9428                 if (ArchiveRecoveryRequested &&
9429                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9430                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9431                         ereport(PANIC,
9432                         (errmsg("online backup was canceled, recovery cannot continue")));
9433
9434                 /*
9435                  * If we see a shutdown checkpoint, we know that nothing was running
9436                  * on the master at this point. So fake-up an empty running-xacts
9437                  * record and use that here and now. Recover additional standby state
9438                  * for prepared transactions.
9439                  */
9440                 if (standbyState >= STANDBY_INITIALIZED)
9441                 {
9442                         TransactionId *xids;
9443                         int                     nxids;
9444                         TransactionId oldestActiveXID;
9445                         TransactionId latestCompletedXid;
9446                         RunningTransactionsData running;
9447
9448                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9449
9450                         /*
9451                          * Construct a RunningTransactions snapshot representing a shut
9452                          * down server, with only prepared transactions still alive. We're
9453                          * never overflowed at this point because all subxids are listed
9454                          * with their parent prepared transactions.
9455                          */
9456                         running.xcnt = nxids;
9457                         running.subxcnt = 0;
9458                         running.subxid_overflow = false;
9459                         running.nextXid = checkPoint.nextXid;
9460                         running.oldestRunningXid = oldestActiveXID;
9461                         latestCompletedXid = checkPoint.nextXid;
9462                         TransactionIdRetreat(latestCompletedXid);
9463                         Assert(TransactionIdIsNormal(latestCompletedXid));
9464                         running.latestCompletedXid = latestCompletedXid;
9465                         running.xids = xids;
9466
9467                         ProcArrayApplyRecoveryInfo(&running);
9468
9469                         StandbyRecoverPreparedTransactions(true);
9470                 }
9471
9472                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9473                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9474                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9475
9476                 /* Update shared-memory copy of checkpoint XID/epoch */
9477                 {
9478                         /* use volatile pointer to prevent code rearrangement */
9479                         volatile XLogCtlData *xlogctl = XLogCtl;
9480
9481                         SpinLockAcquire(&xlogctl->info_lck);
9482                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9483                         xlogctl->ckptXid = checkPoint.nextXid;
9484                         SpinLockRelease(&xlogctl->info_lck);
9485                 }
9486
9487                 /*
9488                  * We should've already switched to the new TLI before replaying this
9489                  * record.
9490                  */
9491                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9492                         ereport(PANIC,
9493                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9494                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9495
9496                 RecoveryRestartPoint(&checkPoint);
9497         }
9498         else if (info == XLOG_CHECKPOINT_ONLINE)
9499         {
9500                 CheckPoint      checkPoint;
9501
9502                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9503                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
9504                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9505                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
9506                                                                   checkPoint.nextXid))
9507                         ShmemVariableCache->nextXid = checkPoint.nextXid;
9508                 LWLockRelease(XidGenLock);
9509                 /* ... but still treat OID counter as exact */
9510                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9511                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9512                 ShmemVariableCache->oidCount = 0;
9513                 LWLockRelease(OidGenLock);
9514                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9515                                                                   checkPoint.nextMultiOffset);
9516                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9517                                                                   checkPoint.oldestXid))
9518                         SetTransactionIdLimit(checkPoint.oldestXid,
9519                                                                   checkPoint.oldestXidDB);
9520                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9521                                                            checkPoint.oldestMultiDB);
9522
9523                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9524                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9525                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9526
9527                 /* Update shared-memory copy of checkpoint XID/epoch */
9528                 {
9529                         /* use volatile pointer to prevent code rearrangement */
9530                         volatile XLogCtlData *xlogctl = XLogCtl;
9531
9532                         SpinLockAcquire(&xlogctl->info_lck);
9533                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9534                         xlogctl->ckptXid = checkPoint.nextXid;
9535                         SpinLockRelease(&xlogctl->info_lck);
9536                 }
9537
9538                 /* TLI should not change in an on-line checkpoint */
9539                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9540                         ereport(PANIC,
9541                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9542                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9543
9544                 RecoveryRestartPoint(&checkPoint);
9545         }
9546         else if (info == XLOG_END_OF_RECOVERY)
9547         {
9548                 xl_end_of_recovery xlrec;
9549
9550                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9551
9552                 /*
9553                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
9554                  * but this case is rarer and harder to test, so the benefit doesn't
9555                  * outweigh the potential extra cost of maintenance.
9556                  */
9557
9558                 /*
9559                  * We should've already switched to the new TLI before replaying this
9560                  * record.
9561                  */
9562                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
9563                         ereport(PANIC,
9564                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9565                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
9566         }
9567         else if (info == XLOG_NOOP)
9568         {
9569                 /* nothing to do here */
9570         }
9571         else if (info == XLOG_SWITCH)
9572         {
9573                 /* nothing to do here */
9574         }
9575         else if (info == XLOG_RESTORE_POINT)
9576         {
9577                 /* nothing to do here */
9578         }
9579         else if (info == XLOG_FPI)
9580         {
9581                 char       *data;
9582                 BkpBlock        bkpb;
9583
9584                 /*
9585                  * Full-page image (FPI) records contain a backup block stored "inline"
9586                  * in the normal data since the locking when writing hint records isn't
9587                  * sufficient to use the normal backup block mechanism, which assumes
9588                  * exclusive lock on the buffer supplied.
9589                  *
9590                  * Since the only change in these backup block are hint bits, there
9591                  * are no recovery conflicts generated.
9592                  *
9593                  * This also means there is no corresponding API call for this, so an
9594                  * smgr implementation has no need to implement anything. Which means
9595                  * nothing is needed in md.c etc
9596                  */
9597                 data = XLogRecGetData(record);
9598                 memcpy(&bkpb, data, sizeof(BkpBlock));
9599                 data += sizeof(BkpBlock);
9600
9601                 RestoreBackupBlockContents(lsn, bkpb, data, false, false);
9602         }
9603         else if (info == XLOG_BACKUP_END)
9604         {
9605                 XLogRecPtr      startpoint;
9606
9607                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
9608
9609                 if (ControlFile->backupStartPoint == startpoint)
9610                 {
9611                         /*
9612                          * We have reached the end of base backup, the point where
9613                          * pg_stop_backup() was done. The data on disk is now consistent.
9614                          * Reset backupStartPoint, and update minRecoveryPoint to make
9615                          * sure we don't allow starting up at an earlier point even if
9616                          * recovery is stopped and restarted soon after this.
9617                          */
9618                         elog(DEBUG1, "end of backup reached");
9619
9620                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9621
9622                         if (ControlFile->minRecoveryPoint < lsn)
9623                         {
9624                                 ControlFile->minRecoveryPoint = lsn;
9625                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9626                         }
9627                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
9628                         ControlFile->backupEndRequired = false;
9629                         UpdateControlFile();
9630
9631                         LWLockRelease(ControlFileLock);
9632                 }
9633         }
9634         else if (info == XLOG_PARAMETER_CHANGE)
9635         {
9636                 xl_parameter_change xlrec;
9637
9638                 /* Update our copy of the parameters in pg_control */
9639                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
9640
9641                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9642                 ControlFile->MaxConnections = xlrec.MaxConnections;
9643                 ControlFile->max_worker_processes = xlrec.max_worker_processes;
9644                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
9645                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
9646                 ControlFile->wal_level = xlrec.wal_level;
9647                 ControlFile->wal_log_hints = wal_log_hints;
9648
9649                 /*
9650                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
9651                  * recover back up to this point before allowing hot standby again.
9652                  * This is particularly important if wal_level was set to 'archive'
9653                  * before, and is now 'hot_standby', to ensure you don't run queries
9654                  * against the WAL preceding the wal_level change. Same applies to
9655                  * decreasing max_* settings.
9656                  */
9657                 minRecoveryPoint = ControlFile->minRecoveryPoint;
9658                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9659                 if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
9660                 {
9661                         ControlFile->minRecoveryPoint = lsn;
9662                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9663                 }
9664
9665                 UpdateControlFile();
9666                 LWLockRelease(ControlFileLock);
9667
9668                 /* Check to see if any changes to max_connections give problems */
9669                 CheckRequiredParameterValues();
9670         }
9671         else if (info == XLOG_FPW_CHANGE)
9672         {
9673                 /* use volatile pointer to prevent code rearrangement */
9674                 volatile XLogCtlData *xlogctl = XLogCtl;
9675                 bool            fpw;
9676
9677                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
9678
9679                 /*
9680                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
9681                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
9682                  * full_page_writes has been disabled during online backup.
9683                  */
9684                 if (!fpw)
9685                 {
9686                         SpinLockAcquire(&xlogctl->info_lck);
9687                         if (xlogctl->lastFpwDisableRecPtr < ReadRecPtr)
9688                                 xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
9689                         SpinLockRelease(&xlogctl->info_lck);
9690                 }
9691
9692                 /* Keep track of full_page_writes */
9693                 lastFullPageWrites = fpw;
9694         }
9695 }
9696
9697 #ifdef WAL_DEBUG
9698
9699 static void
9700 xlog_outrec(StringInfo buf, XLogRecord *record)
9701 {
9702         int                     i;
9703
9704         appendStringInfo(buf, "prev %X/%X; xid %u",
9705                                          (uint32) (record->xl_prev >> 32),
9706                                          (uint32) record->xl_prev,
9707                                          record->xl_xid);
9708
9709         appendStringInfo(buf, "; len %u",
9710                                          record->xl_len);
9711
9712         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
9713         {
9714                 if (record->xl_info & XLR_BKP_BLOCK(i))
9715                         appendStringInfo(buf, "; bkpb%d", i);
9716         }
9717
9718         appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
9719 }
9720 #endif   /* WAL_DEBUG */
9721
9722
9723 /*
9724  * Return the (possible) sync flag used for opening a file, depending on the
9725  * value of the GUC wal_sync_method.
9726  */
9727 static int
9728 get_sync_bit(int method)
9729 {
9730         int                     o_direct_flag = 0;
9731
9732         /* If fsync is disabled, never open in sync mode */
9733         if (!enableFsync)
9734                 return 0;
9735
9736         /*
9737          * Optimize writes by bypassing kernel cache with O_DIRECT when using
9738          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
9739          * disabled, otherwise the archive command or walsender process will read
9740          * the WAL soon after writing it, which is guaranteed to cause a physical
9741          * read if we bypassed the kernel cache. We also skip the
9742          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
9743          * reason.
9744          *
9745          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
9746          * written by walreceiver is normally read by the startup process soon
9747          * after its written. Also, walreceiver performs unaligned writes, which
9748          * don't work with O_DIRECT, so it is required for correctness too.
9749          */
9750         if (!XLogIsNeeded() && !AmWalReceiverProcess())
9751                 o_direct_flag = PG_O_DIRECT;
9752
9753         switch (method)
9754         {
9755                         /*
9756                          * enum values for all sync options are defined even if they are
9757                          * not supported on the current platform.  But if not, they are
9758                          * not included in the enum option array, and therefore will never
9759                          * be seen here.
9760                          */
9761                 case SYNC_METHOD_FSYNC:
9762                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9763                 case SYNC_METHOD_FDATASYNC:
9764                         return 0;
9765 #ifdef OPEN_SYNC_FLAG
9766                 case SYNC_METHOD_OPEN:
9767                         return OPEN_SYNC_FLAG | o_direct_flag;
9768 #endif
9769 #ifdef OPEN_DATASYNC_FLAG
9770                 case SYNC_METHOD_OPEN_DSYNC:
9771                         return OPEN_DATASYNC_FLAG | o_direct_flag;
9772 #endif
9773                 default:
9774                         /* can't happen (unless we are out of sync with option array) */
9775                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
9776                         return 0;                       /* silence warning */
9777         }
9778 }
9779
9780 /*
9781  * GUC support
9782  */
9783 void
9784 assign_xlog_sync_method(int new_sync_method, void *extra)
9785 {
9786         if (sync_method != new_sync_method)
9787         {
9788                 /*
9789                  * To ensure that no blocks escape unsynced, force an fsync on the
9790                  * currently open log segment (if any).  Also, if the open flag is
9791                  * changing, close the log file so it will be reopened (with new flag
9792                  * bit) at next use.
9793                  */
9794                 if (openLogFile >= 0)
9795                 {
9796                         if (pg_fsync(openLogFile) != 0)
9797                                 ereport(PANIC,
9798                                                 (errcode_for_file_access(),
9799                                                  errmsg("could not fsync log segment %s: %m",
9800                                                           XLogFileNameP(ThisTimeLineID, openLogSegNo))));
9801                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
9802                                 XLogFileClose();
9803                 }
9804         }
9805 }
9806
9807
9808 /*
9809  * Issue appropriate kind of fsync (if any) for an XLOG output file.
9810  *
9811  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
9812  * 'log' and 'seg' are for error reporting purposes.
9813  */
9814 void
9815 issue_xlog_fsync(int fd, XLogSegNo segno)
9816 {
9817         switch (sync_method)
9818         {
9819                 case SYNC_METHOD_FSYNC:
9820                         if (pg_fsync_no_writethrough(fd) != 0)
9821                                 ereport(PANIC,
9822                                                 (errcode_for_file_access(),
9823                                                  errmsg("could not fsync log file %s: %m",
9824                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9825                         break;
9826 #ifdef HAVE_FSYNC_WRITETHROUGH
9827                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9828                         if (pg_fsync_writethrough(fd) != 0)
9829                                 ereport(PANIC,
9830                                                 (errcode_for_file_access(),
9831                                           errmsg("could not fsync write-through log file %s: %m",
9832                                                          XLogFileNameP(ThisTimeLineID, segno))));
9833                         break;
9834 #endif
9835 #ifdef HAVE_FDATASYNC
9836                 case SYNC_METHOD_FDATASYNC:
9837                         if (pg_fdatasync(fd) != 0)
9838                                 ereport(PANIC,
9839                                                 (errcode_for_file_access(),
9840                                                  errmsg("could not fdatasync log file %s: %m",
9841                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9842                         break;
9843 #endif
9844                 case SYNC_METHOD_OPEN:
9845                 case SYNC_METHOD_OPEN_DSYNC:
9846                         /* write synced it already */
9847                         break;
9848                 default:
9849                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
9850                         break;
9851         }
9852 }
9853
9854 /*
9855  * Return the filename of given log segment, as a palloc'd string.
9856  */
9857 char *
9858 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
9859 {
9860         char       *result = palloc(MAXFNAMELEN);
9861
9862         XLogFileName(result, tli, segno);
9863         return result;
9864 }
9865
9866 /*
9867  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
9868  * function. It creates the necessary starting checkpoint and constructs the
9869  * backup label file.
9870  *
9871  * There are two kinds of backups: exclusive and non-exclusive. An exclusive
9872  * backup is started with pg_start_backup(), and there can be only one active
9873  * at a time. The backup label file of an exclusive backup is written to
9874  * $PGDATA/backup_label, and it is removed by pg_stop_backup().
9875  *
9876  * A non-exclusive backup is used for the streaming base backups (see
9877  * src/backend/replication/basebackup.c). The difference to exclusive backups
9878  * is that the backup label file is not written to disk. Instead, its would-be
9879  * contents are returned in *labelfile, and the caller is responsible for
9880  * including it in the backup archive as 'backup_label'. There can be many
9881  * non-exclusive backups active at the same time, and they don't conflict
9882  * with an exclusive backup either.
9883  *
9884  * Returns the minimum WAL position that must be present to restore from this
9885  * backup, and the corresponding timeline ID in *starttli_p.
9886  *
9887  * Every successfully started non-exclusive backup must be stopped by calling
9888  * do_pg_stop_backup() or do_pg_abort_backup().
9889  *
9890  * It is the responsibility of the caller of this function to verify the
9891  * permissions of the calling user!
9892  */
9893 XLogRecPtr
9894 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
9895                                    char **labelfile)
9896 {
9897         bool            exclusive = (labelfile == NULL);
9898         bool            backup_started_in_recovery = false;
9899         XLogRecPtr      checkpointloc;
9900         XLogRecPtr      startpoint;
9901         TimeLineID      starttli;
9902         pg_time_t       stamp_time;
9903         char            strfbuf[128];
9904         char            xlogfilename[MAXFNAMELEN];
9905         XLogSegNo       _logSegNo;
9906         struct stat stat_buf;
9907         FILE       *fp;
9908         StringInfoData labelfbuf;
9909
9910         backup_started_in_recovery = RecoveryInProgress();
9911
9912         /*
9913          * Currently only non-exclusive backup can be taken during recovery.
9914          */
9915         if (backup_started_in_recovery && exclusive)
9916                 ereport(ERROR,
9917                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9918                                  errmsg("recovery is in progress"),
9919                                  errhint("WAL control functions cannot be executed during recovery.")));
9920
9921         /*
9922          * During recovery, we don't need to check WAL level. Because, if WAL
9923          * level is not sufficient, it's impossible to get here during recovery.
9924          */
9925         if (!backup_started_in_recovery && !XLogIsNeeded())
9926                 ereport(ERROR,
9927                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9928                           errmsg("WAL level not sufficient for making an online backup"),
9929                                  errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
9930
9931         if (strlen(backupidstr) > MAXPGPATH)
9932                 ereport(ERROR,
9933                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
9934                                  errmsg("backup label too long (max %d bytes)",
9935                                                 MAXPGPATH)));
9936
9937         /*
9938          * Mark backup active in shared memory.  We must do full-page WAL writes
9939          * during an on-line backup even if not doing so at other times, because
9940          * it's quite possible for the backup dump to obtain a "torn" (partially
9941          * written) copy of a database page if it reads the page concurrently with
9942          * our write to the same page.  This can be fixed as long as the first
9943          * write to the page in the WAL sequence is a full-page write. Hence, we
9944          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
9945          * are no dirty pages in shared memory that might get dumped while the
9946          * backup is in progress without having a corresponding WAL record.  (Once
9947          * the backup is complete, we need not force full-page writes anymore,
9948          * since we expect that any pages not modified during the backup interval
9949          * must have been correctly captured by the backup.)
9950          *
9951          * Note that forcePageWrites has no effect during an online backup from
9952          * the standby.
9953          *
9954          * We must hold all the insertion slots to change the value of
9955          * forcePageWrites, to ensure adequate interlocking against XLogInsert().
9956          */
9957         WALInsertSlotAcquire(true);
9958         if (exclusive)
9959         {
9960                 if (XLogCtl->Insert.exclusiveBackup)
9961                 {
9962                         WALInsertSlotRelease();
9963                         ereport(ERROR,
9964                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9965                                          errmsg("a backup is already in progress"),
9966                                          errhint("Run pg_stop_backup() and try again.")));
9967                 }
9968                 XLogCtl->Insert.exclusiveBackup = true;
9969         }
9970         else
9971                 XLogCtl->Insert.nonExclusiveBackups++;
9972         XLogCtl->Insert.forcePageWrites = true;
9973         WALInsertSlotRelease();
9974
9975         /* Ensure we release forcePageWrites if fail below */
9976         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9977         {
9978                 bool            gotUniqueStartpoint = false;
9979
9980                 /*
9981                  * Force an XLOG file switch before the checkpoint, to ensure that the
9982                  * WAL segment the checkpoint is written to doesn't contain pages with
9983                  * old timeline IDs.  That would otherwise happen if you called
9984                  * pg_start_backup() right after restoring from a PITR archive: the
9985                  * first WAL segment containing the startup checkpoint has pages in
9986                  * the beginning with the old timeline ID.      That can cause trouble at
9987                  * recovery: we won't have a history file covering the old timeline if
9988                  * pg_xlog directory was not included in the base backup and the WAL
9989                  * archive was cleared too before starting the backup.
9990                  *
9991                  * This also ensures that we have emitted a WAL page header that has
9992                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
9993                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
9994                  * compress out removable backup blocks, it won't remove any that
9995                  * occur after this point.
9996                  *
9997                  * During recovery, we skip forcing XLOG file switch, which means that
9998                  * the backup taken during recovery is not available for the special
9999                  * recovery case described above.
10000                  */
10001                 if (!backup_started_in_recovery)
10002                         RequestXLogSwitch();
10003
10004                 do
10005                 {
10006                         bool            checkpointfpw;
10007
10008                         /*
10009                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
10010                          * page problems, this guarantees that two successive backup runs
10011                          * will have different checkpoint positions and hence different
10012                          * history file names, even if nothing happened in between.
10013                          *
10014                          * During recovery, establish a restartpoint if possible. We use
10015                          * the last restartpoint as the backup starting checkpoint. This
10016                          * means that two successive backup runs can have same checkpoint
10017                          * positions.
10018                          *
10019                          * Since the fact that we are executing do_pg_start_backup()
10020                          * during recovery means that checkpointer is running, we can use
10021                          * RequestCheckpoint() to establish a restartpoint.
10022                          *
10023                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
10024                          * passing fast = true).  Otherwise this can take awhile.
10025                          */
10026                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
10027                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
10028
10029                         /*
10030                          * Now we need to fetch the checkpoint record location, and also
10031                          * its REDO pointer.  The oldest point in WAL that would be needed
10032                          * to restore starting from the checkpoint is precisely the REDO
10033                          * pointer.
10034                          */
10035                         LWLockAcquire(ControlFileLock, LW_SHARED);
10036                         checkpointloc = ControlFile->checkPoint;
10037                         startpoint = ControlFile->checkPointCopy.redo;
10038                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
10039                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
10040                         LWLockRelease(ControlFileLock);
10041
10042                         if (backup_started_in_recovery)
10043                         {
10044                                 /* use volatile pointer to prevent code rearrangement */
10045                                 volatile XLogCtlData *xlogctl = XLogCtl;
10046                                 XLogRecPtr      recptr;
10047
10048                                 /*
10049                                  * Check to see if all WAL replayed during online backup
10050                                  * (i.e., since last restartpoint used as backup starting
10051                                  * checkpoint) contain full-page writes.
10052                                  */
10053                                 SpinLockAcquire(&xlogctl->info_lck);
10054                                 recptr = xlogctl->lastFpwDisableRecPtr;
10055                                 SpinLockRelease(&xlogctl->info_lck);
10056
10057                                 if (!checkpointfpw || startpoint <= recptr)
10058                                         ereport(ERROR,
10059                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10060                                                    errmsg("WAL generated with full_page_writes=off was replayed "
10061                                                                   "since last restartpoint"),
10062                                                    errhint("This means that the backup being taken on the standby "
10063                                                                    "is corrupt and should not be used. "
10064                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
10065                                                                    "and then try an online backup again.")));
10066
10067                                 /*
10068                                  * During recovery, since we don't use the end-of-backup WAL
10069                                  * record and don't write the backup history file, the
10070                                  * starting WAL location doesn't need to be unique. This means
10071                                  * that two base backups started at the same time might use
10072                                  * the same checkpoint as starting locations.
10073                                  */
10074                                 gotUniqueStartpoint = true;
10075                         }
10076
10077                         /*
10078                          * If two base backups are started at the same time (in WAL sender
10079                          * processes), we need to make sure that they use different
10080                          * checkpoints as starting locations, because we use the starting
10081                          * WAL location as a unique identifier for the base backup in the
10082                          * end-of-backup WAL record and when we write the backup history
10083                          * file. Perhaps it would be better generate a separate unique ID
10084                          * for each backup instead of forcing another checkpoint, but
10085                          * taking a checkpoint right after another is not that expensive
10086                          * either because only few buffers have been dirtied yet.
10087                          */
10088                         WALInsertSlotAcquire(true);
10089                         if (XLogCtl->Insert.lastBackupStart < startpoint)
10090                         {
10091                                 XLogCtl->Insert.lastBackupStart = startpoint;
10092                                 gotUniqueStartpoint = true;
10093                         }
10094                         WALInsertSlotRelease();
10095                 } while (!gotUniqueStartpoint);
10096
10097                 XLByteToSeg(startpoint, _logSegNo);
10098                 XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);
10099
10100                 /*
10101                  * Construct backup label file
10102                  */
10103                 initStringInfo(&labelfbuf);
10104
10105                 /* Use the log timezone here, not the session timezone */
10106                 stamp_time = (pg_time_t) time(NULL);
10107                 pg_strftime(strfbuf, sizeof(strfbuf),
10108                                         "%Y-%m-%d %H:%M:%S %Z",
10109                                         pg_localtime(&stamp_time, log_timezone));
10110                 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
10111                          (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
10112                 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
10113                                          (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
10114                 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
10115                                                  exclusive ? "pg_start_backup" : "streamed");
10116                 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
10117                                                  backup_started_in_recovery ? "standby" : "master");
10118                 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
10119                 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
10120
10121                 /*
10122                  * Okay, write the file, or return its contents to caller.
10123                  */
10124                 if (exclusive)
10125                 {
10126                         /*
10127                          * Check for existing backup label --- implies a backup is already
10128                          * running.  (XXX given that we checked exclusiveBackup above,
10129                          * maybe it would be OK to just unlink any such label file?)
10130                          */
10131                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
10132                         {
10133                                 if (errno != ENOENT)
10134                                         ereport(ERROR,
10135                                                         (errcode_for_file_access(),
10136                                                          errmsg("could not stat file \"%s\": %m",
10137                                                                         BACKUP_LABEL_FILE)));
10138                         }
10139                         else
10140                                 ereport(ERROR,
10141                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10142                                                  errmsg("a backup is already in progress"),
10143                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10144                                                                  BACKUP_LABEL_FILE)));
10145
10146                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
10147
10148                         if (!fp)
10149                                 ereport(ERROR,
10150                                                 (errcode_for_file_access(),
10151                                                  errmsg("could not create file \"%s\": %m",
10152                                                                 BACKUP_LABEL_FILE)));
10153                         if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
10154                                 fflush(fp) != 0 ||
10155                                 pg_fsync(fileno(fp)) != 0 ||
10156                                 ferror(fp) ||
10157                                 FreeFile(fp))
10158                                 ereport(ERROR,
10159                                                 (errcode_for_file_access(),
10160                                                  errmsg("could not write file \"%s\": %m",
10161                                                                 BACKUP_LABEL_FILE)));
10162                         pfree(labelfbuf.data);
10163                 }
10164                 else
10165                         *labelfile = labelfbuf.data;
10166         }
10167         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10168
10169         /*
10170          * We're done.  As a convenience, return the starting WAL location.
10171          */
10172         if (starttli_p)
10173                 *starttli_p = starttli;
10174         return startpoint;
10175 }
10176
10177 /* Error cleanup callback for pg_start_backup */
10178 static void
10179 pg_start_backup_callback(int code, Datum arg)
10180 {
10181         bool            exclusive = DatumGetBool(arg);
10182
10183         /* Update backup counters and forcePageWrites on failure */
10184         WALInsertSlotAcquire(true);
10185         if (exclusive)
10186         {
10187                 Assert(XLogCtl->Insert.exclusiveBackup);
10188                 XLogCtl->Insert.exclusiveBackup = false;
10189         }
10190         else
10191         {
10192                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10193                 XLogCtl->Insert.nonExclusiveBackups--;
10194         }
10195
10196         if (!XLogCtl->Insert.exclusiveBackup &&
10197                 XLogCtl->Insert.nonExclusiveBackups == 0)
10198         {
10199                 XLogCtl->Insert.forcePageWrites = false;
10200         }
10201         WALInsertSlotRelease();
10202 }
10203
10204 /*
10205  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
10206  * function.
10207
10208  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
10209  * the non-exclusive backup specified by 'labelfile'.
10210  *
10211  * Returns the last WAL position that must be present to restore from this
10212  * backup, and the corresponding timeline ID in *stoptli_p.
10213  *
10214  * It is the responsibility of the caller of this function to verify the
10215  * permissions of the calling user!
10216  */
10217 XLogRecPtr
10218 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
10219 {
10220         bool            exclusive = (labelfile == NULL);
10221         bool            backup_started_in_recovery = false;
10222         XLogRecPtr      startpoint;
10223         XLogRecPtr      stoppoint;
10224         TimeLineID      stoptli;
10225         XLogRecData rdata;
10226         pg_time_t       stamp_time;
10227         char            strfbuf[128];
10228         char            histfilepath[MAXPGPATH];
10229         char            startxlogfilename[MAXFNAMELEN];
10230         char            stopxlogfilename[MAXFNAMELEN];
10231         char            lastxlogfilename[MAXFNAMELEN];
10232         char            histfilename[MAXFNAMELEN];
10233         char            backupfrom[20];
10234         XLogSegNo       _logSegNo;
10235         FILE       *lfp;
10236         FILE       *fp;
10237         char            ch;
10238         int                     seconds_before_warning;
10239         int                     waits = 0;
10240         bool            reported_waiting = false;
10241         char       *remaining;
10242         char       *ptr;
10243         uint32          hi,
10244                                 lo;
10245
10246         backup_started_in_recovery = RecoveryInProgress();
10247
10248         /*
10249          * Currently only non-exclusive backup can be taken during recovery.
10250          */
10251         if (backup_started_in_recovery && exclusive)
10252                 ereport(ERROR,
10253                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10254                                  errmsg("recovery is in progress"),
10255                                  errhint("WAL control functions cannot be executed during recovery.")));
10256
10257         /*
10258          * During recovery, we don't need to check WAL level. Because, if WAL
10259          * level is not sufficient, it's impossible to get here during recovery.
10260          */
10261         if (!backup_started_in_recovery && !XLogIsNeeded())
10262                 ereport(ERROR,
10263                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10264                           errmsg("WAL level not sufficient for making an online backup"),
10265                                  errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
10266
10267         /*
10268          * OK to update backup counters and forcePageWrites
10269          */
10270         WALInsertSlotAcquire(true);
10271         if (exclusive)
10272                 XLogCtl->Insert.exclusiveBackup = false;
10273         else
10274         {
10275                 /*
10276                  * The user-visible pg_start/stop_backup() functions that operate on
10277                  * exclusive backups can be called at any time, but for non-exclusive
10278                  * backups, it is expected that each do_pg_start_backup() call is
10279                  * matched by exactly one do_pg_stop_backup() call.
10280                  */
10281                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10282                 XLogCtl->Insert.nonExclusiveBackups--;
10283         }
10284
10285         if (!XLogCtl->Insert.exclusiveBackup &&
10286                 XLogCtl->Insert.nonExclusiveBackups == 0)
10287         {
10288                 XLogCtl->Insert.forcePageWrites = false;
10289         }
10290         WALInsertSlotRelease();
10291
10292         if (exclusive)
10293         {
10294                 /*
10295                  * Read the existing label file into memory.
10296                  */
10297                 struct stat statbuf;
10298                 int                     r;
10299
10300                 if (stat(BACKUP_LABEL_FILE, &statbuf))
10301                 {
10302                         if (errno != ENOENT)
10303                                 ereport(ERROR,
10304                                                 (errcode_for_file_access(),
10305                                                  errmsg("could not stat file \"%s\": %m",
10306                                                                 BACKUP_LABEL_FILE)));
10307                         ereport(ERROR,
10308                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10309                                          errmsg("a backup is not in progress")));
10310                 }
10311
10312                 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10313                 if (!lfp)
10314                 {
10315                         ereport(ERROR,
10316                                         (errcode_for_file_access(),
10317                                          errmsg("could not read file \"%s\": %m",
10318                                                         BACKUP_LABEL_FILE)));
10319                 }
10320                 labelfile = palloc(statbuf.st_size + 1);
10321                 r = fread(labelfile, statbuf.st_size, 1, lfp);
10322                 labelfile[statbuf.st_size] = '\0';
10323
10324                 /*
10325                  * Close and remove the backup label file
10326                  */
10327                 if (r != 1 || ferror(lfp) || FreeFile(lfp))
10328                         ereport(ERROR,
10329                                         (errcode_for_file_access(),
10330                                          errmsg("could not read file \"%s\": %m",
10331                                                         BACKUP_LABEL_FILE)));
10332                 if (unlink(BACKUP_LABEL_FILE) != 0)
10333                         ereport(ERROR,
10334                                         (errcode_for_file_access(),
10335                                          errmsg("could not remove file \"%s\": %m",
10336                                                         BACKUP_LABEL_FILE)));
10337         }
10338
10339         /*
10340          * Read and parse the START WAL LOCATION line (this code is pretty crude,
10341          * but we are not expecting any variability in the file format).
10342          */
10343         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
10344                            &hi, &lo, startxlogfilename,
10345                            &ch) != 4 || ch != '\n')
10346                 ereport(ERROR,
10347                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10348                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10349         startpoint = ((uint64) hi) << 32 | lo;
10350         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
10351
10352         /*
10353          * Parse the BACKUP FROM line. If we are taking an online backup from the
10354          * standby, we confirm that the standby has not been promoted during the
10355          * backup.
10356          */
10357         ptr = strstr(remaining, "BACKUP FROM:");
10358         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
10359                 ereport(ERROR,
10360                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10361                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10362         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
10363                 ereport(ERROR,
10364                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10365                                  errmsg("the standby was promoted during online backup"),
10366                                  errhint("This means that the backup being taken is corrupt "
10367                                                  "and should not be used. "
10368                                                  "Try taking another online backup.")));
10369
10370         /*
10371          * During recovery, we don't write an end-of-backup record. We assume that
10372          * pg_control was backed up last and its minimum recovery point can be
10373          * available as the backup end location. Since we don't have an
10374          * end-of-backup record, we use the pg_control value to check whether
10375          * we've reached the end of backup when starting recovery from this
10376          * backup. We have no way of checking if pg_control wasn't backed up last
10377          * however.
10378          *
10379          * We don't force a switch to new WAL file and wait for all the required
10380          * files to be archived. This is okay if we use the backup to start the
10381          * standby. But, if it's for an archive recovery, to ensure all the
10382          * required files are available, a user should wait for them to be
10383          * archived, or include them into the backup.
10384          *
10385          * We return the current minimum recovery point as the backup end
10386          * location. Note that it can be greater than the exact backup end
10387          * location if the minimum recovery point is updated after the backup of
10388          * pg_control. This is harmless for current uses.
10389          *
10390          * XXX currently a backup history file is for informational and debug
10391          * purposes only. It's not essential for an online backup. Furthermore,
10392          * even if it's created, it will not be archived during recovery because
10393          * an archiver is not invoked. So it doesn't seem worthwhile to write a
10394          * backup history file during recovery.
10395          */
10396         if (backup_started_in_recovery)
10397         {
10398                 /* use volatile pointer to prevent code rearrangement */
10399                 volatile XLogCtlData *xlogctl = XLogCtl;
10400                 XLogRecPtr      recptr;
10401
10402                 /*
10403                  * Check to see if all WAL replayed during online backup contain
10404                  * full-page writes.
10405                  */
10406                 SpinLockAcquire(&xlogctl->info_lck);
10407                 recptr = xlogctl->lastFpwDisableRecPtr;
10408                 SpinLockRelease(&xlogctl->info_lck);
10409
10410                 if (startpoint <= recptr)
10411                         ereport(ERROR,
10412                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10413                            errmsg("WAL generated with full_page_writes=off was replayed "
10414                                           "during online backup"),
10415                          errhint("This means that the backup being taken on the standby "
10416                                          "is corrupt and should not be used. "
10417                                  "Enable full_page_writes and run CHECKPOINT on the master, "
10418                                          "and then try an online backup again.")));
10419
10420
10421                 LWLockAcquire(ControlFileLock, LW_SHARED);
10422                 stoppoint = ControlFile->minRecoveryPoint;
10423                 stoptli = ControlFile->minRecoveryPointTLI;
10424                 LWLockRelease(ControlFileLock);
10425
10426                 if (stoptli_p)
10427                         *stoptli_p = stoptli;
10428                 return stoppoint;
10429         }
10430
10431         /*
10432          * Write the backup-end xlog record
10433          */
10434         rdata.data = (char *) (&startpoint);
10435         rdata.len = sizeof(startpoint);
10436         rdata.buffer = InvalidBuffer;
10437         rdata.next = NULL;
10438         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
10439         stoptli = ThisTimeLineID;
10440
10441         /*
10442          * Force a switch to a new xlog segment file, so that the backup is valid
10443          * as soon as archiver moves out the current segment file.
10444          */
10445         RequestXLogSwitch();
10446
10447         XLByteToPrevSeg(stoppoint, _logSegNo);
10448         XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
10449
10450         /* Use the log timezone here, not the session timezone */
10451         stamp_time = (pg_time_t) time(NULL);
10452         pg_strftime(strfbuf, sizeof(strfbuf),
10453                                 "%Y-%m-%d %H:%M:%S %Z",
10454                                 pg_localtime(&stamp_time, log_timezone));
10455
10456         /*
10457          * Write the backup history file
10458          */
10459         XLByteToSeg(startpoint, _logSegNo);
10460         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
10461                                                   (uint32) (startpoint % XLogSegSize));
10462         fp = AllocateFile(histfilepath, "w");
10463         if (!fp)
10464                 ereport(ERROR,
10465                                 (errcode_for_file_access(),
10466                                  errmsg("could not create file \"%s\": %m",
10467                                                 histfilepath)));
10468         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
10469                 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
10470         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
10471                         (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
10472         /* transfer remaining lines from label to history file */
10473         fprintf(fp, "%s", remaining);
10474         fprintf(fp, "STOP TIME: %s\n", strfbuf);
10475         if (fflush(fp) || ferror(fp) || FreeFile(fp))
10476                 ereport(ERROR,
10477                                 (errcode_for_file_access(),
10478                                  errmsg("could not write file \"%s\": %m",
10479                                                 histfilepath)));
10480
10481         /*
10482          * Clean out any no-longer-needed history files.  As a side effect, this
10483          * will post a .ready file for the newly created history file, notifying
10484          * the archiver that history file may be archived immediately.
10485          */
10486         CleanupBackupHistory();
10487
10488         /*
10489          * If archiving is enabled, wait for all the required WAL files to be
10490          * archived before returning. If archiving isn't enabled, the required WAL
10491          * needs to be transported via streaming replication (hopefully with
10492          * wal_keep_segments set high enough), or some more exotic mechanism like
10493          * polling and copying files from pg_xlog with script. We have no
10494          * knowledge of those mechanisms, so it's up to the user to ensure that he
10495          * gets all the required WAL.
10496          *
10497          * We wait until both the last WAL file filled during backup and the
10498          * history file have been archived, and assume that the alphabetic sorting
10499          * property of the WAL files ensures any earlier WAL files are safely
10500          * archived as well.
10501          *
10502          * We wait forever, since archive_command is supposed to work and we
10503          * assume the admin wanted his backup to work completely. If you don't
10504          * wish to wait, you can set statement_timeout.  Also, some notices are
10505          * issued to clue in anyone who might be doing this interactively.
10506          */
10507         if (waitforarchive && XLogArchivingActive())
10508         {
10509                 XLByteToPrevSeg(stoppoint, _logSegNo);
10510                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
10511
10512                 XLByteToSeg(startpoint, _logSegNo);
10513                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
10514                                                           (uint32) (startpoint % XLogSegSize));
10515
10516                 seconds_before_warning = 60;
10517                 waits = 0;
10518
10519                 while (XLogArchiveIsBusy(lastxlogfilename) ||
10520                            XLogArchiveIsBusy(histfilename))
10521                 {
10522                         CHECK_FOR_INTERRUPTS();
10523
10524                         if (!reported_waiting && waits > 5)
10525                         {
10526                                 ereport(NOTICE,
10527                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
10528                                 reported_waiting = true;
10529                         }
10530
10531                         pg_usleep(1000000L);
10532
10533                         if (++waits >= seconds_before_warning)
10534                         {
10535                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
10536                                 ereport(WARNING,
10537                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
10538                                                                 waits),
10539                                                  errhint("Check that your archive_command is executing properly.  "
10540                                                                  "pg_stop_backup can be canceled safely, "
10541                                                                  "but the database backup will not be usable without all the WAL segments.")));
10542                         }
10543                 }
10544
10545                 ereport(NOTICE,
10546                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
10547         }
10548         else if (waitforarchive)
10549                 ereport(NOTICE,
10550                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
10551
10552         /*
10553          * We're done.  As a convenience, return the ending WAL location.
10554          */
10555         if (stoptli_p)
10556                 *stoptli_p = stoptli;
10557         return stoppoint;
10558 }
10559
10560
10561 /*
10562  * do_pg_abort_backup: abort a running backup
10563  *
10564  * This does just the most basic steps of do_pg_stop_backup(), by taking the
10565  * system out of backup mode, thus making it a lot more safe to call from
10566  * an error handler.
10567  *
10568  * NB: This is only for aborting a non-exclusive backup that doesn't write
10569  * backup_label. A backup started with pg_stop_backup() needs to be finished
10570  * with pg_stop_backup().
10571  */
10572 void
10573 do_pg_abort_backup(void)
10574 {
10575         WALInsertSlotAcquire(true);
10576         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10577         XLogCtl->Insert.nonExclusiveBackups--;
10578
10579         if (!XLogCtl->Insert.exclusiveBackup &&
10580                 XLogCtl->Insert.nonExclusiveBackups == 0)
10581         {
10582                 XLogCtl->Insert.forcePageWrites = false;
10583         }
10584         WALInsertSlotRelease();
10585 }
10586
10587 /*
10588  * Get latest redo apply position.
10589  *
10590  * Exported to allow WALReceiver to read the pointer directly.
10591  */
10592 XLogRecPtr
10593 GetXLogReplayRecPtr(TimeLineID *replayTLI)
10594 {
10595         /* use volatile pointer to prevent code rearrangement */
10596         volatile XLogCtlData *xlogctl = XLogCtl;
10597         XLogRecPtr      recptr;
10598         TimeLineID      tli;
10599
10600         SpinLockAcquire(&xlogctl->info_lck);
10601         recptr = xlogctl->lastReplayedEndRecPtr;
10602         tli = xlogctl->lastReplayedTLI;
10603         SpinLockRelease(&xlogctl->info_lck);
10604
10605         if (replayTLI)
10606                 *replayTLI = tli;
10607         return recptr;
10608 }
10609
10610 /*
10611  * Get latest WAL insert pointer
10612  */
10613 XLogRecPtr
10614 GetXLogInsertRecPtr(void)
10615 {
10616         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
10617         uint64          current_bytepos;
10618
10619         SpinLockAcquire(&Insert->insertpos_lck);
10620         current_bytepos = Insert->CurrBytePos;
10621         SpinLockRelease(&Insert->insertpos_lck);
10622
10623         return XLogBytePosToRecPtr(current_bytepos);
10624 }
10625
10626 /*
10627  * Get latest WAL write pointer
10628  */
10629 XLogRecPtr
10630 GetXLogWriteRecPtr(void)
10631 {
10632         {
10633                 /* use volatile pointer to prevent code rearrangement */
10634                 volatile XLogCtlData *xlogctl = XLogCtl;
10635
10636                 SpinLockAcquire(&xlogctl->info_lck);
10637                 LogwrtResult = xlogctl->LogwrtResult;
10638                 SpinLockRelease(&xlogctl->info_lck);
10639         }
10640
10641         return LogwrtResult.Write;
10642 }
10643
10644 /*
10645  * Returns the redo pointer of the last checkpoint or restartpoint. This is
10646  * the oldest point in WAL that we still need, if we have to restart recovery.
10647  */
10648 void
10649 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
10650 {
10651         LWLockAcquire(ControlFileLock, LW_SHARED);
10652         *oldrecptr = ControlFile->checkPointCopy.redo;
10653         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
10654         LWLockRelease(ControlFileLock);
10655 }
10656
10657 /*
10658  * read_backup_label: check to see if a backup_label file is present
10659  *
10660  * If we see a backup_label during recovery, we assume that we are recovering
10661  * from a backup dump file, and we therefore roll forward from the checkpoint
10662  * identified by the label file, NOT what pg_control says.      This avoids the
10663  * problem that pg_control might have been archived one or more checkpoints
10664  * later than the start of the dump, and so if we rely on it as the start
10665  * point, we will fail to restore a consistent database state.
10666  *
10667  * Returns TRUE if a backup_label was found (and fills the checkpoint
10668  * location and its REDO location into *checkPointLoc and RedoStartLSN,
10669  * respectively); returns FALSE if not. If this backup_label came from a
10670  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
10671  * was created during recovery, *backupFromStandby is set to TRUE.
10672  */
10673 static bool
10674 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
10675                                   bool *backupFromStandby)
10676 {
10677         char            startxlogfilename[MAXFNAMELEN];
10678         TimeLineID      tli;
10679         FILE       *lfp;
10680         char            ch;
10681         char            backuptype[20];
10682         char            backupfrom[20];
10683         uint32          hi,
10684                                 lo;
10685
10686         *backupEndRequired = false;
10687         *backupFromStandby = false;
10688
10689         /*
10690          * See if label file is present
10691          */
10692         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10693         if (!lfp)
10694         {
10695                 if (errno != ENOENT)
10696                         ereport(FATAL,
10697                                         (errcode_for_file_access(),
10698                                          errmsg("could not read file \"%s\": %m",
10699                                                         BACKUP_LABEL_FILE)));
10700                 return false;                   /* it's not there, all is fine */
10701         }
10702
10703         /*
10704          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
10705          * is pretty crude, but we are not expecting any variability in the file
10706          * format).
10707          */
10708         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
10709                            &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
10710                 ereport(FATAL,
10711                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10712                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10713         RedoStartLSN = ((uint64) hi) << 32 | lo;
10714         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
10715                            &hi, &lo, &ch) != 3 || ch != '\n')
10716                 ereport(FATAL,
10717                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10718                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10719         *checkPointLoc = ((uint64) hi) << 32 | lo;
10720
10721         /*
10722          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
10723          * from an older backup anyway, but since the information on it is not
10724          * strictly required, don't error out if it's missing for some reason.
10725          */
10726         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
10727         {
10728                 if (strcmp(backuptype, "streamed") == 0)
10729                         *backupEndRequired = true;
10730         }
10731
10732         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
10733         {
10734                 if (strcmp(backupfrom, "standby") == 0)
10735                         *backupFromStandby = true;
10736         }
10737
10738         if (ferror(lfp) || FreeFile(lfp))
10739                 ereport(FATAL,
10740                                 (errcode_for_file_access(),
10741                                  errmsg("could not read file \"%s\": %m",
10742                                                 BACKUP_LABEL_FILE)));
10743
10744         return true;
10745 }
10746
10747 /*
10748  * Error context callback for errors occurring during rm_redo().
10749  */
10750 static void
10751 rm_redo_error_callback(void *arg)
10752 {
10753         XLogRecord *record = (XLogRecord *) arg;
10754         StringInfoData buf;
10755
10756         initStringInfo(&buf);
10757         RmgrTable[record->xl_rmid].rm_desc(&buf,
10758                                                                            record->xl_info,
10759                                                                            XLogRecGetData(record));
10760
10761         /* don't bother emitting empty description */
10762         if (buf.len > 0)
10763                 errcontext("xlog redo %s", buf.data);
10764
10765         pfree(buf.data);
10766 }
10767
10768 /*
10769  * BackupInProgress: check if online backup mode is active
10770  *
10771  * This is done by checking for existence of the "backup_label" file.
10772  */
10773 bool
10774 BackupInProgress(void)
10775 {
10776         struct stat stat_buf;
10777
10778         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
10779 }
10780
10781 /*
10782  * CancelBackup: rename the "backup_label" file to cancel backup mode
10783  *
10784  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
10785  * Note that this will render an online backup in progress useless.
10786  * To correctly finish an online backup, pg_stop_backup must be called.
       *
       * Nothing is done (and nothing is logged) when stat() on the label file
       * fails.  A successful rename is reported at LOG level; a failed rename is
       * reported at WARNING level together with the system error text.
10787  */
10788 void
10789 CancelBackup(void)
10790 {
10791         struct stat stat_buf;
10792
10793         /* if the file is not there, return */
10794         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
10795                 return;
10796
10797         /* remove leftover file from previously canceled backup if it exists */
              /* NOTE(review): the unlink() result is not checked. */
10798         unlink(BACKUP_LABEL_OLD);
10799
              /*
               * NOTE(review): the file could disappear between the stat() above and
               * this rename(); that case ends up in the WARNING branch below.
               */
10800         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
10801         {
10802                 ereport(LOG,
10803                                 (errmsg("online backup mode canceled"),
10804                                  errdetail("\"%s\" was renamed to \"%s\".",
10805                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10806         }
10807         else
10808         {
10809                 ereport(WARNING,
10810                                 (errcode_for_file_access(),
10811                                  errmsg("online backup mode was not canceled"),
10812                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
10813                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10814         }
10815 }
10816
10817 /*
10818  * Read the XLOG page starting at targetPagePtr into readBuf.
10819  * Returns readLen if the page is read successfully, or -1 in case of
10820  * errors.  readLen is XLOG_BLCKSZ, except for a page of a segment that is
10821  * being streamed, where it is the part of the page received so far.
       * When errors occur, they are ereport'ed, but only if they have not been
       * previously reported.
       *
       * xlogreader->private_data must point to an XLogPageReadPrivate; its
       * emode, randAccess and fetching_ckpt fields are used.  At least reqLen
       * bytes of the page are wanted.  targetRecPtr is passed on to
       * WaitForWALToBecomeAvailable() as its 'tliRecPtr' argument.  On success,
       * *readTLI is set to curFileTLI.
10822  *
10823  * This is responsible for restoring files from archive as needed, as well
10824  * as for waiting for the requested WAL record to arrive in standby mode.
10825  *
10826  * 'emode' (from the XLogPageReadPrivate) specifies the log level used for
10827  * reporting "file not found" or "end of WAL" situations in archive recovery,
10828  * or in standby mode when a trigger file is found. If set to WARNING or
10829  * below, XLogPageRead() returns -1 in those situations, on higher log levels
10830  * the ereport() won't return.
10831  *
10832  * In standby mode, if after a successful return of XLogPageRead() the
10833  * caller finds the record it's interested in to be broken, it should
10834  * ereport the error with the level determined by
10835  * emode_for_corrupt_record(), and then set lastSourceFailed
10836  * and call XLogPageRead() again with the same arguments. This lets
10837  * XLogPageRead() try fetching the record from another source, or
10838  * sleep and retry.
10839  */
10840 static int
10841 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
10842                          XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
10843 {
10844         XLogPageReadPrivate *private =
10845         (XLogPageReadPrivate *) xlogreader->private_data;
10846         int                     emode = private->emode;
10847         uint32          targetPageOff;
10848         XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
10849
              /* targetSegNo is only consulted by the Assert near the end. */
10850         XLByteToSeg(targetPagePtr, targetSegNo);
10851         targetPageOff = targetPagePtr % XLogSegSize;
10852
10853         /*
10854          * See if we need to switch to a new segment because the requested record
10855          * is not in the currently open one.
10856          */
10857         if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
10858         {
10859                 /*
10860                  * Request a restartpoint if we've replayed too much xlog since the
10861                  * last one.
10862                  */
10863                 if (StandbyModeRequested && bgwriterLaunched)
10864                 {
10865                         if (XLogCheckpointNeeded(readSegNo))
10866                         {
                                      /*
                                       * The result is discarded; presumably the call refreshes
                                       * state that XLogCheckpointNeeded() looks at, hence the
                                       * re-check -- TODO confirm.
                                       */
10867                                 (void) GetRedoRecPtr();
10868                                 if (XLogCheckpointNeeded(readSegNo))
10869                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
10870                         }
10871                 }
10872
10873                 close(readFile);
10874                 readFile = -1;
10875                 readSource = 0;
10876         }
10877
              /* From here on readSegNo names the segment holding the target page. */
10878         XLByteToSeg(targetPagePtr, readSegNo);
10879
10880 retry:
10881         /* See if we need to retrieve more data */
10882         if (readFile < 0 ||
10883                 (readSource == XLOG_FROM_STREAM &&
10884                  receivedUpto < targetPagePtr + reqLen))
10885         {
10886                 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
10887                                                                                  private->randAccess,
10888                                                                                  private->fetching_ckpt,
10889                                                                                  targetRecPtr))
10890                 {
                              /* No WAL to be had: drop any open file and report failure. */
10891                         if (readFile >= 0)
10892                                 close(readFile);
10893                         readFile = -1;
10894                         readLen = 0;
10895                         readSource = 0;
10896
10897                         return -1;
10898                 }
10899         }
10900
10901         /*
10902          * At this point, we have the right segment open and if we're streaming we
10903          * know the requested record is in it.
10904          */
10905         Assert(readFile != -1);
10906
10907         /*
10908          * If the current segment is being streamed from master, calculate how
10909          * much of the current page we have received already. We know the
10910          * requested record has been received, but this is for the benefit of
10911          * future calls, to allow quick exit at the top of this function.
10912          */
10913         if (readSource == XLOG_FROM_STREAM)
10914         {
10915                 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
10916                         readLen = XLOG_BLCKSZ;
10917                 else
10918                         readLen = receivedUpto % XLogSegSize - targetPageOff;
10919         }
10920         else
10921                 readLen = XLOG_BLCKSZ;
10922
10923         /* Read the requested page */
10924         readOff = targetPageOff;
10925         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
10926         {
10927                 char            fname[MAXFNAMELEN];
10928
10929                 XLogFileName(fname, curFileTLI, readSegNo);
10930                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10931                                 (errcode_for_file_access(),
10932                                  errmsg("could not seek in log segment %s to offset %u: %m",
10933                                                 fname, readOff)));
10934                 goto next_record_is_invalid;
10935         }
10936
              /*
               * A full XLOG_BLCKSZ is always read, whatever readLen was set to above.
               *
               * NOTE(review): a short read (result >= 0 but < XLOG_BLCKSZ) does not
               * set errno, so the %m in the message below can show a stale or
               * unrelated error in that case.  Consider reporting the byte count
               * instead when the result is non-negative.
               */
10937         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
10938         {
10939                 char            fname[MAXFNAMELEN];
10940
10941                 XLogFileName(fname, curFileTLI, readSegNo);
10942                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10943                                 (errcode_for_file_access(),
10944                                  errmsg("could not read from log segment %s, offset %u: %m",
10945                                                 fname, readOff)));
10946                 goto next_record_is_invalid;
10947         }
10948
10949         Assert(targetSegNo == readSegNo);
10950         Assert(targetPageOff == readOff);
10951         Assert(reqLen <= readLen);
10952
10953         *readTLI = curFileTLI;
10954         return readLen;
10955
              /*
               * Reached after a failed lseek() or read(): mark the current source as
               * failed and forget the open file, so that the next attempt goes
               * through WaitForWALToBecomeAvailable() again.
               */
10956 next_record_is_invalid:
10957         lastSourceFailed = true;
10958
10959         if (readFile >= 0)
10960                 close(readFile);
10961         readFile = -1;
10962         readLen = 0;
10963         readSource = 0;
10964
10965         /* In standby-mode, keep trying */
10966         if (StandbyMode)
10967                 goto retry;
10968         else
10969                 return -1;
10970 }
10971
10972 /*
10973  * Open the WAL segment containing WAL position 'RecPtr'.
10974  *
10975  * The segment can be fetched via restore_command, or via walreceiver having
10976  * streamed the record, or it can already be present in pg_xlog. Checking
10977  * pg_xlog is mainly for crash recovery, but it will be polled in standby mode
10978  * too, in case someone copies a new segment directly to pg_xlog. That is not
10979  * documented or recommended, though.
10980  *
10981  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
10982  * prepare to read WAL starting from RedoStartLSN after this.
10983  *
10984  * 'RecPtr' might not point to the beginning of the record we're interested
10985  * in, it might also point to the page or segment header. In that case,
10986  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
10987  * used to decide which timeline to stream the requested WAL from.
10988  *
       * If 'randAccess' is true, curFileTLI is reset to 0 before each attempt to
       * open the segment from the archive or pg_xlog.
       *
       * The segment that gets opened is the one named by the file-level variable
       * readSegNo, not one derived from 'RecPtr' here; XLogPageRead() sets
       * readSegNo before calling.  The opened descriptor is left in readFile.
       *
10989  * If the record is not immediately available, the function returns false
10990  * if we're not in standby mode. In standby mode, waits for it to become
10991  * available.
10992  *
10993  * When the requested record becomes available, the function opens the file
10994  * containing it (if not open already), and returns true. When end of standby
10995  * mode is triggered by the user, and there is no more WAL available, returns
10996  * false.
10997  */
10998 static bool
10999 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
11000                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
11001 {
              /* Time of the last pass through the "sleep" step; kept across calls. */
11002         static pg_time_t last_fail_time = 0;
11003         pg_time_t       now;
11004
11005         /*-------
11006          * Standby mode is implemented by a state machine:
11007          *
11008          * 1. Read from archive (XLOG_FROM_ARCHIVE)
11009          * 2. Read from pg_xlog (XLOG_FROM_PG_XLOG)
11010          * 3. Check trigger file
11011          * 4. Read from primary server via walreceiver (XLOG_FROM_STREAM)
11012          * 5. Rescan timelines
11013          * 6. Sleep 5 seconds, and loop back to 1.
11014          *
11015          * Failure to read from the current source advances the state machine to
11016          * the next state. In addition, successfully reading a file from pg_xlog
11017          * moves the state machine from state 2 back to state 1 (we always prefer
11018          * files in the archive over files in pg_xlog).
11019          *
11020          * 'currentSource' indicates the current state. There are no currentSource
11021          * values for "check trigger", "rescan timelines", and "sleep" states,
11022          * those actions are taken when reading from the previous source fails, as
11023          * part of advancing to the next state.
11024          *-------
11025          */
11026         if (!InArchiveRecovery)
11027                 currentSource = XLOG_FROM_PG_XLOG;
11028         else if (currentSource == 0)
11029                 currentSource = XLOG_FROM_ARCHIVE;
11030
11031         for (;;)
11032         {
11033                 int                     oldSource = currentSource;
11034
11035                 /*
11036                  * First check if we failed to read from the current source, and
11037                  * advance the state machine if so. The failure to read might've
11038                  * happened outside this function, e.g when a CRC check fails on a
11039                  * record, or within this loop.
11040                  */
11041                 if (lastSourceFailed)
11042                 {
11043                         switch (currentSource)
11044                         {
11045                                 case XLOG_FROM_ARCHIVE:
11046                                         currentSource = XLOG_FROM_PG_XLOG;
11047                                         break;
11048
11049                                 case XLOG_FROM_PG_XLOG:
11050
11051                                         /*
11052                                          * Check to see if the trigger file exists. Note that we
11053                                          * do this only after failure, so when you create the
11054                                          * trigger file, we still finish replaying as much as we
11055                                          * can from archive and pg_xlog before failover.
11056                                          */
11057                                         if (StandbyMode && CheckForStandbyTrigger())
11058                                         {
11059                                                 ShutdownWalRcv();
11060                                                 return false;
11061                                         }
11062
11063                                         /*
11064                                          * Not in standby mode, and we've now tried the archive
11065                                          * and pg_xlog.
11066                                          */
11067                                         if (!StandbyMode)
11068                                                 return false;
11069
11070                                         /*
11071                                          * If primary_conninfo is set, launch walreceiver to try
11072                                          * to stream the missing WAL.
11073                                          *
11074                                          * If fetching_ckpt is TRUE, RecPtr points to the initial
11075                                          * checkpoint location. In that case, we use RedoStartLSN
11076                                          * as the streaming start position instead of RecPtr, so
11077                                          * that when we later jump backwards to start redo at
11078                                          * RedoStartLSN, we will have the logs streamed already.
11079                                          */
11080                                         if (PrimaryConnInfo)
11081                                         {
11082                                                 XLogRecPtr      ptr;
11083                                                 TimeLineID      tli;
11084
11085                                                 if (fetching_ckpt)
11086                                                 {
11087                                                         ptr = RedoStartLSN;
11088                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
11089                                                 }
11090                                                 else
11091                                                 {
11092                                                         ptr = tliRecPtr;
11093                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
11094
                                                              /*
                                                               * Refuse to stream from a timeline older than
                                                               * the one the last WAL file was read from.
                                                               */
11095                                                         if (curFileTLI > 0 && tli < curFileTLI)
11096                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
11097                                                                          (uint32) (ptr >> 32), (uint32) ptr,
11098                                                                          tli, curFileTLI);
11099                                                 }
11100                                                 curFileTLI = tli;
11101                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
11102                                                                                          PrimarySlotName);
                                                      /* Nothing received yet from the new stream. */
11103                                                 receivedUpto = 0;
11104                                         }
11105
11106                                         /*
11107                                          * Move to XLOG_FROM_STREAM state in either case. We'll
11108                                          * get immediate failure if we didn't launch walreceiver,
11109                                          * and move on to the next state.
11110                                          */
11111                                         currentSource = XLOG_FROM_STREAM;
11112                                         break;
11113
11114                                 case XLOG_FROM_STREAM:
11115
11116                                         /*
11117                                          * Failure while streaming. Most likely, we got here
11118                                          * because streaming replication was terminated, or
11119                                          * promotion was triggered. But we also get here if we
11120                                          * find an invalid record in the WAL streamed from master,
11121                                          * in which case something is seriously wrong. There's
11122                                          * little chance that the problem will just go away, but
11123                                          * PANIC is not good for availability either, especially
11124                                          * in hot standby mode. So, we treat that the same as
11125                                          * disconnection, and retry from archive/pg_xlog again.
11126                                          * The WAL in the archive should be identical to what was
11127                                          * streamed, so it's unlikely that it helps, but one can
11128                                          * hope...
11129                                          */
11130
11131                                         /*
11132                                          * Before we leave XLOG_FROM_STREAM state, make sure that
11133                                          * walreceiver is not active, so that it won't overwrite
11134                                          * WAL that we restore from archive.
11135                                          */
11136                                         if (WalRcvStreaming())
11137                                                 ShutdownWalRcv();
11138
11139                                         /*
11140                                          * Before we sleep, re-scan for possible new timelines if
11141                                          * we were requested to recover to the latest timeline.
11142                                          */
11143                                         if (recoveryTargetIsLatest)
11144                                         {
11145                                                 if (rescanLatestTimeLine())
11146                                                 {
                                                              /* Skip the sleep and retry at once. */
11147                                                         currentSource = XLOG_FROM_ARCHIVE;
11148                                                         break;
11149                                                 }
11150                                         }
11151
11152                                         /*
11153                                          * XLOG_FROM_STREAM is the last state in our state
11154                                          * machine, so we've exhausted all the options for
11155                                          * obtaining the requested WAL. We're going to loop back
11156                                          * and retry from the archive, but if it hasn't been long
11157                                          * since last attempt, sleep 5 seconds to avoid
11158                                          * busy-waiting.
                                               *
                                               * NOTE(review): assuming pg_time_t is a signed type, a
                                               * backwards step of the system clock makes
                                               * (now - last_fail_time) negative, and the sleep below
                                               * then lasts longer than 5 seconds.
11159                                          */
11160                                         now = (pg_time_t) time(NULL);
11161                                         if ((now - last_fail_time) < 5)
11162                                         {
11163                                                 pg_usleep(1000000L * (5 - (now - last_fail_time)));
11164                                                 now = (pg_time_t) time(NULL);
11165                                         }
11166                                         last_fail_time = now;
11167                                         currentSource = XLOG_FROM_ARCHIVE;
11168                                         break;
11169
11170                                 default:
11171                                         elog(ERROR, "unexpected WAL source %d", currentSource);
11172                         }
11173                 }
11174                 else if (currentSource == XLOG_FROM_PG_XLOG)
11175                 {
11176                         /*
11177                          * We just successfully read a file in pg_xlog. We prefer files in
11178                          * the archive over ones in pg_xlog, so try the next file again
11179                          * from the archive first.
11180                          */
11181                         if (InArchiveRecovery)
11182                                 currentSource = XLOG_FROM_ARCHIVE;
11183                 }
11184
11185                 if (currentSource != oldSource)
11186                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
11187                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
11188                                  lastSourceFailed ? "failure" : "success");
11189
11190                 /*
11191                  * We've now handled possible failure. Try to read from the chosen
11192                  * source.
11193                  */
11194                 lastSourceFailed = false;
11195
11196                 switch (currentSource)
11197                 {
11198                         case XLOG_FROM_ARCHIVE:
11199                         case XLOG_FROM_PG_XLOG:
11200                                 /* Close any old file we might have open. */
11201                                 if (readFile >= 0)
11202                                 {
11203                                         close(readFile);
11204                                         readFile = -1;
11205                                 }
11206                                 /* Reset curFileTLI if random fetch. */
11207                                 if (randAccess)
11208                                         curFileTLI = 0;
11209
11210                                 /*
11211                                  * Try to restore the file from archive, or read an existing
11212                                  * file from pg_xlog.
11213                                  */
11214                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, currentSource);
11215                                 if (readFile >= 0)
11216                                         return true;    /* success! */
11217
11218                                 /*
11219                                  * Nope, not found in archive or pg_xlog.
11220                                  */
11221                                 lastSourceFailed = true;
11222                                 break;
11223
11224                         case XLOG_FROM_STREAM:
11225                                 {
11226                                         bool            havedata;
11227
11228                                         /*
11229                                          * Check if WAL receiver is still active.
11230                                          */
11231                                         if (!WalRcvStreaming())
11232                                         {
11233                                                 lastSourceFailed = true;
11234                                                 break;
11235                                         }
11236
11237                                         /*
11238                                          * Walreceiver is active, so see if new data has arrived.
11239                                          *
11240                                          * We only advance XLogReceiptTime when we obtain fresh
11241                                          * WAL from walreceiver and observe that we had already
11242                                          * processed everything before the most recent "chunk"
11243                                          * that it flushed to disk.  In steady state where we are
11244                                          * keeping up with the incoming data, XLogReceiptTime will
11245                                          * be updated on each cycle. When we are behind,
11246                                          * XLogReceiptTime will not advance, so the grace time
11247                                          * allotted to conflicting queries will decrease.
11248                                          */
11249                                         if (RecPtr < receivedUpto)
11250                                                 havedata = true;
11251                                         else
11252                                         {
11253                                                 XLogRecPtr      latestChunkStart;
11254
                                                      /*
                                                       * Refresh our view of how far walreceiver has
                                                       * got; data on a timeline other than curFileTLI
                                                       * does not count as available.
                                                       */
11255                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
11256                                                 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
11257                                                 {
11258                                                         havedata = true;
11259                                                         if (latestChunkStart <= RecPtr)
11260                                                         {
11261                                                                 XLogReceiptTime = GetCurrentTimestamp();
11262                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
11263                                                         }
11264                                                 }
11265                                                 else
11266                                                         havedata = false;
11267                                         }
11268                                         if (havedata)
11269                                         {
11270                                                 /*
11271                                                  * Great, streamed far enough.  Open the file if it's
11272                                                  * not open already.  Also read the timeline history
11273                                                  * file if we haven't initialized timeline history
11274                                                  * yet; it should be streamed over and present in
11275                                                  * pg_xlog by now.      Use XLOG_FROM_STREAM so that
11276                                                  * source info is set correctly and XLogReceiptTime
11277                                                  * isn't changed.
                                                       *
                                                       * Note that after opening the file we do not return
                                                       * here but go around the loop once more; the "true"
                                                       * result comes from the else branch on a later pass.
11278                                                  */
11279                                                 if (readFile < 0)
11280                                                 {
11281                                                         if (!expectedTLEs)
11282                                                                 expectedTLEs = readTimeLineHistory(receiveTLI);
11283                                                         readFile = XLogFileRead(readSegNo, PANIC,
11284                                                                                                         receiveTLI,
11285                                                                                                         XLOG_FROM_STREAM, false);
11286                                                         Assert(readFile >= 0);
11287                                                 }
11288                                                 else
11289                                                 {
11290                                                         /* just make sure source info is correct... */
11291                                                         readSource = XLOG_FROM_STREAM;
11292                                                         XLogReceiptSource = XLOG_FROM_STREAM;
11293                                                         return true;
11294                                                 }
11295                                                 break;
11296                                         }
11297
11298                                         /*
11299                                          * Data not here yet. Check for trigger, then wait for
11300                                          * walreceiver to wake us up when new WAL arrives.
11301                                          */
11302                                         if (CheckForStandbyTrigger())
11303                                         {
11304                                                 /*
11305                                                  * Note that we don't "return false" immediately here.
11306                                                  * After being triggered, we still want to replay all
11307                                                  * the WAL that was already streamed. It's in pg_xlog
11308                                                  * now, so we just treat this as a failure, and the
11309                                                  * state machine will move on to replay the streamed
11310                                                  * WAL from pg_xlog, and then recheck the trigger and
11311                                                  * exit replay.
11312                                                  */
11313                                                 lastSourceFailed = true;
11314                                                 break;
11315                                         }
11316
11317                                         /*
11318                                          * Wait for more WAL to arrive. Time out after 5 seconds,
11319                                          * like when polling the archive, to react to a trigger
11320                                          * file promptly.
11321                                          */
11322                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
11323                                                           WL_LATCH_SET | WL_TIMEOUT,
11324                                                           5000L);
11325                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
11326                                         break;
11327                                 }
11328
11329                         default:
11330                                 elog(ERROR, "unexpected WAL source %d", currentSource);
11331                 }
11332
11333                 /*
11334                  * This possibly-long loop needs to handle interrupts of startup
11335                  * process.
11336                  */
11337                 HandleStartupProcInterrupts();
11338         } while (StandbyMode);
              /*
               * NOTE(review): every "break" above belongs to one of the switch
               * statements, so the for (;;) loop is only ever left by a return (or
               * by an error being raised).  The "while (StandbyMode);" that follows
               * the loop's closing brace is therefore a separate, unreachable
               * statement, as is the return below.  Consider reducing that line to
               * a plain closing brace.
               */
11339
11340         return false;
11341 }
11342
11343 /*
11344  * Determine what log level should be used to report a corrupt WAL record
11345  * in the current WAL page, previously read by XLogPageRead().
11346  *
11347  * 'emode' is the error mode that would be used to report a file-not-found
11348  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
11349  * we're retrying the exact same record that we've tried previously, only
11350  * complain the first time to keep the noise down.  However, we only do this
11351  * when reading from pg_xlog, because we don't expect any invalid records in
11352  * archive or in records streamed from master. Files in the archive should be
11353  * complete, and we should never hit the end of WAL because we stop and wait
11354  * for more WAL to arrive before replaying it.
11355  *
       * Concretely: the level is changed only when 'emode' is LOG and readSource
       * is XLOG_FROM_PG_XLOG, and then a repeat of the previous 'RecPtr' yields
       * DEBUG1.  In every other case 'emode' is returned unchanged.
       *
11356  * NOTE: This function remembers the RecPtr value it was last called with,
11357  * to suppress repeated messages about the same record. Only call this when
11358  * you are about to ereport(), or you might cause a later message to be
11359  * erroneously suppressed.
11360  */
11361 static int
11362 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
11363 {
              /* Position most recently complained about at LOG level from pg_xlog. */
11364         static XLogRecPtr lastComplaint = 0;
11365
11366         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
11367         {
11368                 if (RecPtr == lastComplaint)
11369                         emode = DEBUG1;
11370                 else
11371                         lastComplaint = RecPtr;
11372         }
11373         return emode;
11374 }
11375
11376 /*
11377  * Check to see whether the user-specified trigger file exists and whether a
11378  * promote request has arrived.  If either condition holds, return true.
11379  */
11380 static bool
11381 CheckForStandbyTrigger(void)
11382 {
11383         struct stat stat_buf;
11384         static bool triggered = false;
11385
11386         if (triggered)
11387                 return true;
11388
11389         if (IsPromoteTriggered())
11390         {
11391                 /*
11392                  * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
11393                  * signal handler. It now leaves the file in place and lets the
11394                  * Startup process do the unlink. This allows Startup to know whether
11395                  * it should create a full checkpoint before starting up (fallback
11396                  * mode). Fast promotion takes precedence.
11397                  */
11398                 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11399                 {
11400                         unlink(PROMOTE_SIGNAL_FILE);
11401                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11402                         fast_promote = true;
11403                 }
11404                 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11405                 {
11406                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11407                         fast_promote = false;
11408                 }
11409
11410                 ereport(LOG, (errmsg("received promote request")));
11411
11412                 ResetPromoteTriggered();
11413                 triggered = true;
11414                 return true;
11415         }
11416
11417         if (TriggerFile == NULL)
11418                 return false;
11419
11420         if (stat(TriggerFile, &stat_buf) == 0)
11421         {
11422                 ereport(LOG,
11423                                 (errmsg("trigger file found: %s", TriggerFile)));
11424                 unlink(TriggerFile);
11425                 triggered = true;
11426                 fast_promote = true;
11427                 return true;
11428         }
11429         return false;
11430 }
11431
11432 /*
11433  * Check to see if a promote request has arrived. Should be
11434  * called by postmaster after receiving SIGUSR1.
11435  */
11436 bool
11437 CheckPromoteSignal(void)
11438 {
11439         struct stat stat_buf;
11440
11441         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
11442                 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11443                 return true;
11444
11445         return false;
11446 }
11447
11448 /*
11449  * Wake up startup process to replay newly arrived WAL, or to notice that
11450  * failover has been requested.
11451  */
11452 void
11453 WakeupRecovery(void)
11454 {
11455         SetLatch(&XLogCtl->recoveryWakeupLatch);
11456 }
11457
11458 /*
11459  * Update the WalWriterSleeping flag.
11460  */
11461 void
11462 SetWalWriterSleeping(bool sleeping)
11463 {
11464         /* use volatile pointer to prevent code rearrangement */
11465         volatile XLogCtlData *xlogctl = XLogCtl;
11466
11467         SpinLockAcquire(&xlogctl->info_lck);
11468         xlogctl->WalWriterSleeping = sleeping;
11469         SpinLockRelease(&xlogctl->info_lck);
11470 }