granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlog.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <time.h>
  19 #include <fcntl.h>
  20 #include <sys/stat.h>
  21 #include <sys/time.h>
  22 #include <unistd.h>
  23
  24 #include "access/clog.h"
  25 #include "access/multixact.h"
  26 #include "access/subtrans.h"
  27 #include "access/timeline.h"
  28 #include "access/transam.h"
  29 #include "access/tuptoaster.h"
  30 #include "access/twophase.h"
  31 #include "access/xact.h"
  32 #include "access/xlog_internal.h"
  33 #include "access/xlogreader.h"
  34 #include "access/xlogutils.h"
  35 #include "catalog/catversion.h"
  36 #include "catalog/pg_control.h"
  37 #include "catalog/pg_database.h"
  38 #include "miscadmin.h"
  39 #include "pgstat.h"
  40 #include "postmaster/bgwriter.h"
  41 #include "postmaster/startup.h"
  42 #include "replication/walreceiver.h"
  43 #include "replication/walsender.h"
  44 #include "storage/barrier.h"
  45 #include "storage/bufmgr.h"
  46 #include "storage/fd.h"
  47 #include "storage/ipc.h"
  48 #include "storage/latch.h"
  49 #include "storage/pmsignal.h"
  50 #include "storage/predicate.h"
  51 #include "storage/proc.h"
  52 #include "storage/procarray.h"
  53 #include "storage/reinit.h"
  54 #include "storage/smgr.h"
  55 #include "storage/spin.h"
  56 #include "utils/builtins.h"
  57 #include "utils/guc.h"
  58 #include "utils/ps_status.h"
  59 #include "utils/relmapper.h"
  60 #include "utils/snapmgr.h"
  61 #include "utils/timestamp.h"
  62 #include "pg_trace.h"
  63
  64 extern uint32 bootstrap_data_checksum_version;     /* declaration only; not defined in this part of the file */
  65
  66 /* File path names (all relative to $PGDATA) */
  67 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  68 #define RECOVERY_COMMAND_DONE   "recovery.done"
  69 #define PROMOTE_SIGNAL_FILE             "promote"
  70 #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
  71
  72
  73 /* User-settable parameters; the initializers are the values held at program start */
  74 int                     CheckPointSegments = 3;
  75 int                     wal_keep_segments = 0;
  76 int                     XLOGbuffers = -1;       /* NOTE(review): -1 presumably means "choose automatically" -- confirm */
  77 int                     XLogArchiveTimeout = 0;
  78 bool            XLogArchiveMode = false;
  79 char       *XLogArchiveCommand = NULL;
  80 bool            EnableHotStandby = false;
  81 bool            fullPageWrites = true;
  82 bool            wal_log_hints = false;
  83 bool            log_checkpoints = false;
  84 int                     sync_method = DEFAULT_SYNC_METHOD;
  85 int                     wal_level = WAL_LEVEL_MINIMAL;
  86 int                     CommitDelay = 0;        /* precommit delay in microseconds */
  87 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
  88 int                     num_xloginsert_slots = 8;       /* presumably the number of XLogInsertSlots -- confirm */
  89
  90 #ifdef WAL_DEBUG
  91 bool            XLOG_DEBUG = false;     /* exists only in builds that define WAL_DEBUG */
  92 #endif
  93
  94 /*
  95  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
  96  * When we are done with an old XLOG segment file, we will recycle it as a
  97  * future XLOG segment as long as there aren't already XLOGfileslop future
  98  * segments; else we'll delete it.  This could be made a separate GUC
  99  * variable, but at present I think it's sufficient to hardwire it as
 100  * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
 101  * no more than 2*CheckPointSegments log segments, and we want to recycle all
 102  * of them; the +1 allows boundary cases to happen without wasting a
 103  * delete/create-segment cycle.
      *
      * Because this is a macro over the CheckPointSegments variable, it is
      * re-evaluated at every use and so follows any change to that variable.
 104  */
 105 #define XLOGfileslop    (2*CheckPointSegments + 1)
 106
 107
 108 /*
 109  * GUC support
      *
      * Table of the option names for the sync method and the SYNC_METHOD_* code
      * each name maps to.  "fsync" is always present; every other entry is
      * compiled in only when the corresponding feature macro is defined, so the
      * set of names varies by platform.  The list ends with a NULL-name entry.
      *
      * NOTE(review): the third field is false for every entry; its meaning is
      * defined by struct config_enum_entry, which is not visible here.
 110  */
 111 const struct config_enum_entry sync_method_options[] = {
 112         {"fsync", SYNC_METHOD_FSYNC, false},
 113 #ifdef HAVE_FSYNC_WRITETHROUGH
 114         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
 115 #endif
 116 #ifdef HAVE_FDATASYNC
 117         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
 118 #endif
 119 #ifdef OPEN_SYNC_FLAG
 120         {"open_sync", SYNC_METHOD_OPEN, false},
 121 #endif
 122 #ifdef OPEN_DATASYNC_FLAG
 123         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
 124 #endif
 125         {NULL, 0, false}
 126 };
 127
 128 /*
 129  * Statistics for current checkpoint are collected in this global struct.
 130  * Because only the checkpointer process or a stand-alone backend can perform
 131  * checkpoints, this will be unused in normal backends.
      *
      * NOTE(review): this comment previously said "background writer".  It was
      * changed to agree with the CheckpointLock notes later in this file, which
      * say that all checkpoints are done by the checkpointer.  Confirm.
 132  */
 133 CheckpointStatsData CheckpointStats;
 134
 135 /*
 136  * ThisTimeLineID will be same in all backends --- it identifies current
 137  * WAL timeline for the database system.  It starts out as 0 in every process.
 138  */
 139 TimeLineID      ThisTimeLineID = 0;
 140
 141 /*
 142  * Are we doing recovery from XLOG?
 143  *
 144  * This is only ever true in the startup process; it should be read as meaning
 145  * "this process is replaying WAL records", rather than "the system is in
 146  * recovery mode".  It should be examined primarily by functions that need
 147  * to act differently when called from a WAL redo function (e.g., to skip WAL
 148  * logging).  To check whether the system is in recovery regardless of which
 149  * process you're running in, use RecoveryInProgress() but only after shared
 150  * memory startup and lock initialization.
 151  */
 152 bool            InRecovery = false;
 153
 154 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
 155 HotStandbyState standbyState = STANDBY_DISABLED;
 156
 157 static XLogRecPtr LastRec;      /* TODO(review): undocumented; its use is not visible in this part of the file */
 158
 159 /* Local copy of WalRcv->receivedUpto */
 160 static XLogRecPtr receivedUpto = 0;
 161 static TimeLineID receiveTLI = 0;       /* presumably the timeline that receivedUpto belongs to -- confirm */
 162
 163 /*
 164  * During recovery, lastFullPageWrites keeps track of full_page_writes that
 165  * the replayed WAL records indicate. It's initialized with full_page_writes
 166  * that the recovery starting checkpoint record indicates, and then updated
 167  * each time XLOG_FPW_CHANGE record is replayed.
 168  */
 169 static bool lastFullPageWrites;
 170
 171 /*
 172  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 173  * known, need to check the shared state".  Hence the initial value is true.
 174  */
 175 static bool LocalRecoveryInProgress = true;
 176
 177 /*
 178  * Local copy of SharedHotStandbyActive variable. False actually means "not
 179  * known, need to check the shared state".  Hence the initial value is false.
 180  */
 181 static bool LocalHotStandbyActive = false;
 182
 183 /*
 184  * Local state for XLogInsertAllowed():
 185  *              1: unconditionally allowed to insert XLOG
 186  *              0: unconditionally not allowed to insert XLOG
 187  *              -1: must check RecoveryInProgress(); disallow until it is false
 188  * Most processes start with -1 and transition to 1 after seeing that recovery
 189  * is not in progress.  But we can also force the value for special cases.
 190  * The coding in XLogInsertAllowed() depends on the first two of these states
 191  * being numerically the same as bool true and false.
 192  */
 193 static int      LocalXLogInsertAllowed = -1;
 194
 195 /*
 196  * When ArchiveRecoveryRequested is set, archive recovery was requested,
 197  * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
 198  * currently recovering using offline XLOG archives. These variables are only
 199  * valid in the startup process.
 200  *
 201  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 202  * currently performing crash recovery using only XLOG files in pg_xlog, but
 203  * will switch to using offline XLOG archives as soon as we reach the end of
 204  * WAL in pg_xlog.
 205  */
 206 bool            ArchiveRecoveryRequested = false;
 207 bool            InArchiveRecovery = false;
 208
 209 /* Was the last xlog file restored from archive, or local? */
 210 static bool restoredFromArchive = false;
 211
 212 /* options taken from recovery.conf for archive recovery */
 213 char       *recoveryRestoreCommand = NULL;      /* the only one of these options that is not static */
 214 static char *recoveryEndCommand = NULL;
 215 static char *archiveCleanupCommand = NULL;
 216 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
 217 static bool recoveryTargetInclusive = true;
 218 static bool recoveryPauseAtTarget = true;
 219 static TransactionId recoveryTargetXid;
 220 static TimestampTz recoveryTargetTime;
 221 static char *recoveryTargetName;
 222 static int min_recovery_apply_delay = 0;        /* NOTE(review): units are not visible here -- confirm */
 223 static TimestampTz recoveryDelayUntilTime;      /* presumably set by SetRecoveryDelayUntilTime() -- confirm */
 224
 225 /* options taken from recovery.conf for XLOG streaming */
 226 static bool StandbyModeRequested = false;
 227 static char *PrimaryConnInfo = NULL;
 228 static char *TriggerFile = NULL;
 229
 230 /* are we currently in standby mode? */
 231 bool            StandbyMode = false;
 232
 233 /* whether request for fast promotion has been made yet */
 234 static bool fast_promote = false;
 235
 236 /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
 237 static TransactionId recoveryStopXid;
 238 static TimestampTz recoveryStopTime;
 239 static char recoveryStopName[MAXFNAMELEN];
 240 static bool recoveryStopAfter;
 241
 242 /*
 243  * During normal operation, the only timeline we care about is ThisTimeLineID.
 244  * During recovery, however, things are more complicated.  To simplify life
 245  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 246  * scan through the WAL history (that is, it is the line that was active when
 247  * the currently-scanned WAL record was generated).  We also need these
 248  * timeline values:
 249  *
 250  * recoveryTargetTLI: the desired timeline that we want to end in.
 251  *
 252  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 253  *
 254  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
 255  * the timelines of its known parents, newest first (so recoveryTargetTLI is
 256  * always the first list member).  Only these TLIs are expected to be seen in
 257  * the WAL segments we read, and indeed only these TLIs will be considered as
 258  * candidate WAL files to open at all.
 259  *
 260  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 261  * (This is not necessarily the same as ThisTimeLineID, because we could
 262  * be scanning data that was copied from an ancestor timeline when the current
 263  * file was created.)  During a sequential scan we do not allow this value
 264  * to decrease.
 265  */
 266 static TimeLineID recoveryTargetTLI;
 267 static bool recoveryTargetIsLatest = false;
 268 static List *expectedTLEs;
 269 static TimeLineID curFileTLI;
 270
 271 /*
 272  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 273  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 274  * end+1 of the last record, and is reset when we end a top-level transaction,
 275  * or start a new one; so it can be used to tell if the current transaction has
 276  * created any XLOG records.
 277  */
 278 static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
 279
 280 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;     /* not static, unlike ProcLastRecPtr */
 281
 282 /*
 283  * RedoRecPtr is this backend's local copy of the REDO record pointer
 284  * (which is almost but not quite the same as a pointer to the most recent
 285  * CHECKPOINT record).  We update this from the shared-memory copy,
 286  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 287  * hold an insertion slot).  See XLogInsert for details.  We are also allowed
 288  * to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 289  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 290  * InitXLOGAccess.
 291  */
 292 static XLogRecPtr RedoRecPtr;
 293
 294 /*
 295  * RedoStartLSN points to the checkpoint's REDO location which is specified
 296  * in a backup label file, backup history file or control file. In standby
 297  * mode, XLOG streaming usually starts from the position where an invalid
 298  * record was found. But if we fail to read even the initial checkpoint
 299  * record, we use the REDO location instead of the checkpoint location as
 300  * the start position of XLOG streaming. Otherwise we would have to jump
 301  * backwards to the REDO location after reading the checkpoint record,
 302  * because the REDO record can precede the checkpoint record.
 303  */
 304 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
 305
 306 /*----------
 307  * Shared-memory data structures for XLOG control
 308  *
 309  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 310  * the log up to (all records before that point must be written or fsynced).
 311  * LogwrtResult indicates the byte positions we have already written/fsynced.
 312  * These structs are identical but are declared separately to indicate their
 313  * slightly different functions.
 314  *
 315  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 316  * WALWriteLock.  To update it, you need to hold both locks.  The point of
 317  * this arrangement is that the value can be examined by code that already
 318  * holds WALWriteLock without needing to grab info_lck as well.  In addition
 319  * to the shared variable, each backend has a private copy of LogwrtResult,
 320  * which is updated when convenient.
 321  *
 322  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 323  * (protected by info_lck), but we don't need to cache any copies of it.
 324  *
 325  * info_lck is only held long enough to read/update the protected variables,
 326  * so it's a plain spinlock.  The other locks are held longer (potentially
 327  * over I/O operations), so we use LWLocks for them.  These locks are:
 328  *
 329  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
 330  * It is only held while initializing and changing the mapping.  If the
 331  * contents of the buffer being replaced haven't been written yet, the mapping
 332  * lock is released while the write is done, and reacquired afterwards.
 333  *
 334  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 335  * XLogFlush).
 336  *
 337  * ControlFileLock: must be held to read/update control file or create
 338  * new log file.
 339  *
 340  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 341  * only one checkpointer at a time; currently, with all checkpoints done by
 342  * the checkpointer, this is just pro forma).
 343  *
 344  *----------
 345  */
 346
 347 typedef struct XLogwrtRqst
 348 {
 349         XLogRecPtr      Write;                  /* request: last byte + 1 to write out */
 350         XLogRecPtr      Flush;                  /* request: last byte + 1 to flush */
 351 } XLogwrtRqst;  /* positions the log must be written/fsynced up to */
 352
 353 typedef struct XLogwrtResult
 354 {
 355         XLogRecPtr      Write;                  /* result: last byte + 1 written out */
 356         XLogRecPtr      Flush;                  /* result: last byte + 1 flushed */
 357 } XLogwrtResult;        /* positions already written/fsynced; same layout as XLogwrtRqst */
 358
 359
 360 /*
 361  * A slot for inserting to the WAL. This is similar to an LWLock; the main
 362  * difference is that there is an extra xlogInsertingAt field that is protected
 363  * by the same mutex. Unlike an LWLock, a slot can only be acquired in
 364  * exclusive mode.
 365  *
 366  * The xlogInsertingAt field is used to advertise to other processes how far
 367  * the slot owner has progressed in inserting the record. When a backend
 368  * acquires a slot, it initializes xlogInsertingAt to 1, because it doesn't
 369  * yet know where it's going to insert the record. That's conservative
 370  * but correct; the new insertion is certainly going to go to a byte position
 371  * greater than 1. If another backend needs to flush the WAL, it will have to
 372  * wait for the new insertion. xlogInsertingAt is updated after finishing the
 373  * insert or when crossing a page boundary, which will wake up anyone waiting
 374  * for it, whether the wait was necessary in the first place or not.
 375  *
 376  * A process can wait on a slot in two modes: LW_EXCLUSIVE or
 377  * LW_WAIT_UNTIL_FREE. LW_EXCLUSIVE works like in an lwlock; when the slot is
 378  * released, the first LW_EXCLUSIVE waiter in the queue is woken up. Processes
 379  * waiting in LW_WAIT_UNTIL_FREE mode are woken up whenever the slot is
 380  * released, or xlogInsertingAt is updated. In other words, a process in
 381  * LW_WAIT_UNTIL_FREE mode is woken up whenever the inserter makes any progress
 382  * copying the record in place. LW_WAIT_UNTIL_FREE waiters are always added to
 383  * the front of the queue, while LW_EXCLUSIVE waiters are appended to the end.
 384  *
 385  * To join the wait queue, a process must set MyProc->lwWaitMode to the mode
 386  * it wants to wait in, MyProc->lwWaiting to true, and link MyProc to the head
 387  * or tail of the wait queue. The same mechanism is used to wait on an LWLock;
 388  * see lwlock.c for details.
 389  */
 390 typedef struct
 391 {
 392         slock_t         mutex;                  /* protects all the fields below */
 393         XLogRecPtr      xlogInsertingAt; /* insert has completed up to this point */
 394
 395         PGPROC     *owner;                      /* for debugging purposes */
 396
 397         bool            releaseOK;              /* T if ok to release waiters */
 398         char            exclusive;              /* # of exclusive holders (0 or 1) */
 399         PGPROC     *head;                       /* head of list of waiting PGPROCs */
 400         PGPROC     *tail;                       /* tail of list of waiting PGPROCs */
 401         /* tail is undefined when head is NULL */
 402 } XLogInsertSlot;
 403
 404 /*
 405  * All the slots are allocated as an array in shared memory. We force the
 406  * array stride to be a power of 2, which saves a few cycles in indexing, but
 407  * more importantly also ensures that individual slots don't cross cache line
 408  * boundaries.  (Of course, we have to also ensure that the array start
 409  * address is suitably aligned.)
      *
      * The union is at least CACHE_LINE_SIZE bytes because of 'pad'.  It is
      * exactly that size only while XLogInsertSlot itself fits in CACHE_LINE_SIZE
      * bytes.  NOTE(review): no compile-time check of that, or of CACHE_LINE_SIZE
      * being a power of 2, is visible in this part of the file.
 410  */
 411 typedef union XLogInsertSlotPadded
 412 {
 413         XLogInsertSlot slot;
 414         char            pad[CACHE_LINE_SIZE];
 415 } XLogInsertSlotPadded;
 416
 417 /*
 418  * Shared state data for XLogInsert.
 419  */
 420 typedef struct XLogCtlInsert
 421 {
 422         slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
 423
 424         /*
 425          * CurrBytePos is the end of reserved WAL. The next record will be inserted
 426          * at that position. PrevBytePos is the start position of the previously
 427          * inserted (or rather, reserved) record - it is copied to the prev-link
 428          * of the next record. These are stored as "usable byte positions" rather
 429          * than XLogRecPtrs (see XLogBytePosToRecPtr()).
 430          */
 431         uint64          CurrBytePos;
 432         uint64          PrevBytePos;
 433
 434         /*
 435          * Make sure the above heavily-contended spinlock and byte positions are
 436          * on their own cache line. In particular, the RedoRecPtr and full page
 437          * write variables below should be on a different cache line. They are
 438          * read on every WAL insertion, but updated rarely, and we don't want
 439          * those reads to steal the cache line containing Curr/PrevBytePos.
 440          */
 441         char            pad[CACHE_LINE_SIZE];   /* never read or written; exists only for spacing */
 442
 443         /*
 444          * fullPageWrites is the master copy used by all backends to determine
 445          * whether to write full-page to WAL, instead of using process-local one.
 446          * This is required because, when full_page_writes is changed by SIGHUP,
 447          * we must WAL-log it before it actually affects WAL-logging by backends.
 448          * Checkpointer sets at startup or after SIGHUP.
 449          *
 450          * To read these fields, you must hold an insertion slot. To modify them,
 451          * you must hold ALL the slots.
 452          */
 453         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
 454         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
 455         bool            fullPageWrites; /* master copy, described in the comment above */
 456
 457         /*
 458          * exclusiveBackup is true if a backup started with pg_start_backup() is
 459          * in progress, and nonExclusiveBackups is a counter indicating the number
 460          * of streaming base backups currently in progress. forcePageWrites is set
 461          * to true when either of these is non-zero. lastBackupStart is the latest
 462          * checkpoint redo location used as a starting point for an online backup.
 463          */
 464         bool            exclusiveBackup;
 465         int                     nonExclusiveBackups;
 466         XLogRecPtr      lastBackupStart;
 467
 468         /* insertion slots, see XLogInsertSlot struct above for details */
 469         XLogInsertSlotPadded *insertSlots;
 470 } XLogCtlInsert;
 471
 472 /*
 473  * Total shared-memory state for XLOG.
 474  */
 475 typedef struct XLogCtlData
 476 {
 477         XLogCtlInsert Insert;
 478
 479         /* Protected by info_lck: */
 480         XLogwrtRqst LogwrtRqst;
 481         XLogRecPtr      RedoRecPtr;             /* a recent copy of Insert->RedoRecPtr */
 482         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
 483         TransactionId ckptXid;
 484         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
 485         XLogSegNo       lastRemovedSegNo;               /* latest removed/recycled XLOG
 486                                                                                  * segment */
 487
 488         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
 489         XLogRecPtr      unloggedLSN;
 490         slock_t         ulsn_lck;
 491
 492         /* Time of last xlog segment switch. Protected by WALWriteLock. */
 493         pg_time_t       lastSegSwitchTime;
 494
 495         /*
 496          * Protected by info_lck and WALWriteLock (you must hold either lock to
 497          * read it, but both to update)
 498          */
 499         XLogwrtResult LogwrtResult;
 500
 501         /*
 502          * Latest initialized page in the cache (last byte position + 1).
 503          *
 504          * To change the identity of a buffer (and InitializedUpTo), you need to
 505          * hold WALBufMappingLock.  To change the identity of a buffer that's still
 506          * dirty, the old page needs to be written out first, and for that you
 507          * need WALWriteLock, and you need to ensure that there are no in-progress
 508          * insertions to the page by calling WaitXLogInsertionsToFinish().
 509          */
 510         XLogRecPtr      InitializedUpTo;
 511
 512         /*
 513          * These values do not change after startup, although the pointed-to pages
 514          * and xlblocks values certainly do.  xlblock values are protected by
 515          * WALBufMappingLock.
 516          */
 517         char       *pages;                      /* buffers for unwritten XLOG pages */
 518         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
 519         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
 520
 521         /*
 522          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
 523          * If we created a new timeline when the system was started up,
 524          * PrevTimeLineID is the old timeline's ID that we forked off from.
 525          * Otherwise it's equal to ThisTimeLineID.
 526          */
 527         TimeLineID      ThisTimeLineID;
 528         TimeLineID      PrevTimeLineID;
 529
 530         /*
 531          * archiveCleanupCommand is read from recovery.conf but needs to be in
 532          * shared memory so that the checkpointer process can access it.
 533          */
 534         char            archiveCleanupCommand[MAXPGPATH];
 535
 536         /*
 537          * SharedRecoveryInProgress indicates if we're still in crash or archive
 538          * recovery.  Protected by info_lck.
 539          */
 540         bool            SharedRecoveryInProgress;
 541
 542         /*
 543          * SharedHotStandbyActive indicates if hot standby is active (presumably:
 544          * whether hot standby queries may be run).  Protected by info_lck.
 545          */
 546         bool            SharedHotStandbyActive;
 547
 548         /*
 549          * WalWriterSleeping indicates whether the WAL writer is currently in
 550          * low-power mode (and hence should be nudged if an async commit occurs).
 551          * Protected by info_lck.
 552          */
 553         bool            WalWriterSleeping;
 554
 555         /*
 556          * recoveryWakeupLatch is used to wake up the startup process to continue
 557          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
 558          * to appear.
 559          */
 560         Latch           recoveryWakeupLatch;
 561
 562         /*
 563          * During recovery, we keep a copy of the latest checkpoint record here.
 564          * Used by the background writer when it wants to create a restartpoint.
          * (NOTE(review): "background writer" may be stale; the CheckpointLock
          * notes earlier in this file describe restartpoints as done by the
          * checkpointer.  Confirm.)
 565          *
 566          * Protected by info_lck.
 567          */
 568         XLogRecPtr      lastCheckPointRecPtr;
 569         CheckPoint      lastCheckPoint;
 570
 571         /*
 572          * lastReplayedEndRecPtr points to end+1 of the last record successfully
 573          * replayed. When we're currently replaying a record, ie. in a redo
 574          * function, replayEndRecPtr points to the end+1 of the record being
 575          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
 576          */
 577         XLogRecPtr      lastReplayedEndRecPtr;
 578         TimeLineID      lastReplayedTLI;
 579         XLogRecPtr      replayEndRecPtr;
 580         TimeLineID      replayEndTLI;
 581         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
 582         TimestampTz recoveryLastXTime;
 583         /* current effective recovery target timeline */
 584         TimeLineID      RecoveryTargetTLI;
 585
 586         /*
 587          * timestamp of when we started replaying the current chunk of WAL data,
 588          * only relevant for replication or archive recovery
 589          */
 590         TimestampTz currentChunkStartTime;
 591         /* Are we requested to pause recovery? */
 592         bool            recoveryPause;
 593
 594         /*
 595          * lastFpwDisableRecPtr points to the start of the last replayed
 596          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
 597          */
 598         XLogRecPtr      lastFpwDisableRecPtr;
 599
 600         slock_t         info_lck;               /* locks shared variables shown above */
 601 } XLogCtlData;
 602
 603 static XLogCtlData *XLogCtl = NULL;     /* starts out NULL; the macros below dereference it, so it must be set before they are used */
 604
 605 /*
 606  * We maintain an image of pg_control in shared memory.  ControlFile points
 607  * at that image; like XLogCtl, the pointer starts out NULL.
 608  */
 609 static ControlFileData *ControlFile = NULL;
 609
 610 /*
 611  * Calculate the amount of space left on the page after 'endptr'. Beware
 612  * multiple evaluation!
 613  */
 614 #define INSERT_FREESPACE(endptr)        \
 615         (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
 616
 617 /*
      * Macro to advance to next buffer index.  Wraps around to 0 after
      * XLogCtl->XLogCacheBlck, the highest valid index.  'idx' appears twice in
      * the expansion, so do not pass an expression with side effects.
      */
 618 #define NextBufIdx(idx)         \
 619                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 620
 621 /*
 622  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 623  * would hold if it was in cache, the page containing 'recptr'.
      *
      * The page number (recptr / XLOG_BLCKSZ) is reduced modulo the number of
      * buffers, which is XLogCacheBlck + 1 because XLogCacheBlck is the highest
      * index rather than the count.
 624  */
 625 #define XLogRecPtrToBufIdx(recptr)      \
 626         (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
 627
 628 /*
 629  * These are the number of bytes in a WAL page and segment usable for WAL data.
      *
      * A page loses SizeOfXLogShortPHD bytes to its header.  A segment is counted
      * as XLOG_SEG_SIZE / XLOG_BLCKSZ such pages, minus the extra size of one long
      * header over a short one (presumably because the first page of each segment
      * carries the long header -- confirm).
 630  */
 631 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
 632 #define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
 633
 634 /*
 635  * Private, possibly out-of-date copy of shared LogwrtResult.
 636  * See discussion above.  Both positions start out as 0.
 637  */
 638 static XLogwrtResult LogwrtResult = {0, 0};
 639
 640 /*
 641  * Codes indicating where we got a WAL file from during recovery, or where
 642  * to attempt to get one.
 643  */
 644 typedef enum
 645 {
 646         XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
 647         XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
 648         XLOG_FROM_PG_XLOG,                      /* existing file in pg_xlog */
 649         XLOG_FROM_STREAM,                       /* streamed from master */
 650 } XLogSource;
 651
 652 /* human-readable names for XLogSources, for debugging output */
 653 static const char *xlogSourceNames[] = {"any", "archive", "pg_xlog", "stream"};
 654
 655 /*
 656  * openLogFile is -1 or a kernel FD for an open log file segment.
 657  * When it's open, openLogOff is the current seek offset in the file.
 658  * openLogSegNo identifies the segment.  These variables are only
 659  * used to write the XLOG, and so will normally refer to the active segment.
 660  */
 661 static int      openLogFile = -1;
 662 static XLogSegNo openLogSegNo = 0;
 663 static uint32 openLogOff = 0;
 664
 665 /*
 666  * These variables are used similarly to the ones above, but for reading
 667  * the XLOG.  Note, however, that readOff generally represents the offset
 668  * of the page just read, not the seek position of the FD itself, which
 669  * will be just past that page. readLen indicates how much of the current
 670  * page has been read into readBuf, and readSource indicates where we got
 671  * the currently open file from.
 672  */
 673 static int      readFile = -1;
 674 static XLogSegNo readSegNo = 0;
 675 static uint32 readOff = 0;
 676 static uint32 readLen = 0;
 677 static XLogSource readSource = 0;               /* XLOG_FROM_* code; 0 is XLOG_FROM_ANY */
 678
 679 /*
 680  * Keeps track of which source we're currently reading from. This is
 681  * different from readSource in that this is always set, even when we don't
 682  * currently have a WAL file open. If lastSourceFailed is set, our last
 683  * attempt to read from currentSource failed, and we should try another source
 684  * next.
 685  */
 686 static XLogSource currentSource = 0;    /* XLOG_FROM_* code; 0 is XLOG_FROM_ANY */
 687 static bool lastSourceFailed = false;
 688
 689 typedef struct XLogPageReadPrivate
 690 {
 691         int                     emode;          /* NOTE(review): presumably an error reporting level -- confirm */
 692         bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
 693         bool            randAccess;     /* NOTE(review): undocumented; its use is not visible here */
 694 } XLogPageReadPrivate;
 695
 696 /*
 697  * These variables track when we last obtained some WAL data to process,
 698  * and where we got it from.  (XLogReceiptSource is initially the same as
 699  * readSource, but readSource gets reset to zero when we don't have data
 700  * to process right now.  It is also different from currentSource, which
 701  * also changes when we try to read from a source and fail, while
 702  * XLogReceiptSource tracks where we last successfully read some WAL.)
 703  */
 704 static TimestampTz XLogReceiptTime = 0;
 705 static XLogSource XLogReceiptSource = 0;                /* XLOG_FROM_* code; 0 is XLOG_FROM_ANY */
 706
 707 /* State information for XLOG reading */
 708 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 709 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 710
 711 static XLogRecPtr minRecoveryPoint;             /* local copy of
 712                                                                                  * ControlFile->minRecoveryPoint */
 713 static TimeLineID minRecoveryPointTLI;  /* presumably the timeline of minRecoveryPoint -- confirm */
 714 static bool updateMinRecoveryPoint = true;
 715
 716 /*
 717  * Have we reached a consistent database state? In crash recovery, we have
 718  * to replay all the WAL, so reachedConsistency is never set. During archive
 719  * recovery, the database is consistent once minRecoveryPoint is reached.
 720  */
 721 bool            reachedConsistency = false;
 722
 723 static bool InRedo = false;     /* TODO(review): undocumented; its use is not visible in this part of the file */
 724
 725 /* Have we launched bgwriter during recovery? */
 726 static bool bgwriterLaunched = false;
 727
 728 /* For WALInsertSlotAcquire/Release functions */
 729 static int      MySlotNo = 0;
 730 static bool holdingAllSlots = false;
 731
     /*
      * Forward declarations of the file-local helper routines; every function
      * declared below is static.
      */
 732 static void readRecoveryCommandFile(void);
 733 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
 734 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis, bool *delayThis);
 735 static void recoveryPausesHere(void);
 736 static void recoveryApplyDelay(void);
 737 static bool SetRecoveryDelayUntilTime(TimestampTz xtime);
 738 static void SetLatestXTime(TimestampTz xtime);
 739 static void SetCurrentChunkStartTime(TimestampTz xtime);
 740 static void CheckRequiredParameterValues(void);
 741 static void XLogReportParameters(void);
 742 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
 743                                         TimeLineID prevTLI);
 744 static void LocalSetXLogInsertAllowed(void);
 745 static void CreateEndOfRecoveryRecord(void);
 746 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 747 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 748
     /* XLogCheckBuffer is used by XLogInsert to decide which buffers get backed up */
 749 static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
 750                                 XLogRecPtr *lsn, BkpBlock *bkpb);
 751 static Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb,
 752                                                  char *blk, bool get_cleanup_lock, bool keep_buffer);
 753 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
 754 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
 755 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
 756 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 757                                            bool find_free, int *max_advance,
 758                                            bool use_lock);
 759 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 760                          int source, bool notexistOk);
 761 static int      XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
 762 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 763                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
 764                          TimeLineID *readTLI);
 765 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 766                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr);
 767 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 768 static void XLogFileClose(void);
 769 static void PreallocXlogFiles(XLogRecPtr endptr);
 770 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
 771 static void UpdateLastRemovedPtr(char *filename);
 772 static void ValidateXLOGDirectoryStructure(void);
 773 static void CleanupBackupHistory(void);
 774 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 775 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 776                    int emode, bool fetching_ckpt);
 777 static void CheckRecoveryConsistency(void);
 778 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
 779                                          XLogRecPtr RecPtr, int whichChkpti, bool report);
 780 static bool rescanLatestTimeLine(void);
 781 static void WriteControlFile(void);
 782 static void ReadControlFile(void);
 783 static char *str_time(pg_time_t tnow);
 784 static bool CheckForStandbyTrigger(void);
 785
     /* xlog_outrec is only declared (and only used, by XLogInsert) in WAL_DEBUG builds */
 786 #ifdef WAL_DEBUG
 787 static void xlog_outrec(StringInfo buf, XLogRecord *record);
 788 #endif
 789 static void pg_start_backup_callback(int code, Datum arg);
 790 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 791                                   bool *backupEndRequired, bool *backupFromStandby);
 792 static void rm_redo_error_callback(void *arg);
 793 static int      get_sync_bit(int method);
 794
     /* Helpers for the WAL insertion path (XLogInsert and its subroutines) */
 795 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
 796                                   XLogRecData *rdata,
 797                                   XLogRecPtr StartPos, XLogRecPtr EndPos);
 798 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
 799                                                   XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
 800 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
 801                                   XLogRecPtr *PrevPtr);
 802 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
 803 static void WakeupWaiters(XLogRecPtr EndPos);
 804 static char *GetXLogBuffer(XLogRecPtr ptr);
     /* Conversions between "usable byte positions" and XLogRecPtrs */
 805 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
 806 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
 807 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
 808
     /* WAL insertion slot acquire/release primitives */
 809 static void WALInsertSlotAcquire(bool exclusive);
 810 static void WALInsertSlotAcquireOne(int slotno);
 811 static void WALInsertSlotRelease(void);
 812 static void WALInsertSlotReleaseOne(int slotno);
 813
 814 /*
 815  * Insert an XLOG record having the specified RMID and info bytes,
 816  * with the body of the record being the data chunk(s) described by
 817  * the rdata chain (see xlog.h for notes about rdata).
 818  *
      * rmid:  resource manager ID; stored in the record header as xl_rmid.
      * info:  rmgr-specific flag bits.  Bits covered by XLR_INFO_MASK are
      *        reserved for this routine; passing any of them is a PANIC.
      * rdata: head of the chain of data chunks.  It is dereferenced
      *        unconditionally, so it must not be NULL, even for an XLOG_SWITCH
      *        record.
      *
 819  * Returns XLOG pointer to end of record (beginning of next record).
 820  * This can be used as LSN for data pages affected by the logged action.
 821  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 822  * before the data page can be written out.  This implements the basic
 823  * WAL rule "write the log before the data".)
 824  *
      * In bootstrap mode nothing is logged for any rmid other than RM_XLOG_ID;
      * a phony pointer (SizeOfXLogLongPHD) is returned instead.
      *
      * Errors: ERROR if XLogInsertAllowed() says we may not insert; PANIC if
      * info uses reserved bits, if the chain references more than
      * XLR_MAX_BKP_BLOCKS distinct buffers, or if a non-switch record ends up
      * with zero bytes of rmgr data.
      *
      * Side effects: on a normal return, ProcLastRecPtr is set to the start of
      * the record and XactLastRecEnd to the returned end pointer.
      *
 825  * NB: this routine feels free to scribble on the XLogRecData structs,
 826  * though not on the data they reference.  This is OK since the XLogRecData
 827  * structs are always just temporaries in the calling code.
 828  */
 829 XLogRecPtr
 830 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 831 {
 832         XLogCtlInsert *Insert = &XLogCtl->Insert;
 833         XLogRecData *rdt;
 834         XLogRecData *rdt_lastnormal;
 835         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
 836         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 837         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 838         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
 839         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
 840         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
 841         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
 842         XLogRecData hdr_rdt;
 843         pg_crc32        rdata_crc;
 844         uint32          len,
 845                                 write_len;
 846         unsigned        i;
 847         bool            doPageWrites;
 848         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 849         bool            inserted;
 850         uint8           info_orig = info;       /* caller's info bits, restored on retry */
 851         static XLogRecord *rechdr;
 852         XLogRecPtr      StartPos;
 853         XLogRecPtr      EndPos;
 854
             /*
              * The record-header workspace is allocated on the first call only;
              * rechdr is static, so later calls reuse the same zero-initialized
              * buffer.
              */
 855         if (rechdr == NULL)
 856         {
 857                 rechdr = malloc(SizeOfXLogRecord);
 858                 if (rechdr == NULL)
 859                         elog(ERROR, "out of memory");
 860                 MemSet(rechdr, 0, SizeOfXLogRecord);
 861         }
 862
 863         /* cross-check on whether we should be here or not */
 864         if (!XLogInsertAllowed())
 865                 elog(ERROR, "cannot make new WAL entries during recovery");
 866
 867         /* info's high bits are reserved for use by me */
 868         if (info & XLR_INFO_MASK)
 869                 elog(PANIC, "invalid xlog info mask %02X", info);
 870
 871         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
 872
 873         /*
 874          * In bootstrap mode, we don't actually log anything but XLOG resources;
 875          * return a phony record pointer.
 876          */
 877         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 878         {
 879                 EndPos = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 880                 return EndPos;
 881         }
 882
 883         /*
 884          * Here we scan the rdata chain, to determine which buffers must be backed
 885          * up.
 886          *
 887          * We may have to loop back to here if a race condition is detected below.
 888          * We could prevent the race by doing all this work while holding an
 889          * insertion slot, but it seems better to avoid doing CRC calculations
 890          * while holding one.
 891          *
 892          * We add entries for backup blocks to the chain, so that they don't need
 893          * any special treatment in the critical section where the chunks are
 894          * copied into the WAL buffers. Those entries have to be unlinked from the
 895          * chain if we have to loop back here.
 896          */
 897 begin:;
 898         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 899         {
 900                 dtbuf[i] = InvalidBuffer;
 901                 dtbuf_bkp[i] = false;
 902         }
 903
 904         /*
 905          * Decide if we need to do full-page writes in this XLOG record: true if
 906          * full_page_writes is on or we have a PITR request for it.  Since we
 907          * don't yet have an insertion slot, fullPageWrites and forcePageWrites
 908          * could change under us, but we'll recheck them once we have a slot.
 909          */
 910         doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
 911
             /* len accumulates the rmgr data that is not replaced by a backup block */
 912         len = 0;
 913         for (rdt = rdata;;)
 914         {
 915                 if (rdt->buffer == InvalidBuffer)
 916                 {
 917                         /* Simple data, just include it */
 918                         len += rdt->len;
 919                 }
 920                 else
 921                 {
 922                         /* Find info for buffer */
 923                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 924                         {
 925                                 if (rdt->buffer == dtbuf[i])
 926                                 {
 927                                         /* Buffer already referenced by earlier chain item */
 928                                         if (dtbuf_bkp[i])
 929                                         {
                                                     /* buffer is being backed up: drop this chunk's data */
 930                                                 rdt->data = NULL;
 931                                                 rdt->len = 0;
 932                                         }
 933                                         else if (rdt->data)
 934                                                 len += rdt->len;
 935                                         break;
 936                                 }
 937                                 if (dtbuf[i] == InvalidBuffer)
 938                                 {
 939                                         /* OK, put it in this slot */
 940                                         dtbuf[i] = rdt->buffer;
 941                                         if (doPageWrites && XLogCheckBuffer(rdt, true,
 942                                                                                    &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 943                                         {
 944                                                 dtbuf_bkp[i] = true;
 945                                                 rdt->data = NULL;
 946                                                 rdt->len = 0;
 947                                         }
 948                                         else if (rdt->data)
 949                                                 len += rdt->len;
 950                                         break;
 951                                 }
 952                         }
                             /* inner loop ran off the end: no free slot and no match */
 953                         if (i >= XLR_MAX_BKP_BLOCKS)
 954                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 955                                          XLR_MAX_BKP_BLOCKS);
 956                 }
 957                 /* Break out of loop when rdt points to last chain item */
 958                 if (rdt->next == NULL)
 959                         break;
 960                 rdt = rdt->next;
 961         }
 962
 963         /*
 964          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 965          * error checking in ReadRecord.  This means that all callers of
 966          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 967          * make an exception for XLOG SWITCH records because we don't want them to
 968          * ever cross a segment boundary.
 969          */
 970         if (len == 0 && !isLogSwitch)
 971                 elog(PANIC, "invalid xlog record length %u", len);
 972
 973         /*
 974          * Make additional rdata chain entries for the backup blocks, so that we
 975          * don't need to special-case them in the write loop.  This modifies the
 976          * original rdata chain, but we keep a pointer to the last regular entry,
 977          * rdt_lastnormal, so that we can undo this if we have to loop back to the
 978          * beginning.
 979          *
 980          * At the exit of this loop, write_len includes the backup block data.
 981          *
 982          * Also set the appropriate info bits to show which buffers were backed
 983          * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer
 984          * value (ignoring InvalidBuffer) appearing in the rdata chain.
 985          */
 986         rdt_lastnormal = rdt;
 987         write_len = len;
 988         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 989         {
 990                 BkpBlock   *bkpb;
 991                 char       *page;
 992
 993                 if (!dtbuf_bkp[i])
 994                         continue;
 995
 996                 info |= XLR_BKP_BLOCK(i);
 997
 998                 bkpb = &(dtbuf_xlg[i]);
 999                 page = (char *) BufferGetBlock(dtbuf[i]);
1000
                     /* first entry: the BkpBlock header describing the page image */
1001                 rdt->next = &(dtbuf_rdt1[i]);
1002                 rdt = rdt->next;
1003
1004                 rdt->data = (char *) bkpb;
1005                 rdt->len = sizeof(BkpBlock);
1006                 write_len += sizeof(BkpBlock);
1007
                     /* second entry: the page itself, or the part before the hole */
1008                 rdt->next = &(dtbuf_rdt2[i]);
1009                 rdt = rdt->next;
1010
1011                 if (bkpb->hole_length == 0)
1012                 {
1013                         rdt->data = page;
1014                         rdt->len = BLCKSZ;
1015                         write_len += BLCKSZ;
1016                         rdt->next = NULL;
1017                 }
1018                 else
1019                 {
1020                         /* must skip the hole */
1021                         rdt->data = page;
1022                         rdt->len = bkpb->hole_offset;
1023                         write_len += bkpb->hole_offset;
1024
                             /* third entry: the part of the page after the hole */
1025                         rdt->next = &(dtbuf_rdt3[i]);
1026                         rdt = rdt->next;
1027
1028                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
1029                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
1030                         write_len += rdt->len;
1031                         rdt->next = NULL;
1032                 }
1033         }
1034
1035         /*
1036          * Calculate CRC of the data, including all the backup blocks
1037          *
1038          * Note that the record header isn't added into the CRC initially since we
1039          * don't know the prev-link yet.  Thus, the CRC will represent the CRC of
1040          * the whole record in the order: rdata, then backup blocks, then record
1041          * header.
1042          */
1043         INIT_CRC32(rdata_crc);
1044         for (rdt = rdata; rdt != NULL; rdt = rdt->next)
1045                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
1046
1047         /*
1048          * Construct record header (prev-link is filled in later, after reserving
1049          * the space for the record), and make that the first chunk in the chain.
1050          *
1051          * The CRC calculated for the header here doesn't include prev-link,
1052          * because we don't know it yet. It will be added later.
1053          */
1054         rechdr->xl_xid = GetCurrentTransactionIdIfAny();
1055         rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
1056         rechdr->xl_len = len;           /* doesn't include backup blocks */
1057         rechdr->xl_info = info;
1058         rechdr->xl_rmid = rmid;
1059         rechdr->xl_prev = InvalidXLogRecPtr;
1060         COMP_CRC32(rdata_crc, ((char *) rechdr), offsetof(XLogRecord, xl_prev));
1061
1062         hdr_rdt.next = rdata;
1063         hdr_rdt.data = (char *) rechdr;
1064         hdr_rdt.len = SizeOfXLogRecord;
1065         write_len += SizeOfXLogRecord;  /* now header + data + backup blocks */
1066
1067         /*----------
1068          *
1069          * We have now done all the preparatory work we can without holding a
1070          * lock or modifying shared state. From here on, inserting the new WAL
1071          * record to the shared WAL buffer cache is a two-step process:
1072          *
1073          * 1. Reserve the right amount of space from the WAL. The current head of
1074          *    reserved space is kept in Insert->CurrBytePos, and is protected by
1075          *    insertpos_lck.
1076          *
1077          * 2. Copy the record to the reserved WAL space. This involves finding the
1078          *    correct WAL buffer containing the reserved space, and copying the
1079          *    record in place. This can be done concurrently in multiple processes.
1080          *
1081          * To keep track of which insertions are still in-progress, each concurrent
1082          * inserter allocates an "insertion slot", which tells others how far the
1083          * inserter has progressed. There is a small fixed number of insertion
1084          * slots, determined by the num_xloginsert_slots GUC. When an inserter
1085          * finishes, it updates the xlogInsertingAt of its slot to the end of the
1086          * record it inserted, to let others know that it's done. xlogInsertingAt
1087          * is also updated when crossing over to a new WAL buffer, to allow the
1088          * previous buffer to be flushed.
1089          *
1090          * Holding onto a slot also protects RedoRecPtr and fullPageWrites from
1091          * changing until the insertion is finished.
1092          *
1093          * Step 2 can usually be done completely in parallel. If the required WAL
1094          * page is not initialized yet, you have to grab WALBufMappingLock to
1095          * initialize it, but the WAL writer tries to do that ahead of insertions
1096          * to avoid that from happening in the critical path.
1097          *
1098          *----------
1099          */
1100         START_CRIT_SECTION();
             /* the argument is "exclusive": requested only for an xlog-switch record */
1101         WALInsertSlotAcquire(isLogSwitch);
1102
1103         /*
1104          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
1105          * back and recompute everything.  This can only happen just after a
1106          * checkpoint, so it's better to be slow in this case and fast otherwise.
1107          *
1108          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
1109          * affect the contents of the XLOG record, so we'll update our local copy
1110          * but not force a recomputation.
1111          */
1112         if (RedoRecPtr != Insert->RedoRecPtr)
1113         {
1114                 Assert(RedoRecPtr < Insert->RedoRecPtr);
1115                 RedoRecPtr = Insert->RedoRecPtr;
1116
1117                 if (doPageWrites)
1118                 {
1119                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1120                         {
1121                                 if (dtbuf[i] == InvalidBuffer)
1122                                         continue;
1123                                 if (dtbuf_bkp[i] == false &&
1124                                         dtbuf_lsn[i] <= RedoRecPtr)
1125                                 {
1126                                         /*
1127                                          * Oops, this buffer now needs to be backed up, but we
1128                                          * didn't think so above.  Start over.
                                              *
                                              * Before retrying, cut the backup-block entries
                                              * back off the caller's chain and restore the
                                              * original info bits.
1129                                          */
1130                                         WALInsertSlotRelease();
1131                                         END_CRIT_SECTION();
1132                                         rdt_lastnormal->next = NULL;
1133                                         info = info_orig;
1134                                         goto begin;
1135                                 }
1136                         }
1137                 }
1138         }
1139
1140         /*
1141          * Also check to see if fullPageWrites or forcePageWrites was just turned
1142          * on; if we weren't already doing full-page writes then go back and
1143          * recompute. (If it was just turned off, we could recompute the record
1144          * without full pages, but we choose not to bother.)
1145          */
1146         if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
1147         {
1148                 /* Oops, must redo it with full-page data. */
1149                 WALInsertSlotRelease();
1150                 END_CRIT_SECTION();
                     /* same chain/info cleanup as the RedoRecPtr retry above */
1151                 rdt_lastnormal->next = NULL;
1152                 info = info_orig;
1153                 goto begin;
1154         }
1155
1156         /*
1157          * Reserve space for the record in the WAL. This also sets the xl_prev
1158          * pointer.
1159          */
1160         if (isLogSwitch)
1161                 inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
1162         else
1163         {
1164                 ReserveXLogInsertLocation(write_len, &StartPos, &EndPos,
1165                                                                   &rechdr->xl_prev);
1166                 inserted = true;
1167         }
1168
1169         if (inserted)
1170         {
1171                 /*
1172                  * Now that xl_prev has been filled in, finish CRC calculation of the
1173                  * record header.
1174                  */
1175                 COMP_CRC32(rdata_crc, ((char *) &rechdr->xl_prev), sizeof(XLogRecPtr));
1176                 FIN_CRC32(rdata_crc);
1177                 rechdr->xl_crc = rdata_crc;
1178
1179                 /*
1180                  * All the record data, including the header, is now ready to be
1181                  * inserted. Copy the record in the space reserved.
1182                  */
1183                 CopyXLogRecordToWAL(write_len, isLogSwitch, &hdr_rdt, StartPos, EndPos);
1184         }
1185         else
1186         {
1187                 /*
1188                  * This was an xlog-switch record, but the current insert location was
1189                  * already exactly at the beginning of a segment, so there was no need
1190                  * to do anything.
1191                  */
1192         }
1193
1194         /*
1195          * Done! Let others know that we're finished.
1196          */
1197         WALInsertSlotRelease();
1198
1199         MarkCurrentTransactionIdLoggedIfAny();
1200
1201         END_CRIT_SECTION();
1202
1203         /*
1204          * Update shared LogwrtRqst.Write, if we crossed page boundary.
1205          */
1206         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1207         {
1208                 /* use volatile pointer to prevent code rearrangement */
1209                 volatile XLogCtlData *xlogctl = XLogCtl;
1210
1211                 SpinLockAcquire(&xlogctl->info_lck);
1212                 /* advance global request to include new block(s) */
1213                 if (xlogctl->LogwrtRqst.Write < EndPos)
1214                         xlogctl->LogwrtRqst.Write = EndPos;
1215                 /* update local result copy while I have the chance */
1216                 LogwrtResult = xlogctl->LogwrtResult;
1217                 SpinLockRelease(&xlogctl->info_lck);
1218         }
1219
1220         /*
1221          * If this was an XLOG_SWITCH record, flush the record and the empty
1222          * padding space that fills the rest of the segment, and perform
1223          * end-of-segment actions (eg, notifying archiver).
1224          */
1225         if (isLogSwitch)
1226         {
1227                 TRACE_POSTGRESQL_XLOG_SWITCH();
1228                 XLogFlush(EndPos);
1229                 /*
1230                  * Even though we reserved the rest of the segment for us, which is
1231                  * reflected in EndPos, we return a pointer to just the end of the
1232                  * xlog-switch record.
                      *
                      * If the record header itself crossed a page boundary, the
                      * page header on the new page is added as well: the long form
                      * when that page is the first page of a segment, the short
                      * form otherwise.
1233                  */
1234                 if (inserted)
1235                 {
1236                         EndPos = StartPos + SizeOfXLogRecord;
1237                         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1238                         {
1239                                 if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
1240                                         EndPos += SizeOfXLogLongPHD;
1241                                 else
1242                                         EndPos += SizeOfXLogShortPHD;
1243                         }
1244                 }
1245         }
1246
1247 #ifdef WAL_DEBUG
1248         if (XLOG_DEBUG)
1249         {
1250                 StringInfoData buf;
1251
1252                 initStringInfo(&buf);
1253                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1254                                                  (uint32) (EndPos >> 32), (uint32) EndPos);
1255                 xlog_outrec(&buf, rechdr);
                     /* the first chunk's data may have been set to NULL by the buffer scan above */
1256                 if (rdata->data != NULL)
1257                 {
1258                         appendStringInfoString(&buf, " - ");
1259                         RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
1260                 }
1261                 elog(LOG, "%s", buf.data);
1262                 pfree(buf.data);
1263         }
1264 #endif
1265
1266         /*
1267          * Update our global variables
1268          */
1269         ProcLastRecPtr = StartPos;
1270         XactLastRecEnd = EndPos;
1271
1272         return EndPos;
1273 }
1274
1275 /*
1276  * Reserves the right amount of space for a record of given size from the WAL.
1277  * *StartPos is set to the beginning of the reserved section, *EndPos to
1278  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1279  * used to set the xl_prev of this record.
1280  *
1281  * This is the performance critical part of XLogInsert that must be serialized
1282  * across backends. The rest can happen mostly in parallel. Try to keep this
1283  * section as short as possible, insertpos_lck can be heavily contended on a
1284  * busy system.
1285  *
1286  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1287  * where we actually copy the record to the reserved space.
1288  */
1289 static void
1290 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1291                                                   XLogRecPtr *PrevPtr)
1292 {
1293         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1294         uint64          startbytepos;
1295         uint64          endbytepos;
1296         uint64          prevbytepos;
1297
1298         size = MAXALIGN(size);
1299
1300         /* All (non xlog-switch) records should contain data. */
1301         Assert(size > SizeOfXLogRecord);
1302
1303         /*
1304          * The duration the spinlock needs to be held is minimized by minimizing
1305          * the calculations that have to be done while holding the lock. The
1306          * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1307          * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1308          * page headers. The mapping between "usable" byte positions and physical
1309          * positions (XLogRecPtrs) can be done outside the locked region, and
1310          * because the usable byte position doesn't include any headers, reserving
1311          * X bytes from WAL is almost as simple as "CurrBytePos += X".
1312          */
1313         SpinLockAcquire(&Insert->insertpos_lck);
1314
1315         startbytepos = Insert->CurrBytePos;
1316         endbytepos = startbytepos + size;
1317         prevbytepos = Insert->PrevBytePos;
1318         Insert->CurrBytePos = endbytepos;
1319         Insert->PrevBytePos = startbytepos;
1320
1321         SpinLockRelease(&Insert->insertpos_lck);
1322
1323         *StartPos = XLogBytePosToRecPtr(startbytepos);
1324         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1325         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1326
1327         /*
1328          * Check that the conversions between "usable byte positions" and
1329          * XLogRecPtrs work consistently in both directions.
1330          */
1331         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1332         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1333         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1334 }
1335
1336 /*
1337  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1338  *
1339  * A log-switch record is handled slightly differently. The rest of the
1340  * segment will be reserved for this insertion, as indicated by the returned
1341  * *EndPos value. However, if we are already at the beginning of the current
1342  * segment, *StartPos and *EndPos are set to the current location without
1343  * reserving any space, and the function returns false.
1344 */
1345 static bool
1346 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1347 {
1348         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1349         uint64          startbytepos;
1350         uint64          endbytepos;
1351         uint64          prevbytepos;
1352         uint32          size = SizeOfXLogRecord;
1353         XLogRecPtr      ptr;
1354         uint32          segleft;
1355
1356         /*
1357          * These calculations are a bit heavy-weight to be done while holding a
1358          * spinlock, but since we're holding all the WAL insertion slots, there
1359          * are no other inserters competing for it. GetXLogInsertRecPtr() does
1360          * compete for it, but that's not called very frequently.
1361          */
1362         SpinLockAcquire(&Insert->insertpos_lck);
1363
1364         startbytepos = Insert->CurrBytePos;
1365
1366         ptr = XLogBytePosToEndRecPtr(startbytepos);
1367         if (ptr % XLOG_SEG_SIZE == 0)
1368         {
1369                 SpinLockRelease(&Insert->insertpos_lck);
1370                 *EndPos = *StartPos = ptr;
1371                 return false;
1372         }
1373
1374         endbytepos = startbytepos + size;
1375         prevbytepos = Insert->PrevBytePos;
1376
1377         *StartPos = XLogBytePosToRecPtr(startbytepos);
1378         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1379
1380         segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
1381         if (segleft != XLOG_SEG_SIZE)
1382         {
1383                 /* consume the rest of the segment */
1384                 *EndPos += segleft;
1385                 endbytepos = XLogRecPtrToBytePos(*EndPos);
1386         }
1387         Insert->CurrBytePos = endbytepos;
1388         Insert->PrevBytePos = startbytepos;
1389
1390         SpinLockRelease(&Insert->insertpos_lck);
1391
1392         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1393
1394         Assert((*EndPos) % XLOG_SEG_SIZE == 0);
1395         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1396         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1397         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1398
1399         return true;
1400 }
1401
1402 /*
1403  * Subroutine of XLogInsert.  Copies a WAL record to an already-reserved
1404  * area in the WAL.
1405  */
1406 static void
1407 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1408                                         XLogRecPtr StartPos, XLogRecPtr EndPos)
1409 {
1410         char       *currpos;
1411         int                     freespace;
1412         int                     written;
1413         XLogRecPtr      CurrPos;
1414         XLogPageHeader pagehdr;
1415
1416         /* The first chunk is the record header */
1417         Assert(rdata->len == SizeOfXLogRecord);
1418
1419         /*
1420          * Get a pointer to the right place in the right WAL buffer to start
1421          * inserting to.
1422          */
1423         CurrPos = StartPos;
1424         currpos = GetXLogBuffer(CurrPos);
1425         freespace = INSERT_FREESPACE(CurrPos);
1426
1427         /*
1428          * there should be enough space for at least the first field (xl_tot_len)
1429          * on this page.
1430          */
1431         Assert(freespace >= sizeof(uint32));
1432
1433         /* Copy record data */
1434         written = 0;
1435         while (rdata != NULL)
1436         {
1437                 char       *rdata_data = rdata->data;
1438                 int                     rdata_len = rdata->len;
1439
1440                 while (rdata_len > freespace)
1441                 {
1442                         /*
1443                          * Write what fits on this page, and continue on the next page.
1444                          */
1445                         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1446                         memcpy(currpos, rdata_data, freespace);
1447                         rdata_data += freespace;
1448                         rdata_len -= freespace;
1449                         written += freespace;
1450                         CurrPos += freespace;
1451
1452                         /*
1453                          * Get pointer to beginning of next page, and set the xlp_rem_len
1454                          * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1455                          *
1456                          * It's safe to set the contrecord flag and xlp_rem_len without a
1457                          * lock on the page. All the other flags were already set when the
1458                          * page was initialized, in AdvanceXLInsertBuffer, and we're the
1459                          * only backend that needs to set the contrecord flag.
1460                          */
1461                         currpos = GetXLogBuffer(CurrPos);
1462                         pagehdr = (XLogPageHeader) currpos;
1463                         pagehdr->xlp_rem_len = write_len - written;
1464                         pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1465
1466                         /* skip over the page header */
1467                         if (CurrPos % XLogSegSize == 0)
1468                         {
1469                                 CurrPos += SizeOfXLogLongPHD;
1470                                 currpos += SizeOfXLogLongPHD;
1471                         }
1472                         else
1473                         {
1474                                 CurrPos += SizeOfXLogShortPHD;
1475                                 currpos += SizeOfXLogShortPHD;
1476                         }
1477                         freespace = INSERT_FREESPACE(CurrPos);
1478                 }
1479
1480                 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1481                 memcpy(currpos, rdata_data, rdata_len);
1482                 currpos += rdata_len;
1483                 CurrPos += rdata_len;
1484                 freespace -= rdata_len;
1485                 written += rdata_len;
1486
1487                 rdata = rdata->next;
1488         }
1489         Assert(written == write_len);
1490
1491         /* Align the end position, so that the next record starts aligned */
1492         CurrPos = MAXALIGN64(CurrPos);
1493
1494         /*
1495          * If this was an xlog-switch, it's not enough to write the switch record,
1496          * we also have to consume all the remaining space in the WAL segment.
1497          * We have already reserved it for us, but we still need to make sure it's
1498          * allocated and zeroed in the WAL buffers so that when the caller (or
1499          * someone else) does XLogWrite(), it can really write out all the zeros.
1500          */
1501         if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
1502         {
1503                 /* An xlog-switch record doesn't contain any data besides the header */
1504                 Assert(write_len == SizeOfXLogRecord);
1505
1506                 /*
1507                  * We do this one page at a time, to make sure we don't deadlock
1508                  * against ourselves if wal_buffers < XLOG_SEG_SIZE.
1509                  */
1510                 Assert(EndPos % XLogSegSize == 0);
1511
1512                 /* Use up all the remaining space on the first page */
1513                 CurrPos += freespace;
1514
1515                 while (CurrPos < EndPos)
1516                 {
1517                         /* initialize the next page (if not initialized already) */
1518                         WakeupWaiters(CurrPos);
1519                         AdvanceXLInsertBuffer(CurrPos, false);
1520                         CurrPos += XLOG_BLCKSZ;
1521                 }
1522         }
1523
1524         if (CurrPos != EndPos)
1525                 elog(PANIC, "space reserved for WAL record does not match what was written");
1526 }
1527
1528 /*
1529  * Allocate a slot for insertion.
1530  *
1531  * In exclusive mode, all slots are reserved for the current process. That
1532  * blocks all concurrent insertions.
1533  */
1534 static void
1535 WALInsertSlotAcquire(bool exclusive)
1536 {
1537         int                     i;
1538
1539         if (exclusive)
1540         {
1541                 for (i = 0; i < num_xloginsert_slots; i++)
1542                         WALInsertSlotAcquireOne(i);
1543                 holdingAllSlots = true;
1544         }
1545         else
1546                 WALInsertSlotAcquireOne(-1);
1547 }
1548
1549 /*
1550  * Workhorse of WALInsertSlotAcquire. Acquires the given slot, or an arbitrary
1551  * one if slotno == -1. The index of the slot that was acquired is stored in
1552  * MySlotNo.
1553  *
1554  * This is more or less equivalent to LWLockAcquire().
1555  */
1556 static void
1557 WALInsertSlotAcquireOne(int slotno)
1558 {
1559         volatile XLogInsertSlot *slot;
1560         PGPROC     *proc = MyProc;
1561         bool            retry = false;
1562         int                     extraWaits = 0;
1563         static int      slotToTry = -1;
1564
1565         /*
1566          * Try to use the slot we used last time. If the system isn't particularly
1567          * busy, it's a good bet that it's available, and it's good to have some
1568          * affinity to a particular slot so that you don't unnecessarily bounce
1569          * cache lines between processes when there is no contention.
1570          *
1571          * If this is the first time through in this backend, pick a slot
1572          * (semi-)randomly. This allows the slots to be used evenly if you have a
1573          * lot of very short connections.
1574          */
1575         if (slotno != -1)
1576                 MySlotNo = slotno;
1577         else
1578         {
1579                 if (slotToTry == -1)
1580                         slotToTry = MyProc->pgprocno % num_xloginsert_slots;
1581                 MySlotNo = slotToTry;
1582         }
1583
1584         /*
1585          * We can't wait if we haven't got a PGPROC.  This should only occur
1586          * during bootstrap or shared memory initialization.  Put an Assert here
1587          * to catch unsafe coding practices.
1588          */
1589         Assert(MyProc != NULL);
1590
1591         /*
1592          * Lock out cancel/die interrupts until we exit the code section protected
1593          * by the slot.  This ensures that interrupts will not interfere with
1594          * manipulations of data structures in shared memory. There is no cleanup
1595          * mechanism to release the slot if the backend dies while holding one,
1596          * so make this a critical section.
1597          */
1598         START_CRIT_SECTION();
1599
1600         /*
1601          * Loop here to try to acquire slot after each time we are signaled by
1602          * WALInsertSlotRelease.
1603          */
1604         for (;;)
1605         {
1606                 bool            mustwait;
1607
1608                 slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
1609
1610                 /* Acquire mutex.  Time spent holding mutex should be short! */
1611                 SpinLockAcquire(&slot->mutex);
1612
1613                 /* If retrying, allow WALInsertSlotRelease to release waiters again */
1614                 if (retry)
1615                         slot->releaseOK = true;
1616
1617                 /* If I can get the slot, do so quickly. */
1618                 if (slot->exclusive == 0)
1619                 {
1620                         slot->exclusive++;
1621                         mustwait = false;
1622                 }
1623                 else
1624                         mustwait = true;
1625
1626                 if (!mustwait)
1627                         break;                          /* got the lock */
1628
1629                 Assert(slot->owner != MyProc);
1630
1631                 /*
1632                  * Add myself to wait queue.
1633                  */
1634                 proc->lwWaiting = true;
1635                 proc->lwWaitMode = LW_EXCLUSIVE;
1636                 proc->lwWaitLink = NULL;
1637                 if (slot->head == NULL)
1638                         slot->head = proc;
1639                 else
1640                         slot->tail->lwWaitLink = proc;
1641                 slot->tail = proc;
1642
1643                 /* Can release the mutex now */
1644                 SpinLockRelease(&slot->mutex);
1645
1646                 /*
1647                  * Wait until awakened.
1648                  *
1649                  * Since we share the process wait semaphore with the regular lock
1650                  * manager and ProcWaitForSignal, and we may need to acquire a slot
1651                  * while one of those is pending, it is possible that we get awakened
1652                  * for a reason other than being signaled by WALInsertSlotRelease. If
1653                  * so, loop back and wait again.  Once we've gotten the slot,
1654                  * re-increment the sema by the number of additional signals received,
1655                  * so that the lock manager or signal manager will see the received
1656                  * signal when it next waits.
1657                  */
1658                 for (;;)
1659                 {
1660                         /* "false" means cannot accept cancel/die interrupt here. */
1661                         PGSemaphoreLock(&proc->sem, false);
1662                         if (!proc->lwWaiting)
1663                                 break;
1664                         extraWaits++;
1665                 }
1666
1667                 /* Now loop back and try to acquire lock again. */
1668                 retry = true;
1669         }
1670
1671         slot->owner = proc;
1672
1673         /*
1674          * Normally, we initialize the xlogInsertingAt value of the slot to 1,
1675          * because we don't yet know where in the WAL we're going to insert. It's
1676          * not critical what it points to right now - leaving it to a too small
1677          * value just means that WaitXlogInsertionsToFinish() might wait on us
1678          * unnecessarily, until we update the value (when we finish the insert or
1679          * move to next page).
1680          *
1681          * If we're grabbing all the slots, however, stamp all but the last one
1682          * with InvalidXLogRecPtr, meaning there is no insert in progress. The last
1683          * slot is the one that we will update as we proceed with the insert, the
1684          * rest are held just to keep off other inserters.
1685          */
1686         if (slotno != -1 && slotno != num_xloginsert_slots - 1)
1687                 slot->xlogInsertingAt = InvalidXLogRecPtr;
1688         else
1689                 slot->xlogInsertingAt = 1;
1690
1691         /* We are done updating shared state of the slot itself. */
1692         SpinLockRelease(&slot->mutex);
1693
1694         /*
1695          * Fix the process wait semaphore's count for any absorbed wakeups.
1696          */
1697         while (extraWaits-- > 0)
1698                 PGSemaphoreUnlock(&proc->sem);
1699
1700         /*
1701          * If we couldn't get the slot immediately, try another slot next time.
1702          * On a system with more insertion slots than concurrent inserters, this
1703          * causes all the inserters to eventually migrate to a slot that no-one
1704          * else is using. On a system with more inserters than slots, it still
1705          * causes the inserters to be distributed quite evenly across the slots.
1706          */
1707         if (slotno != -1 && retry)
1708                 slotToTry = (slotToTry + 1) % num_xloginsert_slots;
1709 }
1710
1711 /*
1712  * Wait for the given slot to become free, or for its xlogInsertingAt location
1713  * to change to something else than 'waitptr'. In other words, wait for the
1714  * inserter using the given slot to finish its insertion, or to at least make
1715  * some progress.
1716  */
1717 static void
1718 WaitOnSlot(volatile XLogInsertSlot *slot, XLogRecPtr waitptr)
1719 {
1720         PGPROC     *proc = MyProc;
1721         int                     extraWaits = 0;
1722
1723         /*
1724          * Lock out cancel/die interrupts while we sleep on the slot. There is
1725          * no cleanup mechanism to remove us from the wait queue if we got
1726          * interrupted.
1727          */
1728         HOLD_INTERRUPTS();
1729
1730         /*
1731          * Loop here to try to acquire lock after each time we are signaled.
1732          */
1733         for (;;)
1734         {
1735                 bool            mustwait;
1736
1737                 /* Acquire mutex.  Time spent holding mutex should be short! */
1738                 SpinLockAcquire(&slot->mutex);
1739
1740                 /* If I can get the lock, do so quickly. */
1741                 if (slot->exclusive == 0 || slot->xlogInsertingAt != waitptr)
1742                         mustwait = false;
1743                 else
1744                         mustwait = true;
1745
1746                 if (!mustwait)
1747                         break;                          /* the lock was free */
1748
1749                 Assert(slot->owner != MyProc);
1750
1751                 /*
1752                  * Add myself to wait queue.
1753                  */
1754                 proc->lwWaiting = true;
1755                 proc->lwWaitMode = LW_WAIT_UNTIL_FREE;
1756                 proc->lwWaitLink = NULL;
1757
1758                 /* waiters are added to the front of the queue */
1759                 proc->lwWaitLink = slot->head;
1760                 if (slot->head == NULL)
1761                         slot->tail = proc;
1762                 slot->head = proc;
1763
1764                 /* Can release the mutex now */
1765                 SpinLockRelease(&slot->mutex);
1766
1767                 /*
1768                  * Wait until awakened.
1769                  *
1770                  * Since we share the process wait semaphore with other things, like
1771                  * the regular lock manager and ProcWaitForSignal, and we may need to
1772                  * acquire an LWLock while one of those is pending, it is possible that
1773                  * we get awakened for a reason other than being signaled by
1774                  * LWLockRelease. If so, loop back and wait again.  Once we've gotten
1775                  * the LWLock, re-increment the sema by the number of additional
1776                  * signals received, so that the lock manager or signal manager will
1777                  * see the received signal when it next waits.
1778                  */
1779                 for (;;)
1780                 {
1781                         /* "false" means cannot accept cancel/die interrupt here. */
1782                         PGSemaphoreLock(&proc->sem, false);
1783                         if (!proc->lwWaiting)
1784                                 break;
1785                         extraWaits++;
1786                 }
1787
1788                 /* Now loop back and try to acquire lock again. */
1789         }
1790
1791         /* We are done updating shared state of the lock itself. */
1792         SpinLockRelease(&slot->mutex);
1793
1794         /*
1795          * Fix the process wait semaphore's count for any absorbed wakeups.
1796          */
1797         while (extraWaits-- > 0)
1798                 PGSemaphoreUnlock(&proc->sem);
1799
1800         /*
1801          * Now okay to allow cancel/die interrupts.
1802          */
1803         RESUME_INTERRUPTS();
1804 }
1805
1806 /*
1807  * Wake up all processes waiting for us with WaitOnSlot(). Sets our
1808  * xlogInsertingAt value to EndPos, without releasing the slot.
1809  */
1810 static void
1811 WakeupWaiters(XLogRecPtr EndPos)
1812 {
1813         volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
1814         PGPROC     *head;
1815         PGPROC     *proc;
1816         PGPROC     *next;
1817
1818         /*
1819          * If we have already reported progress up to the same point, do nothing.
1820          * No other process can modify xlogInsertingAt, so we can check this before
1821          * grabbing the spinlock.
1822          */
1823         if (slot->xlogInsertingAt == EndPos)
1824                 return;
1825         /* xlogInsertingAt should not go backwards */
1826         Assert(slot->xlogInsertingAt < EndPos);
1827
1828         /* Acquire mutex.  Time spent holding mutex should be short! */
1829         SpinLockAcquire(&slot->mutex);
1830
1831         /* we should own the slot */
1832         Assert(slot->exclusive == 1 && slot->owner == MyProc);
1833
1834         slot->xlogInsertingAt = EndPos;
1835
1836         /*
1837          * See if there are any waiters that need to be woken up.
1838          */
1839         head = slot->head;
1840
1841         if (head != NULL)
1842         {
1843                 proc = head;
1844
1845                 /* LW_WAIT_UNTIL_FREE waiters are always in the front of the queue */
1846                 next = proc->lwWaitLink;
1847                 while (next && next->lwWaitMode == LW_WAIT_UNTIL_FREE)
1848                 {
1849                         proc = next;
1850                         next = next->lwWaitLink;
1851                 }
1852
1853                 /* proc is now the last PGPROC to be released */
1854                 slot->head = next;
1855                 proc->lwWaitLink = NULL;
1856         }
1857
1858         /* We are done updating shared state of the lock itself. */
1859         SpinLockRelease(&slot->mutex);
1860
1861         /*
1862          * Awaken any waiters I removed from the queue.
1863          */
1864         while (head != NULL)
1865         {
1866                 proc = head;
1867                 head = proc->lwWaitLink;
1868                 proc->lwWaitLink = NULL;
1869                 proc->lwWaiting = false;
1870                 PGSemaphoreUnlock(&proc->sem);
1871         }
1872 }
1873
1874 /*
1875  * Release our insertion slot (or slots, if we're holding them all).
1876  */
1877 static void
1878 WALInsertSlotRelease(void)
1879 {
1880         int                     i;
1881
1882         if (holdingAllSlots)
1883         {
1884                 for (i = 0; i < num_xloginsert_slots; i++)
1885                         WALInsertSlotReleaseOne(i);
1886                 holdingAllSlots = false;
1887         }
1888         else
1889                 WALInsertSlotReleaseOne(MySlotNo);
1890 }
1891
1892 static void
1893 WALInsertSlotReleaseOne(int slotno)
1894 {
1895         volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[slotno].slot;
1896         PGPROC     *head;
1897         PGPROC     *proc;
1898
1899         /* Acquire mutex.  Time spent holding mutex should be short! */
1900         SpinLockAcquire(&slot->mutex);
1901
1902         /* we must be holding it */
1903         Assert(slot->exclusive == 1 && slot->owner == MyProc);
1904
1905         slot->xlogInsertingAt = InvalidXLogRecPtr;
1906
1907         /* Release my hold on the slot */
1908         slot->exclusive = 0;
1909         slot->owner = NULL;
1910
1911         /*
1912          * See if I need to awaken any waiters..
1913          */
1914         head = slot->head;
1915         if (head != NULL)
1916         {
1917                 if (slot->releaseOK)
1918                 {
1919                         /*
1920                          * Remove the to-be-awakened PGPROCs from the queue.
1921                          */
1922                         bool            releaseOK = true;
1923
1924                         proc = head;
1925
1926                         /*
1927                          * First wake up any backends that want to be woken up without
1928                          * acquiring the lock. These are always in the front of the queue.
1929                          */
1930                         while (proc->lwWaitMode == LW_WAIT_UNTIL_FREE && proc->lwWaitLink)
1931                                 proc = proc->lwWaitLink;
1932
1933                         /*
1934                          * Awaken the first exclusive-waiter, if any.
1935                          */
1936                         if (proc->lwWaitLink)
1937                         {
1938                                 Assert(proc->lwWaitLink->lwWaitMode == LW_EXCLUSIVE);
1939                                 proc = proc->lwWaitLink;
1940                                 releaseOK = false;
1941                         }
1942                         /* proc is now the last PGPROC to be released */
1943                         slot->head = proc->lwWaitLink;
1944                         proc->lwWaitLink = NULL;
1945
1946                         slot->releaseOK = releaseOK;
1947                 }
1948                 else
1949                         head = NULL;
1950         }
1951
1952         /* We are done updating shared state of the slot itself. */
1953         SpinLockRelease(&slot->mutex);
1954
1955         /*
1956          * Awaken any waiters I removed from the queue.
1957          */
1958         while (head != NULL)
1959         {
1960                 proc = head;
1961                 head = proc->lwWaitLink;
1962                 proc->lwWaitLink = NULL;
1963                 proc->lwWaiting = false;
1964                 PGSemaphoreUnlock(&proc->sem);
1965         }
1966
1967         /*
1968          * Now okay to allow cancel/die interrupts.
1969          */
1970         END_CRIT_SECTION();
1971 }
1972
1973
1974 /*
1975  * Wait for any WAL insertions < upto to finish.
1976  *
1977  * Returns the location of the oldest insertion that is still in-progress.
1978  * Any WAL prior to that point has been fully copied into WAL buffers, and
1979  * can be flushed out to disk. Because this waits for any insertions older
1980  * than 'upto' to finish, the return value is always >= 'upto'.
1981  *
1982  * Note: When you are about to write out WAL, you must call this function
1983  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1984  * need to wait for an insertion to finish (or at least advance to next
1985  * uninitialized page), and the inserter might need to evict an old WAL buffer
1986  * to make room for a new one, which in turn requires WALWriteLock.
1987  */
1988 static XLogRecPtr
1989 WaitXLogInsertionsToFinish(XLogRecPtr upto)
1990 {
1991         uint64          bytepos;
1992         XLogRecPtr      reservedUpto;
1993         XLogRecPtr      finishedUpto;
1994         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1995         int                     i;
1996
1997         if (MyProc == NULL)
1998                 elog(PANIC, "cannot wait without a PGPROC structure");
1999
2000         /* Read the current insert position */
2001         SpinLockAcquire(&Insert->insertpos_lck);
2002         bytepos = Insert->CurrBytePos;
2003         SpinLockRelease(&Insert->insertpos_lck);
2004         reservedUpto = XLogBytePosToEndRecPtr(bytepos);
2005
2006         /*
2007          * No-one should request to flush a piece of WAL that hasn't even been
2008          * reserved yet. However, it can happen if there is a block with a bogus
2009          * LSN on disk, for example. XLogFlush checks for that situation and
2010          * complains, but only after the flush. Here we just assume that to mean
2011          * that all WAL that has been reserved needs to be finished. In this
2012          * corner-case, the return value can be smaller than 'upto' argument.
2013          */
2014         if (upto > reservedUpto)
2015         {
2016                 elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
2017                          (uint32) (upto >> 32), (uint32) upto,
2018                          (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
2019                 upto = reservedUpto;
2020         }
2021
2022         /*
2023          * finishedUpto is our return value, indicating the point upto which
2024          * all the WAL insertions have been finished. Initialize it to the head
2025          * of reserved WAL, and as we iterate through the insertion slots, back it
2026          * out for any insertion that's still in progress.
2027          */
2028         finishedUpto = reservedUpto;
2029
2030         /*
2031          * Loop through all the slots, sleeping on any in-progress insert older
2032          * than 'upto'.
2033          */
2034         for (i = 0; i < num_xloginsert_slots; i++)
2035         {
2036                 volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
2037                 XLogRecPtr insertingat;
2038
2039         retry:
2040                 /*
2041                  * We can check if the slot is in use without grabbing the spinlock.
2042                  * The spinlock acquisition of insertpos_lck before this loop acts
2043                  * as a memory barrier. If someone acquires the slot after that, it
2044                  * can't possibly be inserting to anything < reservedUpto. If it was
2045                  * acquired before that, an unlocked test will return true.
2046                  */
2047                 if (!slot->exclusive)
2048                         continue;
2049
2050                 SpinLockAcquire(&slot->mutex);
2051                 /* re-check now that we have the lock */
2052                 if (!slot->exclusive)
2053                 {
2054                         SpinLockRelease(&slot->mutex);
2055                         continue;
2056                 }
2057                 insertingat = slot->xlogInsertingAt;
2058                 SpinLockRelease(&slot->mutex);
2059
2060                 if (insertingat == InvalidXLogRecPtr)
2061                 {
2062                         /*
2063                          * slot is reserved just to hold off other inserters, there is no
2064                          * actual insert in progress.
2065                          */
2066                         continue;
2067                 }
2068
2069                 /*
2070                  * This insertion is still in progress. Do we need to wait for it?
2071                  *
2072                  * When an inserter acquires a slot, it doesn't reset 'insertingat', so
2073                  * it will initially point to the old value of some already-finished
2074                  * insertion. The inserter will update the value as soon as it finishes
2075                  * the insertion, moves to the next page, or has to do I/O to flush an
2076                  * old dirty buffer. That means that when we see a slot with
2077                  * insertingat value < upto, we don't know if that insertion is still
2078                  * truly in progress, or if the slot is reused by a new inserter that
2079                  * hasn't updated the insertingat value yet. We have to assume it's the
2080                  * latter, and wait.
2081                  */
2082                 if (insertingat < upto)
2083                 {
2084                         WaitOnSlot(slot, insertingat);
2085                         goto retry;
2086                 }
2087                 else
2088                 {
2089                         /*
2090                          * We don't need to wait for this insertion, but update the
2091                          * return value.
2092                          */
2093                         if (insertingat < finishedUpto)
2094                                 finishedUpto = insertingat;
2095                 }
2096         }
2097         return finishedUpto;
2098 }
2099
2100 /*
2101  * Get a pointer to the right location in the WAL buffer containing the
2102  * given XLogRecPtr.
2103  *
2104  * If the page is not initialized yet, it is initialized. That might require
2105  * evicting an old dirty buffer from the buffer cache, which means I/O.
2106  *
2107  * The caller must ensure that the page containing the requested location
2108  * isn't evicted yet, and won't be evicted. The way to ensure that is to
2109  * hold onto an XLogInsertSlot with the xlogInsertingAt position set to
2110  * something <= ptr. GetXLogBuffer() will update xlogInsertingAt if it needs
2111  * to evict an old page from the buffer. (This means that once you call
2112  * GetXLogBuffer() with a given 'ptr', you must not access anything before
2113  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
2114  * later, because older buffers might be recycled already)
2115  */
2116 static char *
2117 GetXLogBuffer(XLogRecPtr ptr)
2118 {
2119         int                     idx;
2120         XLogRecPtr      endptr;
2121         static uint64 cachedPage = 0;
2122         static char *cachedPos = NULL;
2123         XLogRecPtr      expectedEndPtr;
2124
2125         /*
2126          * Fast path for the common case that we need to access again the same
2127          * page as last time.
2128          */
2129         if (ptr / XLOG_BLCKSZ == cachedPage)
2130         {
2131                 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
2132                 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
2133                 return cachedPos + ptr % XLOG_BLCKSZ;
2134         }
2135
2136         /*
2137          * The XLog buffer cache is organized so that a page is always loaded
2138          * to a particular buffer.  That way we can easily calculate the buffer
2139          * a given page must be loaded into, from the XLogRecPtr alone.
2140          */
2141         idx = XLogRecPtrToBufIdx(ptr);
2142
2143         /*
2144          * See what page is loaded in the buffer at the moment. It could be the
2145          * page we're looking for, or something older. It can't be anything newer
2146          * - that would imply the page we're looking for has already been written
2147          * out to disk and evicted, and the caller is responsible for making sure
2148          * that doesn't happen.
2149          *
2150          * However, we don't hold a lock while we read the value. If someone has
2151          * just initialized the page, it's possible that we get a "torn read" of
2152          * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
2153          * that case we will see a bogus value. That's ok, we'll grab the mapping
2154          * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
2155          * the page we're looking for. But it means that when we do this unlocked
2156          * read, we might see a value that appears to be ahead of the page we're
2157          * looking for. Don't PANIC on that, until we've verified the value while
2158          * holding the lock.
2159          */
2160         expectedEndPtr = ptr;
2161         expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
2162
2163         endptr = XLogCtl->xlblocks[idx];
2164         if (expectedEndPtr != endptr)
2165         {
2166                 /*
2167                  * Let others know that we're finished inserting the record up
2168                  * to the page boundary.
2169                  */
2170                 WakeupWaiters(expectedEndPtr - XLOG_BLCKSZ);
2171
2172                 AdvanceXLInsertBuffer(ptr, false);
2173                 endptr = XLogCtl->xlblocks[idx];
2174
2175                 if (expectedEndPtr != endptr)
2176                         elog(PANIC, "could not find WAL buffer for %X/%X",
2177                                  (uint32) (ptr >> 32) , (uint32) ptr);
2178         }
2179         else
2180         {
2181                 /*
2182                  * Make sure the initialization of the page is visible to us, and
2183                  * won't arrive later to overwrite the WAL data we write on the page.
2184                  */
2185                 pg_memory_barrier();
2186         }
2187
2188         /*
2189          * Found the buffer holding this page. Return a pointer to the right
2190          * offset within the page.
2191          */
2192         cachedPage = ptr / XLOG_BLCKSZ;
2193         cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
2194
2195         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
2196         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
2197
2198         return cachedPos + ptr % XLOG_BLCKSZ;
2199 }
2200
2201 /*
2202  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
2203  * is the position starting from the beginning of WAL, excluding all WAL
2204  * page headers.
2205  */
2206 static XLogRecPtr
2207 XLogBytePosToRecPtr(uint64 bytepos)
2208 {
2209         uint64          fullsegs;
2210         uint64          fullpages;
2211         uint64          bytesleft;
2212         uint32          seg_offset;
2213         XLogRecPtr      result;
2214
2215         fullsegs = bytepos / UsableBytesInSegment;
2216         bytesleft = bytepos % UsableBytesInSegment;
2217
2218         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2219         {
2220                 /* fits on first page of segment */
2221                 seg_offset = bytesleft + SizeOfXLogLongPHD;
2222         }
2223         else
2224         {
2225                 /* account for the first page on segment with long header */
2226                 seg_offset = XLOG_BLCKSZ;
2227                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2228
2229                 fullpages = bytesleft / UsableBytesInPage;
2230                 bytesleft = bytesleft % UsableBytesInPage;
2231
2232                 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2233         }
2234
2235         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
2236
2237         return result;
2238 }
2239
2240 /*
2241  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
2242  * returns a pointer to the beginning of the page (ie. before page header),
2243  * not to where the first xlog record on that page would go to. This is used
2244  * when converting a pointer to the end of a record.
2245  */
2246 static XLogRecPtr
2247 XLogBytePosToEndRecPtr(uint64 bytepos)
2248 {
2249         uint64          fullsegs;
2250         uint64          fullpages;
2251         uint64          bytesleft;
2252         uint32          seg_offset;
2253         XLogRecPtr      result;
2254
2255         fullsegs = bytepos / UsableBytesInSegment;
2256         bytesleft = bytepos % UsableBytesInSegment;
2257
2258         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2259         {
2260                 /* fits on first page of segment */
2261                 if (bytesleft == 0)
2262                         seg_offset = 0;
2263                 else
2264                         seg_offset = bytesleft + SizeOfXLogLongPHD;
2265         }
2266         else
2267         {
2268                 /* account for the first page on segment with long header */
2269                 seg_offset = XLOG_BLCKSZ;
2270                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2271
2272                 fullpages = bytesleft / UsableBytesInPage;
2273                 bytesleft = bytesleft % UsableBytesInPage;
2274
2275                 if (bytesleft == 0)
2276                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2277                 else
2278                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2279         }
2280
2281         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
2282
2283         return result;
2284 }
2285
2286 /*
2287  * Convert an XLogRecPtr to a "usable byte position".
2288  */
2289 static uint64
2290 XLogRecPtrToBytePos(XLogRecPtr ptr)
2291 {
2292         uint64          fullsegs;
2293         uint32          fullpages;
2294         uint32          offset;
2295         uint64          result;
2296
2297         XLByteToSeg(ptr, fullsegs);
2298
2299         fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
2300         offset = ptr % XLOG_BLCKSZ;
2301
2302         if (fullpages == 0)
2303         {
2304                 result = fullsegs * UsableBytesInSegment;
2305                 if (offset > 0)
2306                 {
2307                         Assert(offset >= SizeOfXLogLongPHD);
2308                         result += offset - SizeOfXLogLongPHD;
2309                 }
2310         }
2311         else
2312         {
2313                 result = fullsegs * UsableBytesInSegment +
2314                         (XLOG_BLCKSZ - SizeOfXLogLongPHD) +  /* account for first page */
2315                         (fullpages - 1) * UsableBytesInPage; /* full pages */
2316                 if (offset > 0)
2317                 {
2318                         Assert(offset >= SizeOfXLogShortPHD);
2319                         result += offset - SizeOfXLogShortPHD;
2320                 }
2321         }
2322
2323         return result;
2324 }
2325
2326 /*
2327  * Determine whether the buffer referenced by an XLogRecData item has to
2328  * be backed up, and if so fill a BkpBlock struct for it.  In any case
2329  * save the buffer's LSN at *lsn.
2330  */
2331 static bool
2332 XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
2333                                 XLogRecPtr *lsn, BkpBlock *bkpb)
2334 {
2335         Page            page;
2336
2337         page = BufferGetPage(rdata->buffer);
2338
2339         /*
2340          * We assume page LSN is first data on *every* page that can be passed to
2341          * XLogInsert, whether it has the standard page layout or not. We don't
2342          * need to take the buffer header lock for PageGetLSN if we hold an
2343          * exclusive lock on the page and/or the relation.
2344          */
2345         if (holdsExclusiveLock)
2346                 *lsn = PageGetLSN(page);
2347         else
2348                 *lsn = BufferGetLSNAtomic(rdata->buffer);
2349
2350         if (*lsn <= RedoRecPtr)
2351         {
2352                 /*
2353                  * The page needs to be backed up, so set up *bkpb
2354                  */
2355                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
2356
2357                 if (rdata->buffer_std)
2358                 {
2359                         /* Assume we can omit data between pd_lower and pd_upper */
2360                         uint16          lower = ((PageHeader) page)->pd_lower;
2361                         uint16          upper = ((PageHeader) page)->pd_upper;
2362
2363                         if (lower >= SizeOfPageHeaderData &&
2364                                 upper > lower &&
2365                                 upper <= BLCKSZ)
2366                         {
2367                                 bkpb->hole_offset = lower;
2368                                 bkpb->hole_length = upper - lower;
2369                         }
2370                         else
2371                         {
2372                                 /* No "hole" to compress out */
2373                                 bkpb->hole_offset = 0;
2374                                 bkpb->hole_length = 0;
2375                         }
2376                 }
2377                 else
2378                 {
2379                         /* Not a standard page header, don't try to eliminate "hole" */
2380                         bkpb->hole_offset = 0;
2381                         bkpb->hole_length = 0;
2382                 }
2383
2384                 return true;                    /* buffer requires backup */
2385         }
2386
2387         return false;                           /* buffer does not need to be backed up */
2388 }
2389
2390 /*
2391  * Initialize XLOG buffers, writing out old buffers if they still contain
2392  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2393  * true, initialize as many pages as we can without having to write out
2394  * unwritten data. Any new pages are initialized to zeros, with pages headers
2395  * initialized properly.
2396  */
2397 static void
2398 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2399 {
2400         XLogCtlInsert *Insert = &XLogCtl->Insert;
2401         int                     nextidx;
2402         XLogRecPtr      OldPageRqstPtr;
2403         XLogwrtRqst WriteRqst;
2404         XLogRecPtr      NewPageEndPtr = InvalidXLogRecPtr;
2405         XLogRecPtr      NewPageBeginPtr;
2406         XLogPageHeader NewPage;
2407         int                     npages = 0;
2408
2409         LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2410
2411         /*
2412          * Now that we have the lock, check if someone initialized the page
2413          * already.
2414          */
2415         while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2416         {
2417                 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2418
2419                 /*
2420                  * Get ending-offset of the buffer page we need to replace (this may
2421                  * be zero if the buffer hasn't been used yet).  Fall through if it's
2422                  * already written out.
2423                  */
2424                 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2425                 if (LogwrtResult.Write < OldPageRqstPtr)
2426                 {
2427                         /*
2428                          * Nope, got work to do. If we just want to pre-initialize as much
2429                          * as we can without flushing, give up now.
2430                          */
2431                         if (opportunistic)
2432                                 break;
2433
2434                         /* Before waiting, get info_lck and update LogwrtResult */
2435                         {
2436                                 /* use volatile pointer to prevent code rearrangement */
2437                                 volatile XLogCtlData *xlogctl = XLogCtl;
2438
2439                                 SpinLockAcquire(&xlogctl->info_lck);
2440                                 if (xlogctl->LogwrtRqst.Write < OldPageRqstPtr)
2441                                         xlogctl->LogwrtRqst.Write = OldPageRqstPtr;
2442                                 LogwrtResult = xlogctl->LogwrtResult;
2443                                 SpinLockRelease(&xlogctl->info_lck);
2444                         }
2445
2446                         /*
2447                          * Now that we have an up-to-date LogwrtResult value, see if we
2448                          * still need to write it or if someone else already did.
2449                          */
2450                         if (LogwrtResult.Write < OldPageRqstPtr)
2451                         {
2452                                 /*
2453                                  * Must acquire write lock. Release WALBufMappingLock first,
2454                                  * to make sure that all insertions that we need to wait for
2455                                  * can finish (up to this same position). Otherwise we risk
2456                                  * deadlock.
2457                                  */
2458                                 LWLockRelease(WALBufMappingLock);
2459
2460                                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2461
2462                                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2463
2464                                 LogwrtResult = XLogCtl->LogwrtResult;
2465                                 if (LogwrtResult.Write >= OldPageRqstPtr)
2466                                 {
2467                                         /* OK, someone wrote it already */
2468                                         LWLockRelease(WALWriteLock);
2469                                 }
2470                                 else
2471                                 {
2472                                         /* Have to write it ourselves */
2473                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2474                                         WriteRqst.Write = OldPageRqstPtr;
2475                                         WriteRqst.Flush = 0;
2476                                         XLogWrite(WriteRqst, false);
2477                                         LWLockRelease(WALWriteLock);
2478                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2479                                 }
2480                                 /* Re-acquire WALBufMappingLock and retry */
2481                                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2482                                 continue;
2483                         }
2484                 }
2485
2486                 /*
2487                  * Now the next buffer slot is free and we can set it up to be the next
2488                  * output page.
2489                  */
2490                 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2491                 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2492
2493                 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2494
2495                 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2496
2497                 /*
2498                  * Be sure to re-zero the buffer so that bytes beyond what we've
2499                  * written will look like zeroes and not valid XLOG records...
2500                  */
2501                 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2502
2503                 /*
2504                  * Fill the new page's header
2505                  */
2506                 NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
2507
2508                 /* NewPage->xlp_info = 0; */    /* done by memset */
2509                 NewPage   ->xlp_tli = ThisTimeLineID;
2510                 NewPage   ->xlp_pageaddr = NewPageBeginPtr;
2511                 /* NewPage->xlp_rem_len = 0; */         /* done by memset */
2512
2513                 /*
2514                  * If online backup is not in progress, mark the header to indicate
2515                  * that* WAL records beginning in this page have removable backup
2516                  * blocks.  This allows the WAL archiver to know whether it is safe to
2517                  * compress archived WAL data by transforming full-block records into
2518                  * the non-full-block format.  It is sufficient to record this at the
2519                  * page level because we force a page switch (in fact a segment switch)
2520                  * when starting a backup, so the flag will be off before any records
2521                  * can be written during the backup.  At the end of a backup, the last
2522                  * page will be marked as all unsafe when perhaps only part is unsafe,
2523                  * but at worst the archiver would miss the opportunity to compress a
2524                  * few records.
2525                  */
2526                 if (!Insert->forcePageWrites)
2527                         NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;
2528
2529                 /*
2530                  * If first page of an XLOG segment file, make it a long header.
2531                  */
2532                 if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
2533                 {
2534                         XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2535
2536                         NewLongPage->xlp_sysid = ControlFile->system_identifier;
2537                         NewLongPage->xlp_seg_size = XLogSegSize;
2538                         NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2539                         NewPage   ->xlp_info |= XLP_LONG_HEADER;
2540                 }
2541
2542                 /*
2543                  * Make sure the initialization of the page becomes visible to others
2544                  * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2545                  * holding a lock.
2546                  */
2547                 pg_write_barrier();
2548
2549                 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2550
2551                 XLogCtl->InitializedUpTo = NewPageEndPtr;
2552
2553                 npages++;
2554         }
2555         LWLockRelease(WALBufMappingLock);
2556
2557 #ifdef WAL_DEBUG
2558         if (npages > 0)
2559         {
2560                 elog(DEBUG1, "initialized %d pages, upto %X/%X",
2561                          npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
2562         }
2563 #endif
2564 }
2565
2566 /*
2567  * Check whether we've consumed enough xlog space that a checkpoint is needed.
2568  *
2569  * new_segno indicates a log file that has just been filled up (or read
2570  * during recovery). We measure the distance from RedoRecPtr to new_segno
2571  * and see if that exceeds CheckPointSegments.
2572  *
2573  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2574  */
2575 static bool
2576 XLogCheckpointNeeded(XLogSegNo new_segno)
2577 {
2578         XLogSegNo       old_segno;
2579
2580         XLByteToSeg(RedoRecPtr, old_segno);
2581
2582         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2583                 return true;
2584         return false;
2585 }
2586
2587 /*
2588  * Write and/or fsync the log at least as far as WriteRqst indicates.
2589  *
2590  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
2591  * may stop at any convenient boundary (such as a cache or logfile boundary).
2592  * This option allows us to avoid uselessly issuing multiple writes when a
2593  * single one would do.
2594  *
2595  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2596  * must be called before grabbing the lock, to make sure the data is ready to
2597  * write.
2598  */
2599 static void
2600 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2601 {
2602         bool            ispartialpage;
2603         bool            last_iteration;
2604         bool            finishing_seg;
2605         bool            use_existent;
2606         int                     curridx;
2607         int                     npages;
2608         int                     startidx;
2609         uint32          startoffset;
2610
2611         /* We should always be inside a critical section here */
2612         Assert(CritSectionCount > 0);
2613
2614         /*
2615          * Update local LogwrtResult (caller probably did this already, but...)
2616          */
2617         LogwrtResult = XLogCtl->LogwrtResult;
2618
2619         /*
2620          * Since successive pages in the xlog cache are consecutively allocated,
2621          * we can usually gather multiple pages together and issue just one
2622          * write() call.  npages is the number of pages we have determined can be
2623          * written together; startidx is the cache block index of the first one,
2624          * and startoffset is the file offset at which it should go. The latter
2625          * two variables are only valid when npages > 0, but we must initialize
2626          * all of them to keep the compiler quiet.
2627          */
2628         npages = 0;
2629         startidx = 0;
2630         startoffset = 0;
2631
2632         /*
2633          * Within the loop, curridx is the cache block index of the page to
2634          * consider writing.  Begin at the buffer containing the next unwritten
2635          * page, or last partially written page.
2636          */
2637         curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2638
2639         while (LogwrtResult.Write < WriteRqst.Write)
2640         {
2641                 /*
2642                  * Make sure we're not ahead of the insert process.  This could happen
2643                  * if we're passed a bogus WriteRqst.Write that is past the end of the
2644                  * last page that's been initialized by AdvanceXLInsertBuffer.
2645                  */
2646                 XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
2647                 if (LogwrtResult.Write >= EndPtr)
2648                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2649                                  (uint32) (LogwrtResult.Write >> 32),
2650                                  (uint32) LogwrtResult.Write,
2651                                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
2652
2653                 /* Advance LogwrtResult.Write to end of current buffer page */
2654                 LogwrtResult.Write = EndPtr;
2655                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2656
2657                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2658                 {
2659                         /*
2660                          * Switch to new logfile segment.  We cannot have any pending
2661                          * pages here (since we dump what we have at segment end).
2662                          */
2663                         Assert(npages == 0);
2664                         if (openLogFile >= 0)
2665                                 XLogFileClose();
2666                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2667
2668                         /* create/use new log file */
2669                         use_existent = true;
2670                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2671                         openLogOff = 0;
2672                 }
2673
2674                 /* Make sure we have the current logfile open */
2675                 if (openLogFile < 0)
2676                 {
2677                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2678                         openLogFile = XLogFileOpen(openLogSegNo);
2679                         openLogOff = 0;
2680                 }
2681
2682                 /* Add current page to the set of pending pages-to-dump */
2683                 if (npages == 0)
2684                 {
2685                         /* first of group */
2686                         startidx = curridx;
2687                         startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
2688                 }
2689                 npages++;
2690
2691                 /*
2692                  * Dump the set if this will be the last loop iteration, or if we are
2693                  * at the last page of the cache area (since the next page won't be
2694                  * contiguous in memory), or if we are at the end of the logfile
2695                  * segment.
2696                  */
2697                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2698
2699                 finishing_seg = !ispartialpage &&
2700                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
2701
2702                 if (last_iteration ||
2703                         curridx == XLogCtl->XLogCacheBlck ||
2704                         finishing_seg)
2705                 {
2706                         char       *from;
2707                         Size            nbytes;
2708                         Size            nleft;
2709                         int                     written;
2710
2711                         /* Need to seek in the file? */
2712                         if (openLogOff != startoffset)
2713                         {
2714                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
2715                                         ereport(PANIC,
2716                                                         (errcode_for_file_access(),
2717                                          errmsg("could not seek in log file %s to offset %u: %m",
2718                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2719                                                         startoffset)));
2720                                 openLogOff = startoffset;
2721                         }
2722
2723                         /* OK to write the page(s) */
2724                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2725                         nbytes = npages * (Size) XLOG_BLCKSZ;
2726                         nleft = nbytes;
2727                         do
2728                         {
2729                                 errno = 0;
2730                                 written  = write(openLogFile, from, nleft);
2731                                 if (written <= 0)
2732                                 {
2733                                         if (errno == EINTR)
2734                                                 continue;
2735                                         ereport(PANIC,
2736                                                         (errcode_for_file_access(),
2737                                                          errmsg("could not write to log file %s "
2738                                                                         "at offset %u, length %lu: %m",
2739                                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2740                                                                         openLogOff, (unsigned long) nbytes)));
2741                                 }
2742                                 nleft -= written;
2743                                 from += written;
2744                         } while (nleft > 0);
2745
2746                         /* Update state for write */
2747                         openLogOff += nbytes;
2748                         npages = 0;
2749
2750                         /*
2751                          * If we just wrote the whole last page of a logfile segment,
2752                          * fsync the segment immediately.  This avoids having to go back
2753                          * and re-open prior segments when an fsync request comes along
2754                          * later. Doing it here ensures that one and only one backend will
2755                          * perform this fsync.
2756                          *
2757                          * This is also the right place to notify the Archiver that the
2758                          * segment is ready to copy to archival storage, and to update the
2759                          * timer for archive_timeout, and to signal for a checkpoint if
2760                          * too many logfile segments have been used since the last
2761                          * checkpoint.
2762                          */
2763                         if (finishing_seg)
2764                         {
2765                                 issue_xlog_fsync(openLogFile, openLogSegNo);
2766
2767                                 /* signal that we need to wakeup walsenders later */
2768                                 WalSndWakeupRequest();
2769
2770                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
2771
2772                                 if (XLogArchivingActive())
2773                                         XLogArchiveNotifySeg(openLogSegNo);
2774
2775                                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2776
2777                                 /*
2778                                  * Request a checkpoint if we've consumed too much xlog since
2779                                  * the last one.  For speed, we first check using the local
2780                                  * copy of RedoRecPtr, which might be out of date; if it looks
2781                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
2782                                  * recheck.
2783                                  */
2784                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2785                                 {
2786                                         (void) GetRedoRecPtr();
2787                                         if (XLogCheckpointNeeded(openLogSegNo))
2788                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2789                                 }
2790                         }
2791                 }
2792
2793                 if (ispartialpage)
2794                 {
2795                         /* Only asked to write a partial page */
2796                         LogwrtResult.Write = WriteRqst.Write;
2797                         break;
2798                 }
2799                 curridx = NextBufIdx(curridx);
2800
2801                 /* If flexible, break out of loop as soon as we wrote something */
2802                 if (flexible && npages == 0)
2803                         break;
2804         }
2805
2806         Assert(npages == 0);
2807
2808         /*
2809          * If asked to flush, do so
2810          */
2811         if (LogwrtResult.Flush < WriteRqst.Flush &&
2812                 LogwrtResult.Flush < LogwrtResult.Write)
2813
2814         {
2815                 /*
2816                  * Could get here without iterating above loop, in which case we might
2817                  * have no open file or the wrong one.  However, we do not need to
2818                  * fsync more than one file.
2819                  */
2820                 if (sync_method != SYNC_METHOD_OPEN &&
2821                         sync_method != SYNC_METHOD_OPEN_DSYNC)
2822                 {
2823                         if (openLogFile >= 0 &&
2824                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2825                                 XLogFileClose();
2826                         if (openLogFile < 0)
2827                         {
2828                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2829                                 openLogFile = XLogFileOpen(openLogSegNo);
2830                                 openLogOff = 0;
2831                         }
2832
2833                         issue_xlog_fsync(openLogFile, openLogSegNo);
2834                 }
2835
2836                 /* signal that we need to wakeup walsenders later */
2837                 WalSndWakeupRequest();
2838
2839                 LogwrtResult.Flush = LogwrtResult.Write;
2840         }
2841
2842         /*
2843          * Update shared-memory status
2844          *
2845          * We make sure that the shared 'request' values do not fall behind the
2846          * 'result' values.  This is not absolutely essential, but it saves some
2847          * code in a couple of places.
2848          */
2849         {
2850                 /* use volatile pointer to prevent code rearrangement */
2851                 volatile XLogCtlData *xlogctl = XLogCtl;
2852
2853                 SpinLockAcquire(&xlogctl->info_lck);
2854                 xlogctl->LogwrtResult = LogwrtResult;
2855                 if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
2856                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
2857                 if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
2858                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
2859                 SpinLockRelease(&xlogctl->info_lck);
2860         }
2861 }
2862
2863 /*
2864  * Record the LSN for an asynchronous transaction commit/abort
2865  * and nudge the WALWriter if there is work for it to do.
2866  * (This should not be called for synchronous commits.)
2867  */
2868 void
2869 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2870 {
2871         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
2872         bool            sleeping;
2873
2874         /* use volatile pointer to prevent code rearrangement */
2875         volatile XLogCtlData *xlogctl = XLogCtl;
2876
2877         SpinLockAcquire(&xlogctl->info_lck);
2878         LogwrtResult = xlogctl->LogwrtResult;
2879         sleeping = xlogctl->WalWriterSleeping;
2880         if (xlogctl->asyncXactLSN < asyncXactLSN)
2881                 xlogctl->asyncXactLSN = asyncXactLSN;
2882         SpinLockRelease(&xlogctl->info_lck);
2883
2884         /*
2885          * If the WALWriter is sleeping, we should kick it to make it come out of
2886          * low-power mode.      Otherwise, determine whether there's a full page of
2887          * WAL available to write.
2888          */
2889         if (!sleeping)
2890         {
2891                 /* back off to last completed page boundary */
2892                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2893
2894                 /* if we have already flushed that far, we're done */
2895                 if (WriteRqstPtr <= LogwrtResult.Flush)
2896                         return;
2897         }
2898
2899         /*
2900          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2901          * to come out of low-power mode so that this async commit will reach disk
2902          * within the expected amount of time.
2903          */
2904         if (ProcGlobal->walwriterLatch)
2905                 SetLatch(ProcGlobal->walwriterLatch);
2906 }
2907
2908 /*
2909  * Advance minRecoveryPoint in control file.
2910  *
2911  * If we crash during recovery, we must reach this point again before the
2912  * database is consistent.
2913  *
2914  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2915  * is only updated if it's not already greater than or equal to 'lsn'.
2916  */
2917 static void
2918 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2919 {
2920         /* Quick check using our local copy of the variable */
2921         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2922                 return;
2923
2924         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2925
2926         /* update local copy */
2927         minRecoveryPoint = ControlFile->minRecoveryPoint;
2928         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2929
2930         /*
2931          * An invalid minRecoveryPoint means that we need to recover all the WAL,
2932          * i.e., we're doing crash recovery.  We never modify the control file's
2933          * value in that case, so we can short-circuit future checks here too.
2934          */
2935         if (minRecoveryPoint == 0)
2936                 updateMinRecoveryPoint = false;
2937         else if (force || minRecoveryPoint < lsn)
2938         {
2939                 /* use volatile pointer to prevent code rearrangement */
2940                 volatile XLogCtlData *xlogctl = XLogCtl;
2941                 XLogRecPtr      newMinRecoveryPoint;
2942                 TimeLineID      newMinRecoveryPointTLI;
2943
2944                 /*
2945                  * To avoid having to update the control file too often, we update it
2946                  * all the way to the last record being replayed, even though 'lsn'
2947                  * would suffice for correctness.  This also allows the 'force' case
2948                  * to not need a valid 'lsn' value.
2949                  *
2950                  * Another important reason for doing it this way is that the passed
2951                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2952                  * the caller got it from a corrupted heap page.  Accepting such a
2953                  * value as the min recovery point would prevent us from coming up at
2954                  * all.  Instead, we just log a warning and continue with recovery.
2955                  * (See also the comments about corrupt LSNs in XLogFlush.)
2956                  */
2957                 SpinLockAcquire(&xlogctl->info_lck);
2958                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
2959                 newMinRecoveryPointTLI = xlogctl->replayEndTLI;
2960                 SpinLockRelease(&xlogctl->info_lck);
2961
2962                 if (!force && newMinRecoveryPoint < lsn)
2963                         elog(WARNING,
2964                            "xlog min recovery request %X/%X is past current point %X/%X",
2965                                  (uint32) (lsn >> 32), (uint32) lsn,
2966                                  (uint32) (newMinRecoveryPoint >> 32),
2967                                  (uint32) newMinRecoveryPoint);
2968
2969                 /* update control file */
2970                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2971                 {
2972                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2973                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2974                         UpdateControlFile();
2975                         minRecoveryPoint = newMinRecoveryPoint;
2976                         minRecoveryPointTLI = newMinRecoveryPointTLI;
2977
2978                         ereport(DEBUG2,
2979                                 (errmsg("updated min recovery point to %X/%X on timeline %u",
2980                                                 (uint32) (minRecoveryPoint >> 32),
2981                                                 (uint32) minRecoveryPoint,
2982                                                 newMinRecoveryPointTLI)));
2983                 }
2984         }
2985         LWLockRelease(ControlFileLock);
2986 }
2987
2988 /*
2989  * Ensure that all XLOG data through the given position is flushed to disk.
2990  *
2991  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2992  * already held, and we try to avoid acquiring it if possible.
2993  */
2994 void
2995 XLogFlush(XLogRecPtr record)
2996 {
2997         XLogRecPtr      WriteRqstPtr;
2998         XLogwrtRqst WriteRqst;
2999
3000         /*
3001          * During REDO, we are reading not writing WAL.  Therefore, instead of
3002          * trying to flush the WAL, we should update minRecoveryPoint instead. We
3003          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
3004          * to act this way too, and because when it tries to write the
3005          * end-of-recovery checkpoint, it should indeed flush.
3006          */
3007         if (!XLogInsertAllowed())
3008         {
3009                 UpdateMinRecoveryPoint(record, false);
3010                 return;
3011         }
3012
3013         /* Quick exit if already known flushed */
3014         if (record <= LogwrtResult.Flush)
3015                 return;
3016
3017 #ifdef WAL_DEBUG
3018         if (XLOG_DEBUG)
3019                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
3020                          (uint32) (record >> 32), (uint32) record,
3021                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3022                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3023 #endif
3024
3025         START_CRIT_SECTION();
3026
3027         /*
3028          * Since fsync is usually a horribly expensive operation, we try to
3029          * piggyback as much data as we can on each fsync: if we see any more data
3030          * entered into the xlog buffer, we'll write and fsync that too, so that
3031          * the final value of LogwrtResult.Flush is as large as possible. This
3032          * gives us some chance of avoiding another fsync immediately after.
3033          */
3034
3035         /* initialize to given target; may increase below */
3036         WriteRqstPtr = record;
3037
3038         /*
3039          * Now wait until we get the write lock, or someone else does the flush
3040          * for us.
3041          */
3042         for (;;)
3043         {
3044                 /* use volatile pointer to prevent code rearrangement */
3045                 volatile XLogCtlData *xlogctl = XLogCtl;
3046                 XLogRecPtr      insertpos;
3047
3048                 /* read LogwrtResult and update local state */
3049                 SpinLockAcquire(&xlogctl->info_lck);
3050                 if (WriteRqstPtr < xlogctl->LogwrtRqst.Write)
3051                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
3052                 LogwrtResult = xlogctl->LogwrtResult;
3053                 SpinLockRelease(&xlogctl->info_lck);
3054
3055                 /* done already? */
3056                 if (record <= LogwrtResult.Flush)
3057                         break;
3058
3059                 /*
3060                  * Before actually performing the write, wait for all in-flight
3061                  * insertions to the pages we're about to write to finish.
3062                  */
3063                 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
3064
3065                 /*
3066                  * Try to get the write lock. If we can't get it immediately, wait
3067                  * until it's released, and recheck if we still need to do the flush
3068                  * or if the backend that held the lock did it for us already. This
3069                  * helps to maintain a good rate of group committing when the system
3070                  * is bottlenecked by the speed of fsyncing.
3071                  */
3072                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
3073                 {
3074                         /*
3075                          * The lock is now free, but we didn't acquire it yet. Before we
3076                          * do, loop back to check if someone else flushed the record for
3077                          * us already.
3078                          */
3079                         continue;
3080                 }
3081
3082                 /* Got the lock; recheck whether request is satisfied */
3083                 LogwrtResult = XLogCtl->LogwrtResult;
3084                 if (record <= LogwrtResult.Flush)
3085                 {
3086                         LWLockRelease(WALWriteLock);
3087                         break;
3088                 }
3089
3090                 /*
3091                  * Sleep before flush! By adding a delay here, we may give further
3092                  * backends the opportunity to join the backlog of group commit
3093                  * followers; this can significantly improve transaction throughput,
3094                  * at the risk of increasing transaction latency.
3095                  *
3096                  * We do not sleep if enableFsync is not turned on, nor if there are
3097                  * fewer than CommitSiblings other backends with active transactions.
3098                  */
3099                 if (CommitDelay > 0 && enableFsync &&
3100                         MinimumActiveBackends(CommitSiblings))
3101                 {
3102                         pg_usleep(CommitDelay);
3103
3104                         /*
3105                          * Re-check how far we can now flush the WAL. It's generally not
3106                          * safe to call WaitXLogInsetionsToFinish while holding
3107                          * WALWriteLock, because an in-progress insertion might need to
3108                          * also grab WALWriteLock to make progress. But we know that all
3109                          * the insertions up to insertpos have already finished, because
3110                          * that's what the earlier WaitXLogInsertionsToFinish() returned.
3111                          * We're only calling it again to allow insertpos to be moved
3112                          * further forward, not to actually wait for anyone.
3113                          */
3114                         insertpos = WaitXLogInsertionsToFinish(insertpos);
3115                 }
3116
3117                 /* try to write/flush later additions to XLOG as well */
3118                 WriteRqst.Write = insertpos;
3119                 WriteRqst.Flush = insertpos;
3120
3121                 XLogWrite(WriteRqst, false);
3122
3123                 LWLockRelease(WALWriteLock);
3124                 /* done */
3125                 break;
3126         }
3127
3128         END_CRIT_SECTION();
3129
3130         /* wake up walsenders now that we've released heavily contended locks */
3131         WalSndWakeupProcessRequests();
3132
3133         /*
3134          * If we still haven't flushed to the request point then we have a
3135          * problem; most likely, the requested flush point is past end of XLOG.
3136          * This has been seen to occur when a disk page has a corrupted LSN.
3137          *
3138          * Formerly we treated this as a PANIC condition, but that hurts the
3139          * system's robustness rather than helping it: we do not want to take down
3140          * the whole system due to corruption on one data page.  In particular, if
3141          * the bad page is encountered again during recovery then we would be
3142          * unable to restart the database at all!  (This scenario actually
3143          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
3144          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
3145          * the only time we can reach here during recovery is while flushing the
3146          * end-of-recovery checkpoint record, and we don't expect that to have a
3147          * bad LSN.
3148          *
3149          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
3150          * since xact.c calls this routine inside a critical section.  However,
3151          * calls from bufmgr.c are not within critical sections and so we will not
3152          * force a restart for a bad LSN on a data page.
3153          */
3154         if (LogwrtResult.Flush < record)
3155                 elog(ERROR,
3156                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
3157                          (uint32) (record >> 32), (uint32) record,
3158                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3159 }
3160
3161 /*
3162  * Flush xlog, but without specifying exactly where to flush to.
3163  *
3164  * We normally flush only completed blocks; but if there is nothing to do on
3165  * that basis, we check for unflushed async commits in the current incomplete
3166  * block, and flush through the latest one of those.  Thus, if async commits
3167  * are not being used, we will flush complete blocks only.      We can guarantee
3168  * that async commits reach disk after at most three cycles; normally only
3169  * one or two.  (When flushing complete blocks, we allow XLogWrite to write
3170  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
3171  * difference only with very high load or long wal_writer_delay, but imposes
3172  * one extra cycle for the worst case for async commits.)
3173  *
3174  * This routine is invoked periodically by the background walwriter process.
3175  *
3176  * Returns TRUE if we flushed anything.
3177  */
3178 bool
3179 XLogBackgroundFlush(void)
3180 {
3181         XLogRecPtr      WriteRqstPtr;
3182         bool            flexible = true;
3183         bool            wrote_something = false;
3184
3185         /* XLOG doesn't need flushing during recovery */
3186         if (RecoveryInProgress())
3187                 return false;
3188
3189         /* read LogwrtResult and update local state */
3190         {
3191                 /* use volatile pointer to prevent code rearrangement */
3192                 volatile XLogCtlData *xlogctl = XLogCtl;
3193
3194                 SpinLockAcquire(&xlogctl->info_lck);
3195                 LogwrtResult = xlogctl->LogwrtResult;
3196                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
3197                 SpinLockRelease(&xlogctl->info_lck);
3198         }
3199
3200         /* back off to last completed page boundary */
3201         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
3202
3203         /* if we have already flushed that far, consider async commit records */
3204         if (WriteRqstPtr <= LogwrtResult.Flush)
3205         {
3206                 /* use volatile pointer to prevent code rearrangement */
3207                 volatile XLogCtlData *xlogctl = XLogCtl;
3208
3209                 SpinLockAcquire(&xlogctl->info_lck);
3210                 WriteRqstPtr = xlogctl->asyncXactLSN;
3211                 SpinLockRelease(&xlogctl->info_lck);
3212                 flexible = false;               /* ensure it all gets written */
3213         }
3214
3215         /*
3216          * If already known flushed, we're done. Just need to check if we are
3217          * holding an open file handle to a logfile that's no longer in use,
3218          * preventing the file from being deleted.
3219          */
3220         if (WriteRqstPtr <= LogwrtResult.Flush)
3221         {
3222                 if (openLogFile >= 0)
3223                 {
3224                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
3225                         {
3226                                 XLogFileClose();
3227                         }
3228                 }
3229                 return false;
3230         }
3231
3232 #ifdef WAL_DEBUG
3233         if (XLOG_DEBUG)
3234                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
3235                          (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
3236                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3237                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3238 #endif
3239
3240         START_CRIT_SECTION();
3241
3242         /* now wait for any in-progress insertions to finish and get write lock */
3243         WaitXLogInsertionsToFinish(WriteRqstPtr);
3244         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3245         LogwrtResult = XLogCtl->LogwrtResult;
3246         if (WriteRqstPtr > LogwrtResult.Flush)
3247         {
3248                 XLogwrtRqst WriteRqst;
3249
3250                 WriteRqst.Write = WriteRqstPtr;
3251                 WriteRqst.Flush = WriteRqstPtr;
3252                 XLogWrite(WriteRqst, flexible);
3253                 wrote_something = true;
3254         }
3255         LWLockRelease(WALWriteLock);
3256
3257         END_CRIT_SECTION();
3258
3259         /* wake up walsenders now that we've released heavily contended locks */
3260         WalSndWakeupProcessRequests();
3261
3262         /*
3263          * Great, done. To take some work off the critical path, try to initialize
3264          * as many of the no-longer-needed WAL buffers for future use as we can.
3265          */
3266         AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3267
3268         return wrote_something;
3269 }
3270
3271 /*
3272  * Test whether XLOG data has been flushed up to (at least) the given position.
3273  *
3274  * Returns true if a flush is still needed.  (It may be that someone else
3275  * is already in process of flushing that far, however.)
3276  */
3277 bool
3278 XLogNeedsFlush(XLogRecPtr record)
3279 {
3280         /*
3281          * During recovery, we don't flush WAL but update minRecoveryPoint
3282          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3283          * would need to be updated.
3284          */
3285         if (RecoveryInProgress())
3286         {
3287                 /* Quick exit if already known updated */
3288                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3289                         return false;
3290
3291                 /*
3292                  * Update local copy of minRecoveryPoint. But if the lock is busy,
3293                  * just return a conservative guess.
3294                  */
3295                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3296                         return true;
3297                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3298                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3299                 LWLockRelease(ControlFileLock);
3300
3301                 /*
3302                  * An invalid minRecoveryPoint means that we need to recover all the
3303                  * WAL, i.e., we're doing crash recovery.  We never modify the control
3304                  * file's value in that case, so we can short-circuit future checks
3305                  * here too.
3306                  */
3307                 if (minRecoveryPoint == 0)
3308                         updateMinRecoveryPoint = false;
3309
3310                 /* check again */
3311                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3312                         return false;
3313                 else
3314                         return true;
3315         }
3316
3317         /* Quick exit if already known flushed */
3318         if (record <= LogwrtResult.Flush)
3319                 return false;
3320
3321         /* read LogwrtResult and update local state */
3322         {
3323                 /* use volatile pointer to prevent code rearrangement */
3324                 volatile XLogCtlData *xlogctl = XLogCtl;
3325
3326                 SpinLockAcquire(&xlogctl->info_lck);
3327                 LogwrtResult = xlogctl->LogwrtResult;
3328                 SpinLockRelease(&xlogctl->info_lck);
3329         }
3330
3331         /* check again */
3332         if (record <= LogwrtResult.Flush)
3333                 return false;
3334
3335         return true;
3336 }
3337
3338 /*
3339  * Create a new XLOG file segment, or open a pre-existing one.
3340  *
3341  * log, seg: identify segment to be created/opened.
3342  *
3343  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
3344  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
3345  * file was used.
3346  *
3347  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3348  * place.  This should be TRUE except during bootstrap log creation.  The
3349  * caller must *not* hold the lock at call.
3350  *
3351  * Returns FD of opened file.
3352  *
3353  * Note: errors here are ERROR not PANIC because we might or might not be
3354  * inside a critical section (eg, during checkpoint there is no reason to
3355  * take down the system on failure).  They will promote to PANIC if we are
3356  * in a critical section.
3357  */
3358 int
3359 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3360 {
3361         char            path[MAXPGPATH];
3362         char            tmppath[MAXPGPATH];
3363         char       *zbuffer;
3364         XLogSegNo       installed_segno;
3365         int                     max_advance;
3366         int                     fd;
3367         int                     nbytes;
3368
3369         XLogFilePath(path, ThisTimeLineID, logsegno);
3370
3371         /*
3372          * Try to use existent file (checkpoint maker may have created it already)
3373          */
3374         if (*use_existent)
3375         {
3376                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3377                                                    S_IRUSR | S_IWUSR);
3378                 if (fd < 0)
3379                 {
3380                         if (errno != ENOENT)
3381                                 ereport(ERROR,
3382                                                 (errcode_for_file_access(),
3383                                                  errmsg("could not open file \"%s\": %m", path)));
3384                 }
3385                 else
3386                         return fd;
3387         }
3388
3389         /*
3390          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
3391          * another process is doing the same thing.  If so, we will end up
3392          * pre-creating an extra log segment.  That seems OK, and better than
3393          * holding the lock throughout this lengthy process.
3394          */
3395         elog(DEBUG2, "creating and filling new WAL file");
3396
3397         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3398
3399         unlink(tmppath);
3400
3401         /*
3402          * Allocate a buffer full of zeros. This is done before opening the file
3403          * so that we don't leak the file descriptor if palloc fails.
3404          *
3405          * Note: palloc zbuffer, instead of just using a local char array, to
3406          * ensure it is reasonably well-aligned; this may save a few cycles
3407          * transferring data to the kernel.
3408          */
3409         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
3410
3411         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3412         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3413                                            S_IRUSR | S_IWUSR);
3414         if (fd < 0)
3415                 ereport(ERROR,
3416                                 (errcode_for_file_access(),
3417                                  errmsg("could not create file \"%s\": %m", tmppath)));
3418
3419         /*
3420          * Zero-fill the file.  We have to do this the hard way to ensure that all
3421          * the file space has really been allocated --- on platforms that allow
3422          * "holes" in files, just seeking to the end doesn't allocate intermediate
3423          * space.  This way, we know that we have all the space and (after the
3424          * fsync below) that all the indirect blocks are down on disk.  Therefore,
3425          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
3426          * log file.
3427          */
3428         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
3429         {
3430                 errno = 0;
3431                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
3432                 {
3433                         int                     save_errno = errno;
3434
3435                         /*
3436                          * If we fail to make the file, delete it to release disk space
3437                          */
3438                         unlink(tmppath);
3439
3440                         close(fd);
3441
3442                         /* if write didn't set errno, assume problem is no disk space */
3443                         errno = save_errno ? save_errno : ENOSPC;
3444
3445                         ereport(ERROR,
3446                                         (errcode_for_file_access(),
3447                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3448                 }
3449         }
3450         pfree(zbuffer);
3451
3452         if (pg_fsync(fd) != 0)
3453         {
3454                 close(fd);
3455                 ereport(ERROR,
3456                                 (errcode_for_file_access(),
3457                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3458         }
3459
3460         if (close(fd))
3461                 ereport(ERROR,
3462                                 (errcode_for_file_access(),
3463                                  errmsg("could not close file \"%s\": %m", tmppath)));
3464
3465         /*
3466          * Now move the segment into place with its final name.
3467          *
3468          * If caller didn't want to use a pre-existing file, get rid of any
3469          * pre-existing file.  Otherwise, cope with possibility that someone else
3470          * has created the file while we were filling ours: if so, use ours to
3471          * pre-create a future log segment.
3472          */
3473         installed_segno = logsegno;
3474         max_advance = XLOGfileslop;
3475         if (!InstallXLogFileSegment(&installed_segno, tmppath,
3476                                                                 *use_existent, &max_advance,
3477                                                                 use_lock))
3478         {
3479                 /*
3480                  * No need for any more future segments, or InstallXLogFileSegment()
3481                  * failed to rename the file into place. If the rename failed, opening
3482                  * the file below will fail.
3483                  */
3484                 unlink(tmppath);
3485         }
3486
3487         /* Set flag to tell caller there was no existent file */
3488         *use_existent = false;
3489
3490         /* Now open original target segment (might not be file I just made) */
3491         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3492                                            S_IRUSR | S_IWUSR);
3493         if (fd < 0)
3494                 ereport(ERROR,
3495                                 (errcode_for_file_access(),
3496                                  errmsg("could not open file \"%s\": %m", path)));
3497
3498         elog(DEBUG2, "done creating and filling new WAL file");
3499
3500         return fd;
3501 }
3502
3503 /*
3504  * Create a new XLOG file segment by copying a pre-existing one.
3505  *
3506  * destsegno: identify segment to be created.
3507  *
3508  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
3509  *              a different timeline)
3510  *
3511  * Currently this is only used during recovery, and so there are no locking
3512  * considerations.      But we should be just as tense as XLogFileInit to avoid
3513  * emplacing a bogus file.
3514  */
3515 static void
3516 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
3517 {
3518         char            path[MAXPGPATH];
3519         char            tmppath[MAXPGPATH];
3520         char            buffer[XLOG_BLCKSZ];
3521         int                     srcfd;
3522         int                     fd;
3523         int                     nbytes;
3524
3525         /*
3526          * Open the source file
3527          */
3528         XLogFilePath(path, srcTLI, srcsegno);
3529         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
3530         if (srcfd < 0)
3531                 ereport(ERROR,
3532                                 (errcode_for_file_access(),
3533                                  errmsg("could not open file \"%s\": %m", path)));
3534
3535         /*
3536          * Copy into a temp file name.
3537          */
3538         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3539
3540         unlink(tmppath);
3541
3542         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3543         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3544                                                    S_IRUSR | S_IWUSR);
3545         if (fd < 0)
3546                 ereport(ERROR,
3547                                 (errcode_for_file_access(),
3548                                  errmsg("could not create file \"%s\": %m", tmppath)));
3549
3550         /*
3551          * Do the data copying.
3552          */
3553         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
3554         {
3555                 errno = 0;
3556                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3557                 {
3558                         if (errno != 0)
3559                                 ereport(ERROR,
3560                                                 (errcode_for_file_access(),
3561                                                  errmsg("could not read file \"%s\": %m", path)));
3562                         else
3563                                 ereport(ERROR,
3564                                                 (errmsg("not enough data in file \"%s\"", path)));
3565                 }
3566                 errno = 0;
3567                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3568                 {
3569                         int                     save_errno = errno;
3570
3571                         /*
3572                          * If we fail to make the file, delete it to release disk space
3573                          */
3574                         unlink(tmppath);
3575                         /* if write didn't set errno, assume problem is no disk space */
3576                         errno = save_errno ? save_errno : ENOSPC;
3577
3578                         ereport(ERROR,
3579                                         (errcode_for_file_access(),
3580                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3581                 }
3582         }
3583
3584         if (pg_fsync(fd) != 0)
3585                 ereport(ERROR,
3586                                 (errcode_for_file_access(),
3587                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3588
3589         if (CloseTransientFile(fd))
3590                 ereport(ERROR,
3591                                 (errcode_for_file_access(),
3592                                  errmsg("could not close file \"%s\": %m", tmppath)));
3593
3594         CloseTransientFile(srcfd);
3595
3596         /*
3597          * Now move the segment into place with its final name.
3598          */
3599         if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
3600                 elog(ERROR, "InstallXLogFileSegment should not have failed");
3601 }
3602
3603 /*
3604  * Install a new XLOG segment file as a current or future log segment.
3605  *
3606  * This is used both to install a newly-created segment (which has a temp
3607  * filename while it's being created) and to recycle an old segment.
3608  *
3609  * *segno: identify segment to install as (or first possible target).
3610  * When find_free is TRUE, this is modified on return to indicate the
3611  * actual installation location or last segment searched.
3612  *
3613  * tmppath: initial name of file to install.  It will be renamed into place.
3614  *
3615  * find_free: if TRUE, install the new segment at the first empty segno
3616  * number at or after the passed numbers.  If FALSE, install the new segment
3617  * exactly where specified, deleting any existing segment file there.
3618  *
3619  * *max_advance: maximum number of segno slots to advance past the starting
3620  * point.  Fail if no free slot is found in this range.  On return, reduced
3621  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
3622  * when find_free is FALSE.)
3623  *
3624  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3625  * place.  This should be TRUE except during bootstrap log creation.  The
3626  * caller must *not* hold the lock at call.
3627  *
3628  * Returns TRUE if the file was installed successfully.  FALSE indicates that
3629  * max_advance limit was exceeded, or an error occurred while renaming the
3630  * file into place.
3631  */
3632 static bool
3633 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3634                                            bool find_free, int *max_advance,
3635                                            bool use_lock)
3636 {
3637         char            path[MAXPGPATH];
3638         struct stat stat_buf;
3639
3640         XLogFilePath(path, ThisTimeLineID, *segno);
3641
3642         /*
3643          * We want to be sure that only one process does this at a time.
3644          */
3645         if (use_lock)
3646                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3647
3648         if (!find_free)
3649         {
3650                 /* Force installation: get rid of any pre-existing segment file */
3651                 unlink(path);
3652         }
3653         else
3654         {
3655                 /* Find a free slot to put it in */
3656                 while (stat(path, &stat_buf) == 0)
3657                 {
3658                         if (*max_advance <= 0)
3659                         {
3660                                 /* Failed to find a free slot within specified range */
3661                                 if (use_lock)
3662                                         LWLockRelease(ControlFileLock);
3663                                 return false;
3664                         }
3665                         (*segno)++;
3666                         (*max_advance)--;
3667                         XLogFilePath(path, ThisTimeLineID, *segno);
3668                 }
3669         }
3670
3671         /*
3672          * Prefer link() to rename() here just to be really sure that we don't
3673          * overwrite an existing logfile.  However, there shouldn't be one, so
3674          * rename() is an acceptable substitute except for the truly paranoid.
3675          */
3676 #if HAVE_WORKING_LINK
3677         if (link(tmppath, path) < 0)
3678         {
3679                 if (use_lock)
3680                         LWLockRelease(ControlFileLock);
3681                 ereport(LOG,
3682                                 (errcode_for_file_access(),
3683                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
3684                                                 tmppath, path)));
3685                 return false;
3686         }
3687         unlink(tmppath);
3688 #else
3689         if (rename(tmppath, path) < 0)
3690         {
3691                 if (use_lock)
3692                         LWLockRelease(ControlFileLock);
3693                 ereport(LOG,
3694                                 (errcode_for_file_access(),
3695                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
3696                                                 tmppath, path)));
3697                 return false;
3698         }
3699 #endif
3700
3701         if (use_lock)
3702                 LWLockRelease(ControlFileLock);
3703
3704         return true;
3705 }
3706
3707 /*
3708  * Open a pre-existing logfile segment for writing.
3709  */
3710 int
3711 XLogFileOpen(XLogSegNo segno)
3712 {
3713         char            path[MAXPGPATH];
3714         int                     fd;
3715
3716         XLogFilePath(path, ThisTimeLineID, segno);
3717
3718         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3719                                            S_IRUSR | S_IWUSR);
3720         if (fd < 0)
3721                 ereport(PANIC,
3722                                 (errcode_for_file_access(),
3723                                  errmsg("could not open transaction log file \"%s\": %m", path)));
3724
3725         return fd;
3726 }
3727
3728 /*
3729  * Open a logfile segment for reading (during recovery).
3730  *
3731  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3732  * Otherwise, it's assumed to be already available in pg_xlog.
3733  */
3734 static int
3735 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3736                          int source, bool notfoundOk)
3737 {
3738         char            xlogfname[MAXFNAMELEN];
3739         char            activitymsg[MAXFNAMELEN + 16];
3740         char            path[MAXPGPATH];
3741         int                     fd;
3742
3743         XLogFileName(xlogfname, tli, segno);
3744
3745         switch (source)
3746         {
3747                 case XLOG_FROM_ARCHIVE:
3748                         /* Report recovery progress in PS display */
3749                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3750                                          xlogfname);
3751                         set_ps_display(activitymsg, false);
3752
3753                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3754                                                                                                           "RECOVERYXLOG",
3755                                                                                                           XLogSegSize,
3756                                                                                                           InRedo);
3757                         if (!restoredFromArchive)
3758                                 return -1;
3759                         break;
3760
3761                 case XLOG_FROM_PG_XLOG:
3762                 case XLOG_FROM_STREAM:
3763                         XLogFilePath(path, tli, segno);
3764                         restoredFromArchive = false;
3765                         break;
3766
3767                 default:
3768                         elog(ERROR, "invalid XLogFileRead source %d", source);
3769         }
3770
3771         /*
3772          * If the segment was fetched from archival storage, replace the existing
3773          * xlog segment (if any) with the archival version.
3774          */
3775         if (source == XLOG_FROM_ARCHIVE)
3776         {
3777                 KeepFileRestoredFromArchive(path, xlogfname);
3778
3779                 /*
3780                  * Set path to point at the new file in pg_xlog.
3781                  */
3782                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3783         }
3784
3785         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
3786         if (fd >= 0)
3787         {
3788                 /* Success! */
3789                 curFileTLI = tli;
3790
3791                 /* Report recovery progress in PS display */
3792                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3793                                  xlogfname);
3794                 set_ps_display(activitymsg, false);
3795
3796                 /* Track source of data in assorted state variables */
3797                 readSource = source;
3798                 XLogReceiptSource = source;
3799                 /* In FROM_STREAM case, caller tracks receipt time, not me */
3800                 if (source != XLOG_FROM_STREAM)
3801                         XLogReceiptTime = GetCurrentTimestamp();
3802
3803                 return fd;
3804         }
3805         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3806                 ereport(PANIC,
3807                                 (errcode_for_file_access(),
3808                                  errmsg("could not open file \"%s\": %m", path)));
3809         return -1;
3810 }
3811
3812 /*
3813  * Open a logfile segment for reading (during recovery).
3814  *
3815  * This version searches for the segment with any TLI listed in expectedTLEs.
3816  */
3817 static int
3818 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3819 {
3820         char            path[MAXPGPATH];
3821         ListCell   *cell;
3822         int                     fd;
3823         List       *tles;
3824
3825         /*
3826          * Loop looking for a suitable timeline ID: we might need to read any of
3827          * the timelines listed in expectedTLEs.
3828          *
3829          * We expect curFileTLI on entry to be the TLI of the preceding file in
3830          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
3831          * to go backwards; this prevents us from picking up the wrong file when a
3832          * parent timeline extends to higher segment numbers than the child we
3833          * want to read.
3834          *
3835          * If we haven't read the timeline history file yet, read it now, so that
3836          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
3837          * however, unless we actually find a valid segment.  That way if there is
3838          * neither a timeline history file nor a WAL segment in the archive, and
3839          * streaming replication is set up, we'll read the timeline history file
3840          * streamed from the master when we start streaming, instead of recovering
3841          * with a dummy history generated here.
3842          */
3843         if (expectedTLEs)
3844                 tles = expectedTLEs;
3845         else
3846                 tles = readTimeLineHistory(recoveryTargetTLI);
3847
3848         foreach(cell, tles)
3849         {
3850                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3851
3852                 if (tli < curFileTLI)
3853                         break;                          /* don't bother looking at too-old TLIs */
3854
3855                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3856                 {
3857                         fd = XLogFileRead(segno, emode, tli,
3858                                                           XLOG_FROM_ARCHIVE, true);
3859                         if (fd != -1)
3860                         {
3861                                 elog(DEBUG1, "got WAL segment from archive");
3862                                 if (!expectedTLEs)
3863                                         expectedTLEs = tles;
3864                                 return fd;
3865                         }
3866                 }
3867
3868                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
3869                 {
3870                         fd = XLogFileRead(segno, emode, tli,
3871                                                           XLOG_FROM_PG_XLOG, true);
3872                         if (fd != -1)
3873                         {
3874                                 if (!expectedTLEs)
3875                                         expectedTLEs = tles;
3876                                 return fd;
3877                         }
3878                 }
3879         }
3880
3881         /* Couldn't find it.  For simplicity, complain about front timeline */
3882         XLogFilePath(path, recoveryTargetTLI, segno);
3883         errno = ENOENT;
3884         ereport(emode,
3885                         (errcode_for_file_access(),
3886                          errmsg("could not open file \"%s\": %m", path)));
3887         return -1;
3888 }
3889
3890 /*
3891  * Close the current logfile segment for writing.
3892  */
3893 static void
3894 XLogFileClose(void)
3895 {
3896         Assert(openLogFile >= 0);
3897
3898         /*
3899          * WAL segment files will not be re-read in normal operation, so we advise
3900          * the OS to release any cached pages.  But do not do so if WAL archiving
3901          * or streaming is active, because archiver and walsender process could
3902          * use the cache to read the WAL segment.
3903          */
3904 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3905         if (!XLogIsNeeded())
3906                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3907 #endif
3908
3909         if (close(openLogFile))
3910                 ereport(PANIC,
3911                                 (errcode_for_file_access(),
3912                                  errmsg("could not close log file %s: %m",
3913                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3914         openLogFile = -1;
3915 }
3916
3917 /*
3918  * Preallocate log files beyond the specified log endpoint.
3919  *
3920  * XXX this is currently extremely conservative, since it forces only one
3921  * future log segment to exist, and even that only if we are 75% done with
3922  * the current one.  This is only appropriate for very low-WAL-volume systems.
3923  * High-volume systems will be OK once they've built up a sufficient set of
3924  * recycled log segments, but the startup transient is likely to include
3925  * a lot of segment creations by foreground processes, which is not so good.
3926  */
3927 static void
3928 PreallocXlogFiles(XLogRecPtr endptr)
3929 {
3930         XLogSegNo       _logSegNo;
3931         int                     lf;
3932         bool            use_existent;
3933
3934         XLByteToPrevSeg(endptr, _logSegNo);
3935         if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
3936         {
3937                 _logSegNo++;
3938                 use_existent = true;
3939                 lf = XLogFileInit(_logSegNo, &use_existent, true);
3940                 close(lf);
3941                 if (!use_existent)
3942                         CheckpointStats.ckpt_segs_added++;
3943         }
3944 }
3945
3946 /*
3947  * Throws an error if the given log segment has already been removed or
3948  * recycled. The caller should only pass a segment that it knows to have
3949  * existed while the server has been running, as this function always
3950  * succeeds if no WAL segments have been removed since startup.
3951  * 'tli' is only used in the error message.
3952  */
3953 void
3954 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3955 {
3956         /* use volatile pointer to prevent code rearrangement */
3957         volatile XLogCtlData *xlogctl = XLogCtl;
3958         XLogSegNo       lastRemovedSegNo;
3959
3960         SpinLockAcquire(&xlogctl->info_lck);
3961         lastRemovedSegNo = xlogctl->lastRemovedSegNo;
3962         SpinLockRelease(&xlogctl->info_lck);
3963
3964         if (segno <= lastRemovedSegNo)
3965         {
3966                 char            filename[MAXFNAMELEN];
3967
3968                 XLogFileName(filename, tli, segno);
3969                 ereport(ERROR,
3970                                 (errcode_for_file_access(),
3971                                  errmsg("requested WAL segment %s has already been removed",
3972                                                 filename)));
3973         }
3974 }
3975
3976 /*
3977  * Update the last removed segno pointer in shared memory, to reflect
3978  * that the given XLOG file has been removed.
3979  */
3980 static void
3981 UpdateLastRemovedPtr(char *filename)
3982 {
3983         /* use volatile pointer to prevent code rearrangement */
3984         volatile XLogCtlData *xlogctl = XLogCtl;
3985         uint32          tli;
3986         XLogSegNo       segno;
3987
3988         XLogFromFileName(filename, &tli, &segno);
3989
3990         SpinLockAcquire(&xlogctl->info_lck);
3991         if (segno > xlogctl->lastRemovedSegNo)
3992                 xlogctl->lastRemovedSegNo = segno;
3993         SpinLockRelease(&xlogctl->info_lck);
3994 }
3995
3996 /*
3997  * Recycle or remove all log files older or equal to passed segno
3998  *
3999  * endptr is current (or recent) end of xlog; this is used to determine
4000  * whether we want to recycle rather than delete no-longer-wanted log files.
4001  */
4002 static void
4003 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
4004 {
4005         XLogSegNo       endlogSegNo;
4006         int                     max_advance;
4007         DIR                *xldir;
4008         struct dirent *xlde;
4009         char            lastoff[MAXFNAMELEN];
4010         char            path[MAXPGPATH];
4011
4012 #ifdef WIN32
4013         char            newpath[MAXPGPATH];
4014 #endif
4015         struct stat statbuf;
4016
4017         /*
4018          * Initialize info about where to try to recycle to.  We allow recycling
4019          * segments up to XLOGfileslop segments beyond the current XLOG location.
4020          */
4021         XLByteToPrevSeg(endptr, endlogSegNo);
4022         max_advance = XLOGfileslop;
4023
4024         xldir = AllocateDir(XLOGDIR);
4025         if (xldir == NULL)
4026                 ereport(ERROR,
4027                                 (errcode_for_file_access(),
4028                                  errmsg("could not open transaction log directory \"%s\": %m",
4029                                                 XLOGDIR)));
4030
4031         /*
4032          * Construct a filename of the last segment to be kept. The timeline ID
4033          * doesn't matter, we ignore that in the comparison. (During recovery,
4034          * ThisTimeLineID isn't set, so we can't use that.)
4035          */
4036         XLogFileName(lastoff, 0, segno);
4037
4038         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
4039                  lastoff);
4040
4041         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4042         {
4043                 /*
4044                  * We ignore the timeline part of the XLOG segment identifiers in
4045                  * deciding whether a segment is still needed.  This ensures that we
4046                  * won't prematurely remove a segment from a parent timeline. We could
4047                  * probably be a little more proactive about removing segments of
4048                  * non-parent timelines, but that would be a whole lot more
4049                  * complicated.
4050                  *
4051                  * We use the alphanumeric sorting property of the filenames to decide
4052                  * which ones are earlier than the lastoff segment.
4053                  */
4054                 if (strlen(xlde->d_name) == 24 &&
4055                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
4056                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
4057                 {
4058                         if (XLogArchiveCheckDone(xlde->d_name))
4059                         {
4060                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4061
4062                                 /* Update the last removed location in shared memory first */
4063                                 UpdateLastRemovedPtr(xlde->d_name);
4064
4065                                 /*
4066                                  * Before deleting the file, see if it can be recycled as a
4067                                  * future log segment. Only recycle normal files, pg_standby
4068                                  * for example can create symbolic links pointing to a
4069                                  * separate archive directory.
4070                                  */
4071                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
4072                                         InstallXLogFileSegment(&endlogSegNo, path,
4073                                                                                    true, &max_advance, true))
4074                                 {
4075                                         ereport(DEBUG2,
4076                                                         (errmsg("recycled transaction log file \"%s\"",
4077                                                                         xlde->d_name)));
4078                                         CheckpointStats.ckpt_segs_recycled++;
4079                                         /* Needn't recheck that slot on future iterations */
4080                                         if (max_advance > 0)
4081                                         {
4082                                                 endlogSegNo++;
4083                                                 max_advance--;
4084                                         }
4085                                 }
4086                                 else
4087                                 {
4088                                         /* No need for any more future segments... */
4089                                         int                     rc;
4090
4091                                         ereport(DEBUG2,
4092                                                         (errmsg("removing transaction log file \"%s\"",
4093                                                                         xlde->d_name)));
4094
4095 #ifdef WIN32
4096
4097                                         /*
4098                                          * On Windows, if another process (e.g another backend)
4099                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
4100                                          * will succeed, but the file will still show up in
4101                                          * directory listing until the last handle is closed. To
4102                                          * avoid confusing the lingering deleted file for a live
4103                                          * WAL file that needs to be archived, rename it before
4104                                          * deleting it.
4105                                          *
4106                                          * If another process holds the file open without
4107                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
4108                                          * again at the next checkpoint.
4109                                          */
4110                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4111                                         if (rename(path, newpath) != 0)
4112                                         {
4113                                                 ereport(LOG,
4114                                                                 (errcode_for_file_access(),
4115                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
4116                                                                                 path)));
4117                                                 continue;
4118                                         }
4119                                         rc = unlink(newpath);
4120 #else
4121                                         rc = unlink(path);
4122 #endif
4123                                         if (rc != 0)
4124                                         {
4125                                                 ereport(LOG,
4126                                                                 (errcode_for_file_access(),
4127                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
4128                                                                                 path)));
4129                                                 continue;
4130                                         }
4131                                         CheckpointStats.ckpt_segs_removed++;
4132                                 }
4133
4134                                 XLogArchiveCleanup(xlde->d_name);
4135                         }
4136                 }
4137         }
4138
4139         FreeDir(xldir);
4140 }
4141
4142 /*
4143  * Verify whether pg_xlog and pg_xlog/archive_status exist.
4144  * If the latter does not exist, recreate it.
4145  *
4146  * It is not the goal of this function to verify the contents of these
4147  * directories, but to help in cases where someone has performed a cluster
4148  * copy for PITR purposes but omitted pg_xlog from the copy.
4149  *
4150  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
4151  * policy decision was made not to.  It is fairly common for pg_xlog to be
4152  * a symlink, and if that was the DBA's intent then automatically making a
4153  * plain directory would result in degraded performance with no notice.
4154  */
4155 static void
4156 ValidateXLOGDirectoryStructure(void)
4157 {
4158         char            path[MAXPGPATH];
4159         struct stat stat_buf;
4160
4161         /* Check for pg_xlog; if it doesn't exist, error out */
4162         if (stat(XLOGDIR, &stat_buf) != 0 ||
4163                 !S_ISDIR(stat_buf.st_mode))
4164                 ereport(FATAL,
4165                                 (errmsg("required WAL directory \"%s\" does not exist",
4166                                                 XLOGDIR)));
4167
4168         /* Check for archive_status */
4169         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4170         if (stat(path, &stat_buf) == 0)
4171         {
4172                 /* Check for weird cases where it exists but isn't a directory */
4173                 if (!S_ISDIR(stat_buf.st_mode))
4174                         ereport(FATAL,
4175                                         (errmsg("required WAL directory \"%s\" does not exist",
4176                                                         path)));
4177         }
4178         else
4179         {
4180                 ereport(LOG,
4181                                 (errmsg("creating missing WAL directory \"%s\"", path)));
4182                 if (mkdir(path, S_IRWXU) < 0)
4183                         ereport(FATAL,
4184                                         (errmsg("could not create missing directory \"%s\": %m",
4185                                                         path)));
4186         }
4187 }
4188
4189 /*
4190  * Remove previous backup history files.  This also retries creation of
4191  * .ready files for any backup history files for which XLogArchiveNotify
4192  * failed earlier.
4193  */
4194 static void
4195 CleanupBackupHistory(void)
4196 {
4197         DIR                *xldir;
4198         struct dirent *xlde;
4199         char            path[MAXPGPATH];
4200
4201         xldir = AllocateDir(XLOGDIR);
4202         if (xldir == NULL)
4203                 ereport(ERROR,
4204                                 (errcode_for_file_access(),
4205                                  errmsg("could not open transaction log directory \"%s\": %m",
4206                                                 XLOGDIR)));
4207
4208         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4209         {
4210                 if (strlen(xlde->d_name) > 24 &&
4211                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
4212                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
4213                                    ".backup") == 0)
4214                 {
4215                         if (XLogArchiveCheckDone(xlde->d_name))
4216                         {
4217                                 ereport(DEBUG2,
4218                                 (errmsg("removing transaction log backup history file \"%s\"",
4219                                                 xlde->d_name)));
4220                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4221                                 unlink(path);
4222                                 XLogArchiveCleanup(xlde->d_name);
4223                         }
4224                 }
4225         }
4226
4227         FreeDir(xldir);
4228 }
4229
4230 /*
4231  * Restore a full-page image from a backup block attached to an XLOG record.
4232  *
4233  * lsn: LSN of the XLOG record being replayed
4234  * record: the complete XLOG record
4235  * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1)
4236  * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock
4237  * keep_buffer: TRUE to return the buffer still locked and pinned
4238  *
4239  * Returns the buffer number containing the page.  Note this is not terribly
4240  * useful unless keep_buffer is specified as TRUE.
4241  *
4242  * Note: when a backup block is available in XLOG, we restore it
4243  * unconditionally, even if the page in the database appears newer.
4244  * This is to protect ourselves against database pages that were partially
4245  * or incorrectly written during a crash.  We assume that the XLOG data
4246  * must be good because it has passed a CRC check, while the database
4247  * page might not be.  This will force us to replay all subsequent
4248  * modifications of the page that appear in XLOG, rather than possibly
4249  * ignoring them as already applied, but that's not a huge drawback.
4250  *
4251  * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer,
4252  * else a normal exclusive lock is used.  During crash recovery, that's just
4253  * pro forma because there can't be any regular backends in the system, but
4254  * in hot standby mode the distinction is important.
4255  *
4256  * If 'keep_buffer' is true, return without releasing the buffer lock and pin;
4257  * then caller is responsible for doing UnlockReleaseBuffer() later.  This
4258  * is needed in some cases when replaying XLOG records that touch multiple
4259  * pages, to prevent inconsistent states from being visible to other backends.
4260  * (Again, that's only important in hot standby mode.)
4261  */
4262 Buffer
4263 RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
4264                                    bool get_cleanup_lock, bool keep_buffer)
4265 {
4266         BkpBlock        bkpb;
4267         char       *blk;
4268         int                     i;
4269
4270         /* Locate requested BkpBlock in the record */
4271         blk = (char *) XLogRecGetData(record) + record->xl_len;
4272         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
4273         {
4274                 if (!(record->xl_info & XLR_BKP_BLOCK(i)))
4275                         continue;
4276
4277                 memcpy(&bkpb, blk, sizeof(BkpBlock));
4278                 blk += sizeof(BkpBlock);
4279
4280                 if (i == block_index)
4281                 {
4282                         /* Found it, apply the update */
4283                         return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock,
4284                                                                                           keep_buffer);
4285                 }
4286
4287                 blk += BLCKSZ - bkpb.hole_length;
4288         }
4289
4290         /* Caller specified a bogus block_index */
4291         elog(ERROR, "failed to restore block_index %d", block_index);
4292         return InvalidBuffer;           /* keep compiler quiet */
4293 }
4294
4295 /*
4296  * Workhorse for RestoreBackupBlock usable without an xlog record
4297  *
4298  * Restores a full-page image from BkpBlock and a data pointer.
4299  */
4300 static Buffer
4301 RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
4302                                                    bool get_cleanup_lock, bool keep_buffer)
4303 {
4304         Buffer          buffer;
4305         Page            page;
4306
4307         buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
4308                                                                         RBM_ZERO);
4309         Assert(BufferIsValid(buffer));
4310         if (get_cleanup_lock)
4311                 LockBufferForCleanup(buffer);
4312         else
4313                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4314
4315         page = (Page) BufferGetPage(buffer);
4316
4317         if (bkpb.hole_length == 0)
4318         {
4319                 memcpy((char *) page, blk, BLCKSZ);
4320         }
4321         else
4322         {
4323                 memcpy((char *) page, blk, bkpb.hole_offset);
4324                 /* must zero-fill the hole */
4325                 MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
4326                 memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
4327                            blk + bkpb.hole_offset,
4328                            BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
4329         }
4330
4331         /*
4332          * The checksum value on this page is currently invalid. We don't need to
4333          * reset it here since it will be set before being written.
4334          */
4335
4336         PageSetLSN(page, lsn);
4337         MarkBufferDirty(buffer);
4338
4339         if (!keep_buffer)
4340                 UnlockReleaseBuffer(buffer);
4341
4342         return buffer;
4343 }
4344
4345 /*
4346  * Attempt to read an XLOG record.
4347  *
4348  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
4349  * try to read a record just after the last one previously read.
4350  *
4351  * If no valid record is available, returns NULL, or fails if emode is PANIC.
4352  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4353  * record is available.
4354  *
4355  * The record is copied into readRecordBuf, so that on successful return,
4356  * the returned record pointer always points there.
4357  */
4358 static XLogRecord *
4359 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
4360                    bool fetching_ckpt)
4361 {
4362         XLogRecord *record;
4363         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4364
4365         /* Pass through parameters to XLogPageRead */
4366         private->fetching_ckpt = fetching_ckpt;
4367         private->emode = emode;
4368         private->randAccess = (RecPtr != InvalidXLogRecPtr);
4369
4370         /* This is the first attempt to read this page. */
4371         lastSourceFailed = false;
4372
4373         for (;;)
4374         {
4375                 char       *errormsg;
4376
4377                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
4378                 ReadRecPtr = xlogreader->ReadRecPtr;
4379                 EndRecPtr = xlogreader->EndRecPtr;
4380                 if (record == NULL)
4381                 {
4382                         if (readFile >= 0)
4383                         {
4384                                 close(readFile);
4385                                 readFile = -1;
4386                         }
4387
4388                         /*
4389                          * We only end up here without a message when XLogPageRead()
4390                          * failed - in that case we already logged something. In
4391                          * StandbyMode that only happens if we have been triggered, so we
4392                          * shouldn't loop anymore in that case.
4393                          */
4394                         if (errormsg)
4395                                 ereport(emode_for_corrupt_record(emode,
4396                                                                                                  RecPtr ? RecPtr : EndRecPtr),
4397                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
4398                 }
4399
4400                 /*
4401                  * Check page TLI is one of the expected values.
4402                  */
4403                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4404                 {
4405                         char            fname[MAXFNAMELEN];
4406                         XLogSegNo       segno;
4407                         int32           offset;
4408
4409                         XLByteToSeg(xlogreader->latestPagePtr, segno);
4410                         offset = xlogreader->latestPagePtr % XLogSegSize;
4411                         XLogFileName(fname, xlogreader->readPageTLI, segno);
4412                         ereport(emode_for_corrupt_record(emode,
4413                                                                                          RecPtr ? RecPtr : EndRecPtr),
4414                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4415                                         xlogreader->latestPageTLI,
4416                                         fname,
4417                                         offset)));
4418                         record = NULL;
4419                 }
4420
4421                 if (record)
4422                 {
4423                         /* Great, got a record */
4424                         return record;
4425                 }
4426                 else
4427                 {
4428                         /* No valid record available from this source */
4429                         lastSourceFailed = true;
4430
4431                         /*
4432                          * If archive recovery was requested, but we were still doing
4433                          * crash recovery, switch to archive recovery and retry using the
4434                          * offline archive. We have now replayed all the valid WAL in
4435                          * pg_xlog, so we are presumably now consistent.
4436                          *
4437                          * We require that there's at least some valid WAL present in
4438                          * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
4439                          * from the archive, even if pg_xlog is completely empty, but we'd
4440                          * have no idea how far we'd have to replay to reach consistency.
4441                          * So err on the safe side and give up.
4442                          */
4443                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4444                                 !fetching_ckpt)
4445                         {
4446                                 ereport(DEBUG1,
4447                                                 (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
4448                                 InArchiveRecovery = true;
4449                                 if (StandbyModeRequested)
4450                                         StandbyMode = true;
4451
4452                                 /* initialize minRecoveryPoint to this record */
4453                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4454                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4455                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
4456                                 {
4457                                         ControlFile->minRecoveryPoint = EndRecPtr;
4458                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4459                                 }
4460                                 /* update local copy */
4461                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
4462                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4463
4464                                 UpdateControlFile();
4465                                 LWLockRelease(ControlFileLock);
4466
4467                                 CheckRecoveryConsistency();
4468
4469                                 /*
4470                                  * Before we retry, reset lastSourceFailed and currentSource
4471                                  * so that we will check the archive next.
4472                                  */
4473                                 lastSourceFailed = false;
4474                                 currentSource = 0;
4475
4476                                 continue;
4477                         }
4478
4479                         /* In standby mode, loop back to retry. Otherwise, give up. */
4480                         if (StandbyMode && !CheckForStandbyTrigger())
4481                                 continue;
4482                         else
4483                                 return NULL;
4484                 }
4485         }
4486 }
4487
4488 /*
4489  * Scan for new timelines that might have appeared in the archive since we
4490  * started recovery.
4491  *
4492  * If there are any, the function changes recovery target TLI to the latest
4493  * one and returns 'true'.
4494  */
4495 static bool
4496 rescanLatestTimeLine(void)
4497 {
4498         List       *newExpectedTLEs;
4499         bool            found;
4500         ListCell   *cell;
4501         TimeLineID      newtarget;
4502         TimeLineID      oldtarget = recoveryTargetTLI;
4503         TimeLineHistoryEntry *currentTle = NULL;
4504
4505         newtarget = findNewestTimeLine(recoveryTargetTLI);
4506         if (newtarget == recoveryTargetTLI)
4507         {
4508                 /* No new timelines found */
4509                 return false;
4510         }
4511
4512         /*
4513          * Determine the list of expected TLIs for the new TLI
4514          */
4515
4516         newExpectedTLEs = readTimeLineHistory(newtarget);
4517
4518         /*
4519          * If the current timeline is not part of the history of the new timeline,
4520          * we cannot proceed to it.
4521          */
4522         found = false;
4523         foreach(cell, newExpectedTLEs)
4524         {
4525                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4526
4527                 if (currentTle->tli == recoveryTargetTLI)
4528                 {
4529                         found = true;
4530                         break;
4531                 }
4532         }
4533         if (!found)
4534         {
4535                 ereport(LOG,
4536                                 (errmsg("new timeline %u is not a child of database system timeline %u",
4537                                                 newtarget,
4538                                                 ThisTimeLineID)));
4539                 return false;
4540         }
4541
4542         /*
4543          * The current timeline was found in the history file, but check that the
4544          * next timeline was forked off from it *after* the current recovery
4545          * location.
4546          */
4547         if (currentTle->end < EndRecPtr)
4548         {
4549                 ereport(LOG,
4550                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4551                                                 newtarget,
4552                                                 ThisTimeLineID,
4553                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4554                 return false;
4555         }
4556
4557         /* The new timeline history seems valid. Switch target */
4558         recoveryTargetTLI = newtarget;
4559         list_free_deep(expectedTLEs);
4560         expectedTLEs = newExpectedTLEs;
4561
4562         /*
4563          * As in StartupXLOG(), try to ensure we have all the history files
4564          * between the old target and new target in pg_xlog.
4565          */
4566         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4567
4568         ereport(LOG,
4569                         (errmsg("new target timeline is %u",
4570                                         recoveryTargetTLI)));
4571
4572         return true;
4573 }
4574
4575 /*
4576  * I/O routines for pg_control
4577  *
4578  * *ControlFile is a buffer in shared memory that holds an image of the
4579  * contents of pg_control.      WriteControlFile() initializes pg_control
4580  * given a preloaded buffer, ReadControlFile() loads the buffer from
4581  * the pg_control file (during postmaster or standalone-backend startup),
4582  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4583  *
4584  * For simplicity, WriteControlFile() initializes the fields of pg_control
4585  * that are related to checking backend/database compatibility, and
4586  * ReadControlFile() verifies they are correct.  We could split out the
4587  * I/O and compatibility-check functions, but there seems no need currently.
4588  */
4589 static void
4590 WriteControlFile(void)
4591 {
4592         int                     fd;
4593         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4594
4595         /*
4596          * Initialize version and compatibility-check fields
4597          */
4598         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4599         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4600
4601         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4602         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4603
4604         ControlFile->blcksz = BLCKSZ;
4605         ControlFile->relseg_size = RELSEG_SIZE;
4606         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4607         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4608
4609         ControlFile->nameDataLen = NAMEDATALEN;
4610         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4611
4612         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4613
4614 #ifdef HAVE_INT64_TIMESTAMP
4615         ControlFile->enableIntTimes = true;
4616 #else
4617         ControlFile->enableIntTimes = false;
4618 #endif
4619         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4620         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4621
4622         /* Contents are protected with a CRC */
4623         INIT_CRC32(ControlFile->crc);
4624         COMP_CRC32(ControlFile->crc,
4625                            (char *) ControlFile,
4626                            offsetof(ControlFileData, crc));
4627         FIN_CRC32(ControlFile->crc);
4628
4629         /*
4630          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4631          * excess over sizeof(ControlFileData).  This reduces the odds of
4632          * premature-EOF errors when reading pg_control.  We'll still fail when we
4633          * check the contents of the file, but hopefully with a more specific
4634          * error than "couldn't read pg_control".
4635          */
4636         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4637                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4638
4639         memset(buffer, 0, PG_CONTROL_SIZE);
4640         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4641
4642         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4643                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4644                                            S_IRUSR | S_IWUSR);
4645         if (fd < 0)
4646                 ereport(PANIC,
4647                                 (errcode_for_file_access(),
4648                                  errmsg("could not create control file \"%s\": %m",
4649                                                 XLOG_CONTROL_FILE)));
4650
4651         errno = 0;
4652         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4653         {
4654                 /* if write didn't set errno, assume problem is no disk space */
4655                 if (errno == 0)
4656                         errno = ENOSPC;
4657                 ereport(PANIC,
4658                                 (errcode_for_file_access(),
4659                                  errmsg("could not write to control file: %m")));
4660         }
4661
4662         if (pg_fsync(fd) != 0)
4663                 ereport(PANIC,
4664                                 (errcode_for_file_access(),
4665                                  errmsg("could not fsync control file: %m")));
4666
4667         if (close(fd))
4668                 ereport(PANIC,
4669                                 (errcode_for_file_access(),
4670                                  errmsg("could not close control file: %m")));
4671 }
4672
4673 static void
4674 ReadControlFile(void)
4675 {
4676         pg_crc32        crc;
4677         int                     fd;
4678
4679         /*
4680          * Read data...
4681          */
4682         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4683                                            O_RDWR | PG_BINARY,
4684                                            S_IRUSR | S_IWUSR);
4685         if (fd < 0)
4686                 ereport(PANIC,
4687                                 (errcode_for_file_access(),
4688                                  errmsg("could not open control file \"%s\": %m",
4689                                                 XLOG_CONTROL_FILE)));
4690
4691         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4692                 ereport(PANIC,
4693                                 (errcode_for_file_access(),
4694                                  errmsg("could not read from control file: %m")));
4695
4696         close(fd);
4697
4698         /*
4699          * Check for expected pg_control format version.  If this is wrong, the
4700          * CRC check will likely fail because we'll be checking the wrong number
4701          * of bytes.  Complaining about wrong version will probably be more
4702          * enlightening than complaining about wrong CRC.
4703          */
4704
4705         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4706                 ereport(FATAL,
4707                                 (errmsg("database files are incompatible with server"),
4708                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4709                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4710                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4711                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4712                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4713
4714         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4715                 ereport(FATAL,
4716                                 (errmsg("database files are incompatible with server"),
4717                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4718                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4719                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4720                                  errhint("It looks like you need to initdb.")));
4721
4722         /* Now check the CRC. */
4723         INIT_CRC32(crc);
4724         COMP_CRC32(crc,
4725                            (char *) ControlFile,
4726                            offsetof(ControlFileData, crc));
4727         FIN_CRC32(crc);
4728
4729         if (!EQ_CRC32(crc, ControlFile->crc))
4730                 ereport(FATAL,
4731                                 (errmsg("incorrect checksum in control file")));
4732
4733         /*
4734          * Do compatibility checking immediately.  If the database isn't
4735          * compatible with the backend executable, we want to abort before we can
4736          * possibly do any damage.
4737          */
4738         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4739                 ereport(FATAL,
4740                                 (errmsg("database files are incompatible with server"),
4741                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4742                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4743                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4744                                  errhint("It looks like you need to initdb.")));
4745         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4746                 ereport(FATAL,
4747                                 (errmsg("database files are incompatible with server"),
4748                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4749                                          " but the server was compiled with MAXALIGN %d.",
4750                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4751                                  errhint("It looks like you need to initdb.")));
4752         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4753                 ereport(FATAL,
4754                                 (errmsg("database files are incompatible with server"),
4755                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4756                                  errhint("It looks like you need to initdb.")));
4757         if (ControlFile->blcksz != BLCKSZ)
4758                 ereport(FATAL,
4759                                 (errmsg("database files are incompatible with server"),
4760                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4761                                            " but the server was compiled with BLCKSZ %d.",
4762                                            ControlFile->blcksz, BLCKSZ),
4763                                  errhint("It looks like you need to recompile or initdb.")));
4764         if (ControlFile->relseg_size != RELSEG_SIZE)
4765                 ereport(FATAL,
4766                                 (errmsg("database files are incompatible with server"),
4767                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4768                                   " but the server was compiled with RELSEG_SIZE %d.",
4769                                   ControlFile->relseg_size, RELSEG_SIZE),
4770                                  errhint("It looks like you need to recompile or initdb.")));
4771         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4772                 ereport(FATAL,
4773                                 (errmsg("database files are incompatible with server"),
4774                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4775                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4776                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4777                                  errhint("It looks like you need to recompile or initdb.")));
4778         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4779                 ereport(FATAL,
4780                                 (errmsg("database files are incompatible with server"),
4781                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4782                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4783                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4784                                  errhint("It looks like you need to recompile or initdb.")));
4785         if (ControlFile->nameDataLen != NAMEDATALEN)
4786                 ereport(FATAL,
4787                                 (errmsg("database files are incompatible with server"),
4788                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4789                                   " but the server was compiled with NAMEDATALEN %d.",
4790                                   ControlFile->nameDataLen, NAMEDATALEN),
4791                                  errhint("It looks like you need to recompile or initdb.")));
4792         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4793                 ereport(FATAL,
4794                                 (errmsg("database files are incompatible with server"),
4795                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4796                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4797                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4798                                  errhint("It looks like you need to recompile or initdb.")));
4799         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4800                 ereport(FATAL,
4801                                 (errmsg("database files are incompatible with server"),
4802                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4803                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4804                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4805                                  errhint("It looks like you need to recompile or initdb.")));
4806
4807 #ifdef HAVE_INT64_TIMESTAMP
4808         if (ControlFile->enableIntTimes != true)
4809                 ereport(FATAL,
4810                                 (errmsg("database files are incompatible with server"),
4811                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4812                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4813                                  errhint("It looks like you need to recompile or initdb.")));
4814 #else
4815         if (ControlFile->enableIntTimes != false)
4816                 ereport(FATAL,
4817                                 (errmsg("database files are incompatible with server"),
4818                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4819                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4820                                  errhint("It looks like you need to recompile or initdb.")));
4821 #endif
4822
4823 #ifdef USE_FLOAT4_BYVAL
4824         if (ControlFile->float4ByVal != true)
4825                 ereport(FATAL,
4826                                 (errmsg("database files are incompatible with server"),
4827                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4828                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4829                                  errhint("It looks like you need to recompile or initdb.")));
4830 #else
4831         if (ControlFile->float4ByVal != false)
4832                 ereport(FATAL,
4833                                 (errmsg("database files are incompatible with server"),
4834                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4835                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4836                                  errhint("It looks like you need to recompile or initdb.")));
4837 #endif
4838
4839 #ifdef USE_FLOAT8_BYVAL
4840         if (ControlFile->float8ByVal != true)
4841                 ereport(FATAL,
4842                                 (errmsg("database files are incompatible with server"),
4843                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4844                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4845                                  errhint("It looks like you need to recompile or initdb.")));
4846 #else
4847         if (ControlFile->float8ByVal != false)
4848                 ereport(FATAL,
4849                                 (errmsg("database files are incompatible with server"),
4850                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4851                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4852                                  errhint("It looks like you need to recompile or initdb.")));
4853 #endif
4854
4855         /* Make the fixed  settings visible as GUC variables, too */
4856         SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4857                                         PGC_INTERNAL, PGC_S_OVERRIDE);
4858 }
4859
4860 void
4861 UpdateControlFile(void)
4862 {
4863         int                     fd;
4864
4865         INIT_CRC32(ControlFile->crc);
4866         COMP_CRC32(ControlFile->crc,
4867                            (char *) ControlFile,
4868                            offsetof(ControlFileData, crc));
4869         FIN_CRC32(ControlFile->crc);
4870
4871         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4872                                            O_RDWR | PG_BINARY,
4873                                            S_IRUSR | S_IWUSR);
4874         if (fd < 0)
4875                 ereport(PANIC,
4876                                 (errcode_for_file_access(),
4877                                  errmsg("could not open control file \"%s\": %m",
4878                                                 XLOG_CONTROL_FILE)));
4879
4880         errno = 0;
4881         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4882         {
4883                 /* if write didn't set errno, assume problem is no disk space */
4884                 if (errno == 0)
4885                         errno = ENOSPC;
4886                 ereport(PANIC,
4887                                 (errcode_for_file_access(),
4888                                  errmsg("could not write to control file: %m")));
4889         }
4890
4891         if (pg_fsync(fd) != 0)
4892                 ereport(PANIC,
4893                                 (errcode_for_file_access(),
4894                                  errmsg("could not fsync control file: %m")));
4895
4896         if (close(fd))
4897                 ereport(PANIC,
4898                                 (errcode_for_file_access(),
4899                                  errmsg("could not close control file: %m")));
4900 }
4901
4902 /*
4903  * Returns the unique system identifier from control file.
4904  */
4905 uint64
4906 GetSystemIdentifier(void)
4907 {
4908         Assert(ControlFile != NULL);
4909         return ControlFile->system_identifier;
4910 }
4911
4912 /*
4913  * Are checksums enabled for data pages?
4914  */
4915 bool
4916 DataChecksumsEnabled(void)
4917 {
4918         Assert(ControlFile != NULL);
4919         return (ControlFile->data_checksum_version > 0);
4920 }
4921
4922 /*
4923  * Returns a fake LSN for unlogged relations.
4924  *
4925  * Each call generates an LSN that is greater than any previous value
4926  * returned. The current counter value is saved and restored across clean
4927  * shutdowns, but like unlogged relations, does not survive a crash. This can
4928  * be used in lieu of real LSN values returned by XLogInsert, if you need an
4929  * LSN-like increasing sequence of numbers without writing any WAL.
4930  */
4931 XLogRecPtr
4932 GetFakeLSNForUnloggedRel(void)
4933 {
4934         XLogRecPtr      nextUnloggedLSN;
4935
4936         /* use volatile pointer to prevent code rearrangement */
4937         volatile XLogCtlData *xlogctl = XLogCtl;
4938
4939         /* increment the unloggedLSN counter, need SpinLock */
4940         SpinLockAcquire(&xlogctl->ulsn_lck);
4941         nextUnloggedLSN = xlogctl->unloggedLSN++;
4942         SpinLockRelease(&xlogctl->ulsn_lck);
4943
4944         return nextUnloggedLSN;
4945 }
4946
4947 /*
4948  * Auto-tune the number of XLOG buffers.
4949  *
4950  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4951  * a maximum of one XLOG segment (there is little reason to think that more
4952  * is helpful, at least so long as we force an fsync when switching log files)
4953  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4954  * 9.1, when auto-tuning was added).
4955  *
4956  * This should not be called until NBuffers has received its final value.
4957  */
4958 static int
4959 XLOGChooseNumBuffers(void)
4960 {
4961         int                     xbuffers;
4962
4963         xbuffers = NBuffers / 32;
4964         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
4965                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
4966         if (xbuffers < 8)
4967                 xbuffers = 8;
4968         return xbuffers;
4969 }
4970
4971 /*
4972  * GUC check_hook for wal_buffers
4973  */
4974 bool
4975 check_wal_buffers(int *newval, void **extra, GucSource source)
4976 {
4977         /*
4978          * -1 indicates a request for auto-tune.
4979          */
4980         if (*newval == -1)
4981         {
4982                 /*
4983                  * If we haven't yet changed the boot_val default of -1, just let it
4984                  * be.  We'll fix it when XLOGShmemSize is called.
4985                  */
4986                 if (XLOGbuffers == -1)
4987                         return true;
4988
4989                 /* Otherwise, substitute the auto-tune value */
4990                 *newval = XLOGChooseNumBuffers();
4991         }
4992
4993         /*
4994          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4995          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4996          * the case, we just silently treat such values as a request for the
4997          * minimum.  (We could throw an error instead, but that doesn't seem very
4998          * helpful.)
4999          */
5000         if (*newval < 4)
5001                 *newval = 4;
5002
5003         return true;
5004 }
5005
5006 /*
5007  * Initialization of shared memory for XLOG
5008  */
5009 Size
5010 XLOGShmemSize(void)
5011 {
5012         Size            size;
5013
5014         /*
5015          * If the value of wal_buffers is -1, use the preferred auto-tune value.
5016          * This isn't an amazingly clean place to do this, but we must wait till
5017          * NBuffers has received its final value, and must do it before using the
5018          * value of XLOGbuffers to do anything important.
5019          */
5020         if (XLOGbuffers == -1)
5021         {
5022                 char            buf[32];
5023
5024                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
5025                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
5026         }
5027         Assert(XLOGbuffers > 0);
5028
5029         /* XLogCtl */
5030         size = sizeof(XLogCtlData);
5031
5032         /* xlog insertion slots, plus alignment */
5033         size = add_size(size, mul_size(sizeof(XLogInsertSlotPadded), num_xloginsert_slots + 1));
5034         /* xlblocks array */
5035         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
5036         /* extra alignment padding for XLOG I/O buffers */
5037         size = add_size(size, XLOG_BLCKSZ);
5038         /* and the buffers themselves */
5039         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
5040
5041         /*
5042          * Note: we don't count ControlFileData, it comes out of the "slop factor"
5043          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
5044          * routine again below to compute the actual allocation size.
5045          */
5046
5047         return size;
5048 }
5049
5050 void
5051 XLOGShmemInit(void)
5052 {
5053         bool            foundCFile,
5054                                 foundXLog;
5055         char       *allocptr;
5056         int                     i;
5057
5058         ControlFile = (ControlFileData *)
5059                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5060         XLogCtl = (XLogCtlData *)
5061                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
5062
5063         if (foundCFile || foundXLog)
5064         {
5065                 /* both should be present or neither */
5066                 Assert(foundCFile && foundXLog);
5067                 return;
5068         }
5069         memset(XLogCtl, 0, sizeof(XLogCtlData));
5070
5071         /*
5072          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5073          * multiple of the alignment for same, so no extra alignment padding is
5074          * needed here.
5075          */
5076         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5077         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
5078         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
5079         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
5080
5081         /* Xlog insertion slots. Ensure they're aligned to the full padded size */
5082         allocptr += sizeof(XLogInsertSlotPadded) -
5083                 ((uintptr_t) allocptr) % sizeof(XLogInsertSlotPadded);
5084         XLogCtl->Insert.insertSlots = (XLogInsertSlotPadded *) allocptr;
5085         allocptr += sizeof(XLogInsertSlotPadded) * num_xloginsert_slots;
5086
5087         /*
5088          * Align the start of the page buffers to a full xlog block size boundary.
5089          * This simplifies some calculations in XLOG insertion. It is also required
5090          * for O_DIRECT.
5091          */
5092         allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
5093         XLogCtl->pages = allocptr;
5094         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5095
5096         /*
5097          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5098          * in additional info.)
5099          */
5100         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5101         XLogCtl->SharedRecoveryInProgress = true;
5102         XLogCtl->SharedHotStandbyActive = false;
5103         XLogCtl->WalWriterSleeping = false;
5104
5105         for (i = 0; i < num_xloginsert_slots; i++)
5106         {
5107                 XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
5108                 SpinLockInit(&slot->mutex);
5109                 slot->xlogInsertingAt = InvalidXLogRecPtr;
5110                 slot->owner = NULL;
5111
5112                 slot->releaseOK = true;
5113                 slot->exclusive = 0;
5114                 slot->head = NULL;
5115                 slot->tail = NULL;
5116         }
5117
5118         SpinLockInit(&XLogCtl->Insert.insertpos_lck);
5119         SpinLockInit(&XLogCtl->info_lck);
5120         SpinLockInit(&XLogCtl->ulsn_lck);
5121         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
5122
5123         /*
5124          * If we are not in bootstrap mode, pg_control should already exist. Read
5125          * and validate it immediately (see comments in ReadControlFile() for the
5126          * reasons why).
5127          */
5128         if (!IsBootstrapProcessingMode())
5129                 ReadControlFile();
5130 }
5131
5132 /*
5133  * This func must be called ONCE on system install.  It creates pg_control
5134  * and the initial XLOG segment.
5135  */
5136 void
5137 BootStrapXLOG(void)
5138 {
5139         CheckPoint      checkPoint;
5140         char       *buffer;
5141         XLogPageHeader page;
5142         XLogLongPageHeader longpage;
5143         XLogRecord *record;
5144         bool            use_existent;
5145         uint64          sysidentifier;
5146         struct timeval tv;
5147         pg_crc32        crc;
5148
5149         /*
5150          * Select a hopefully-unique system identifier code for this installation.
5151          * We use the result of gettimeofday(), including the fractional seconds
5152          * field, as being about as unique as we can easily get.  (Think not to
5153          * use random(), since it hasn't been seeded and there's no portable way
5154          * to seed it other than the system clock value...)  The upper half of the
5155          * uint64 value is just the tv_sec part, while the lower half is the XOR
5156          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
5157          * unnecessarily if "uint64" is really only 32 bits wide.  A person
5158          * knowing this encoding can determine the initialization time of the
5159          * installation, which could perhaps be useful sometimes.
5160          */
5161         gettimeofday(&tv, NULL);
5162         sysidentifier = ((uint64) tv.tv_sec) << 32;
5163         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
5164
5165         /* First timeline ID is always 1 */
5166         ThisTimeLineID = 1;
5167
5168         /* page buffer must be aligned suitably for O_DIRECT */
5169         buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5170         page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5171         memset(page, 0, XLOG_BLCKSZ);
5172
5173         /*
5174          * Set up information for the initial checkpoint record
5175          *
5176          * The initial checkpoint record is written to the beginning of the WAL
5177          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5178          * used, so that we can use 0/0 to mean "before any valid WAL segment".
5179          */
5180         checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
5181         checkPoint.ThisTimeLineID = ThisTimeLineID;
5182         checkPoint.PrevTimeLineID = ThisTimeLineID;
5183         checkPoint.fullPageWrites = fullPageWrites;
5184         checkPoint.nextXidEpoch = 0;
5185         checkPoint.nextXid = FirstNormalTransactionId;
5186         checkPoint.nextOid = FirstBootstrapObjectId;
5187         checkPoint.nextMulti = FirstMultiXactId;
5188         checkPoint.nextMultiOffset = 0;
5189         checkPoint.oldestXid = FirstNormalTransactionId;
5190         checkPoint.oldestXidDB = TemplateDbOid;
5191         checkPoint.oldestMulti = FirstMultiXactId;
5192         checkPoint.oldestMultiDB = TemplateDbOid;
5193         checkPoint.time = (pg_time_t) time(NULL);
5194         checkPoint.oldestActiveXid = InvalidTransactionId;
5195
5196         ShmemVariableCache->nextXid = checkPoint.nextXid;
5197         ShmemVariableCache->nextOid = checkPoint.nextOid;
5198         ShmemVariableCache->oidCount = 0;
5199         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5200         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5201         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
5202
5203         /* Set up the XLOG page header */
5204         page->xlp_magic = XLOG_PAGE_MAGIC;
5205         page->xlp_info = XLP_LONG_HEADER;
5206         page->xlp_tli = ThisTimeLineID;
5207         page->xlp_pageaddr = XLogSegSize;
5208         longpage = (XLogLongPageHeader) page;
5209         longpage->xlp_sysid = sysidentifier;
5210         longpage->xlp_seg_size = XLogSegSize;
5211         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5212
5213         /* Insert the initial checkpoint record */
5214         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
5215         record->xl_prev = 0;
5216         record->xl_xid = InvalidTransactionId;
5217         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
5218         record->xl_len = sizeof(checkPoint);
5219         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5220         record->xl_rmid = RM_XLOG_ID;
5221         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
5222
5223         INIT_CRC32(crc);
5224         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
5225         COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5226         FIN_CRC32(crc);
5227         record->xl_crc = crc;
5228
5229         /* Create first XLOG segment file */
5230         use_existent = false;
5231         openLogFile = XLogFileInit(1, &use_existent, false);
5232
5233         /* Write the first page with the initial record */
5234         errno = 0;
5235         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5236         {
5237                 /* if write didn't set errno, assume problem is no disk space */
5238                 if (errno == 0)
5239                         errno = ENOSPC;
5240                 ereport(PANIC,
5241                                 (errcode_for_file_access(),
5242                           errmsg("could not write bootstrap transaction log file: %m")));
5243         }
5244
5245         if (pg_fsync(openLogFile) != 0)
5246                 ereport(PANIC,
5247                                 (errcode_for_file_access(),
5248                           errmsg("could not fsync bootstrap transaction log file: %m")));
5249
5250         if (close(openLogFile))
5251                 ereport(PANIC,
5252                                 (errcode_for_file_access(),
5253                           errmsg("could not close bootstrap transaction log file: %m")));
5254
5255         openLogFile = -1;
5256
5257         /* Now create pg_control */
5258
5259         memset(ControlFile, 0, sizeof(ControlFileData));
5260         /* Initialize pg_control status fields */
5261         ControlFile->system_identifier = sysidentifier;
5262         ControlFile->state = DB_SHUTDOWNED;
5263         ControlFile->time = checkPoint.time;
5264         ControlFile->checkPoint = checkPoint.redo;
5265         ControlFile->checkPointCopy = checkPoint;
5266         ControlFile->unloggedLSN = 1;
5267
5268         /* Set important parameter values for use when replaying WAL */
5269         ControlFile->MaxConnections = MaxConnections;
5270         ControlFile->max_worker_processes = max_worker_processes;
5271         ControlFile->max_prepared_xacts = max_prepared_xacts;
5272         ControlFile->max_locks_per_xact = max_locks_per_xact;
5273         ControlFile->wal_level = wal_level;
5274         ControlFile->wal_log_hints = wal_log_hints;
5275         ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5276
5277         /* some additional ControlFile fields are set in WriteControlFile() */
5278
5279         WriteControlFile();
5280
5281         /* Bootstrap the commit log, too */
5282         BootStrapCLOG();
5283         BootStrapSUBTRANS();
5284         BootStrapMultiXact();
5285
5286         pfree(buffer);
5287 }
5288
5289 static char *
5290 str_time(pg_time_t tnow)
5291 {
5292         static char buf[128];
5293
5294         pg_strftime(buf, sizeof(buf),
5295                                 "%Y-%m-%d %H:%M:%S %Z",
5296                                 pg_localtime(&tnow, log_timezone));
5297
5298         return buf;
5299 }
5300
5301 /*
5302  * See if there is a recovery command file (recovery.conf), and if so
5303  * read in parameters for archive recovery and XLOG streaming.
5304  *
5305  * The file is parsed using the main configuration parser.
5306  */
5307 static void
5308 readRecoveryCommandFile(void)
5309 {
5310         FILE       *fd;
5311         TimeLineID      rtli = 0;
5312         bool            rtliGiven = false;
5313         ConfigVariable *item,
5314                            *head = NULL,
5315                            *tail = NULL;
5316
5317         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5318         if (fd == NULL)
5319         {
5320                 if (errno == ENOENT)
5321                         return;                         /* not there, so no archive recovery */
5322                 ereport(FATAL,
5323                                 (errcode_for_file_access(),
5324                                  errmsg("could not open recovery command file \"%s\": %m",
5325                                                 RECOVERY_COMMAND_FILE)));
5326         }
5327
5328         /*
5329          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5330          * no need to check the return value.
5331          */
5332         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5333
5334         FreeFile(fd);
5335
5336         for (item = head; item; item = item->next)
5337         {
5338                 if (strcmp(item->name, "restore_command") == 0)
5339                 {
5340                         recoveryRestoreCommand = pstrdup(item->value);
5341                         ereport(DEBUG2,
5342                                         (errmsg_internal("restore_command = '%s'",
5343                                                                          recoveryRestoreCommand)));
5344                 }
5345                 else if (strcmp(item->name, "recovery_end_command") == 0)
5346                 {
5347                         recoveryEndCommand = pstrdup(item->value);
5348                         ereport(DEBUG2,
5349                                         (errmsg_internal("recovery_end_command = '%s'",
5350                                                                          recoveryEndCommand)));
5351                 }
5352                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5353                 {
5354                         archiveCleanupCommand = pstrdup(item->value);
5355                         ereport(DEBUG2,
5356                                         (errmsg_internal("archive_cleanup_command = '%s'",
5357                                                                          archiveCleanupCommand)));
5358                 }
5359                 else if (strcmp(item->name, "pause_at_recovery_target") == 0)
5360                 {
5361                         if (!parse_bool(item->value, &recoveryPauseAtTarget))
5362                                 ereport(ERROR,
5363                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5364                                                  errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
5365                         ereport(DEBUG2,
5366                                         (errmsg_internal("pause_at_recovery_target = '%s'",
5367                                                                          item->value)));
5368                 }
5369                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5370                 {
5371                         rtliGiven = true;
5372                         if (strcmp(item->value, "latest") == 0)
5373                                 rtli = 0;
5374                         else
5375                         {
5376                                 errno = 0;
5377                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5378                                 if (errno == EINVAL || errno == ERANGE)
5379                                         ereport(FATAL,
5380                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5381                                                                         item->value)));
5382                         }
5383                         if (rtli)
5384                                 ereport(DEBUG2,
5385                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
5386                         else
5387                                 ereport(DEBUG2,
5388                                          (errmsg_internal("recovery_target_timeline = latest")));
5389                 }
5390                 else if (strcmp(item->name, "recovery_target_xid") == 0)
5391                 {
5392                         errno = 0;
5393                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5394                         if (errno == EINVAL || errno == ERANGE)
5395                                 ereport(FATAL,
5396                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
5397                                                  item->value)));
5398                         ereport(DEBUG2,
5399                                         (errmsg_internal("recovery_target_xid = %u",
5400                                                                          recoveryTargetXid)));
5401                         recoveryTarget = RECOVERY_TARGET_XID;
5402                 }
5403                 else if (strcmp(item->name, "recovery_target_time") == 0)
5404                 {
5405                         /*
5406                          * if recovery_target_xid or recovery_target_name specified, then
5407                          * this overrides recovery_target_time
5408                          */
5409                         if (recoveryTarget == RECOVERY_TARGET_XID ||
5410                                 recoveryTarget == RECOVERY_TARGET_NAME)
5411                                 continue;
5412                         recoveryTarget = RECOVERY_TARGET_TIME;
5413
5414                         /*
5415                          * Convert the time string given by the user to TimestampTz form.
5416                          */
5417                         recoveryTargetTime =
5418                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5419                                                                                                 CStringGetDatum(item->value),
5420                                                                                                 ObjectIdGetDatum(InvalidOid),
5421                                                                                                                 Int32GetDatum(-1)));
5422                         ereport(DEBUG2,
5423                                         (errmsg_internal("recovery_target_time = '%s'",
5424                                                                    timestamptz_to_str(recoveryTargetTime))));
5425                 }
5426                 else if (strcmp(item->name, "recovery_target_name") == 0)
5427                 {
5428                         /*
5429                          * if recovery_target_xid specified, then this overrides
5430                          * recovery_target_name
5431                          */
5432                         if (recoveryTarget == RECOVERY_TARGET_XID)
5433                                 continue;
5434                         recoveryTarget = RECOVERY_TARGET_NAME;
5435
5436                         recoveryTargetName = pstrdup(item->value);
5437                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5438                                 ereport(FATAL,
5439                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5440                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
5441                                                                 MAXFNAMELEN - 1)));
5442
5443                         ereport(DEBUG2,
5444                                         (errmsg_internal("recovery_target_name = '%s'",
5445                                                                          recoveryTargetName)));
5446                 }
5447                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5448                 {
5449                         /*
5450                          * does nothing if a recovery_target is not also set
5451                          */
5452                         if (!parse_bool(item->value, &recoveryTargetInclusive))
5453                                 ereport(ERROR,
5454                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5455                                                  errmsg("parameter \"%s\" requires a Boolean value",
5456                                                                 "recovery_target_inclusive")));
5457                         ereport(DEBUG2,
5458                                         (errmsg_internal("recovery_target_inclusive = %s",
5459                                                                          item->value)));
5460                 }
5461                 else if (strcmp(item->name, "standby_mode") == 0)
5462                 {
5463                         if (!parse_bool(item->value, &StandbyModeRequested))
5464                                 ereport(ERROR,
5465                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5466                                                  errmsg("parameter \"%s\" requires a Boolean value",
5467                                                                 "standby_mode")));
5468                         ereport(DEBUG2,
5469                                         (errmsg_internal("standby_mode = '%s'", item->value)));
5470                 }
5471                 else if (strcmp(item->name, "primary_conninfo") == 0)
5472                 {
5473                         PrimaryConnInfo = pstrdup(item->value);
5474                         ereport(DEBUG2,
5475                                         (errmsg_internal("primary_conninfo = '%s'",
5476                                                                          PrimaryConnInfo)));
5477                 }
5478                 else if (strcmp(item->name, "trigger_file") == 0)
5479                 {
5480                         TriggerFile = pstrdup(item->value);
5481                         ereport(DEBUG2,
5482                                         (errmsg_internal("trigger_file = '%s'",
5483                                                                          TriggerFile)));
5484                 }
5485                 else if (strcmp(item->name, "min_recovery_apply_delay") == 0)
5486                 {
5487                         const char *hintmsg;
5488
5489                         if (!parse_int(item->value, &min_recovery_apply_delay, GUC_UNIT_MS,
5490                                         &hintmsg))
5491                                 ereport(ERROR,
5492                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5493                                                  errmsg("parameter \"%s\" requires a temporal value", "min_recovery_apply_delay"),
5494                                                  hintmsg ? errhint("%s", _(hintmsg)) : 0));
5495                         ereport(DEBUG2,
5496                                         (errmsg("min_recovery_apply_delay = '%s'", item->value)));
5497                 }
5498                 else
5499                         ereport(FATAL,
5500                                         (errmsg("unrecognized recovery parameter \"%s\"",
5501                                                         item->name)));
5502         }
5503
5504         /*
5505          * Check for compulsory parameters
5506          */
5507         if (StandbyModeRequested)
5508         {
5509                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5510                         ereport(WARNING,
5511                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5512                                                         RECOVERY_COMMAND_FILE),
5513                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
5514         }
5515         else
5516         {
5517                 if (recoveryRestoreCommand == NULL)
5518                         ereport(FATAL,
5519                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5520                                                         RECOVERY_COMMAND_FILE)));
5521         }
5522
5523         /* Enable fetching from archive recovery area */
5524         ArchiveRecoveryRequested = true;
5525
5526         /*
5527          * If user specified recovery_target_timeline, validate it or compute the
5528          * "latest" value.      We can't do this until after we've gotten the restore
5529          * command and set InArchiveRecovery, because we need to fetch timeline
5530          * history files from the archive.
5531          */
5532         if (rtliGiven)
5533         {
5534                 if (rtli)
5535                 {
5536                         /* Timeline 1 does not have a history file, all else should */
5537                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5538                                 ereport(FATAL,
5539                                                 (errmsg("recovery target timeline %u does not exist",
5540                                                                 rtli)));
5541                         recoveryTargetTLI = rtli;
5542                         recoveryTargetIsLatest = false;
5543                 }
5544                 else
5545                 {
5546                         /* We start the "latest" search from pg_control's timeline */
5547                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5548                         recoveryTargetIsLatest = true;
5549                 }
5550         }
5551
5552         FreeConfigVariables(head);
5553 }
5554
5555 /*
5556  * Exit archive-recovery state
5557  */
5558 static void
5559 exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
5560 {
5561         char            recoveryPath[MAXPGPATH];
5562         char            xlogpath[MAXPGPATH];
5563
5564         /*
5565          * We are no longer in archive recovery state.
5566          */
5567         InArchiveRecovery = false;
5568
5569         /*
5570          * Update min recovery point one last time.
5571          */
5572         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5573
5574         /*
5575          * If the ending log segment is still open, close it (to avoid problems on
5576          * Windows with trying to rename or delete an open file).
5577          */
5578         if (readFile >= 0)
5579         {
5580                 close(readFile);
5581                 readFile = -1;
5582         }
5583
5584         /*
5585          * If we are establishing a new timeline, we have to copy data from the
5586          * last WAL segment of the old timeline to create a starting WAL segment
5587          * for the new timeline.
5588          *
5589          * Notify the archiver that the last WAL segment of the old timeline is
5590          * ready to copy to archival storage. Otherwise, it is not archived for a
5591          * while.
5592          */
5593         if (endTLI != ThisTimeLineID)
5594         {
5595                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo);
5596
5597                 if (XLogArchivingActive())
5598                 {
5599                         XLogFileName(xlogpath, endTLI, endLogSegNo);
5600                         XLogArchiveNotify(xlogpath);
5601                 }
5602         }
5603
5604         /*
5605          * Let's just make real sure there are not .ready or .done flags posted
5606          * for the new segment.
5607          */
5608         XLogFileName(xlogpath, ThisTimeLineID, endLogSegNo);
5609         XLogArchiveCleanup(xlogpath);
5610
5611         /*
5612          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5613          * of it.
5614          */
5615         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5616         unlink(recoveryPath);           /* ignore any error */
5617
5618         /* Get rid of any remaining recovered timeline-history file, too */
5619         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5620         unlink(recoveryPath);           /* ignore any error */
5621
5622         /*
5623          * Rename the config file out of the way, so that we don't accidentally
5624          * re-enter archive recovery mode in a subsequent crash.
5625          */
5626         unlink(RECOVERY_COMMAND_DONE);
5627         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5628                 ereport(FATAL,
5629                                 (errcode_for_file_access(),
5630                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5631                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5632
5633         ereport(LOG,
5634                         (errmsg("archive recovery complete")));
5635 }
5636
5637 /*
5638  * For point-in-time recovery, this function decides whether we want to
5639  * stop applying the XLOG at or after the current record.
5640  *
5641  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
5642  * *includeThis is set TRUE if we should apply this record before stopping.
5643  *
5644  * We also track the timestamp of the latest applied COMMIT/ABORT
5645  * record in XLogCtl->recoveryLastXTime, for logging purposes.
5646  * Also, some information is saved in recoveryStopXid et al for use in
5647  * annotating the new timeline's history file; and recoveryDelayUntilTime
5648  * is updated, for time-delayed standbys.
5649  */
5650 static bool
5651 recoveryStopsHere(XLogRecord *record, bool *includeThis, bool *delayThis)
5652 {
5653         bool            stopsHere;
5654         uint8           record_info;
5655         TimestampTz recordXtime;
5656         char            recordRPName[MAXFNAMELEN];
5657
5658         /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
5659         if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
5660                 return false;
5661         record_info = record->xl_info & ~XLR_INFO_MASK;
5662         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
5663         {
5664                 xl_xact_commit_compact *recordXactCommitData;
5665
5666                 recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
5667                 recordXtime = recordXactCommitData->xact_time;
5668
5669                 *delayThis = SetRecoveryDelayUntilTime(recordXactCommitData->xact_time);
5670         }
5671         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
5672         {
5673                 xl_xact_commit *recordXactCommitData;
5674
5675                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
5676                 recordXtime = recordXactCommitData->xact_time;
5677
5678                 *delayThis = SetRecoveryDelayUntilTime(recordXactCommitData->xact_time);
5679         }
5680         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
5681         {
5682                 xl_xact_abort *recordXactAbortData;
5683
5684                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
5685                 recordXtime = recordXactAbortData->xact_time;
5686
5687                 /*
5688                  * We deliberately choose not to delay aborts since they have no
5689                  * effect on MVCC. We already allow replay of records that don't
5690                  * have a timestamp, so there is already opportunity for issues
5691                  * caused by early conflicts on standbys.
5692                  */
5693         }
5694         else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5695         {
5696                 xl_restore_point *recordRestorePointData;
5697
5698                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5699                 recordXtime = recordRestorePointData->rp_time;
5700                 strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
5701
5702                 *delayThis = SetRecoveryDelayUntilTime(recordRestorePointData->rp_time);
5703         }
5704         else
5705                 return false;
5706
5707         /* Do we have a PITR target at all? */
5708         if (recoveryTarget == RECOVERY_TARGET_UNSET)
5709         {
5710                 /*
5711                  * Save timestamp of latest transaction commit/abort if this is a
5712                  * transaction record
5713                  */
5714                 if (record->xl_rmid == RM_XACT_ID)
5715                         SetLatestXTime(recordXtime);
5716                 return false;
5717         }
5718
5719         if (recoveryTarget == RECOVERY_TARGET_XID)
5720         {
5721                 /*
5722                  * There can be only one transaction end record with this exact
5723                  * transactionid
5724                  *
5725                  * when testing for an xid, we MUST test for equality only, since
5726                  * transactions are numbered in the order they start, not the order
5727                  * they complete. A higher numbered xid will complete before you about
5728                  * 50% of the time...
5729                  */
5730                 stopsHere = (record->xl_xid == recoveryTargetXid);
5731                 if (stopsHere)
5732                         *includeThis = recoveryTargetInclusive;
5733         }
5734         else if (recoveryTarget == RECOVERY_TARGET_NAME)
5735         {
5736                 /*
5737                  * There can be many restore points that share the same name, so we
5738                  * stop at the first one
5739                  */
5740                 stopsHere = (strcmp(recordRPName, recoveryTargetName) == 0);
5741
5742                 /*
5743                  * Ignore recoveryTargetInclusive because this is not a transaction
5744                  * record
5745                  */
5746                 *includeThis = false;
5747         }
5748         else
5749         {
5750                 /*
5751                  * There can be many transactions that share the same commit time, so
5752                  * we stop after the last one, if we are inclusive, or stop at the
5753                  * first one if we are exclusive
5754                  */
5755                 if (recoveryTargetInclusive)
5756                         stopsHere = (recordXtime > recoveryTargetTime);
5757                 else
5758                         stopsHere = (recordXtime >= recoveryTargetTime);
5759                 if (stopsHere)
5760                         *includeThis = false;
5761         }
5762
5763         if (stopsHere)
5764         {
5765                 recoveryStopXid = record->xl_xid;
5766                 recoveryStopTime = recordXtime;
5767                 recoveryStopAfter = *includeThis;
5768
5769                 if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5770                 {
5771                         if (recoveryStopAfter)
5772                                 ereport(LOG,
5773                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5774                                                                 recoveryStopXid,
5775                                                                 timestamptz_to_str(recoveryStopTime))));
5776                         else
5777                                 ereport(LOG,
5778                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
5779                                                                 recoveryStopXid,
5780                                                                 timestamptz_to_str(recoveryStopTime))));
5781                 }
5782                 else if (record_info == XLOG_XACT_ABORT)
5783                 {
5784                         if (recoveryStopAfter)
5785                                 ereport(LOG,
5786                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5787                                                                 recoveryStopXid,
5788                                                                 timestamptz_to_str(recoveryStopTime))));
5789                         else
5790                                 ereport(LOG,
5791                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
5792                                                                 recoveryStopXid,
5793                                                                 timestamptz_to_str(recoveryStopTime))));
5794                 }
5795                 else
5796                 {
5797                         strncpy(recoveryStopName, recordRPName, MAXFNAMELEN);
5798
5799                         ereport(LOG,
5800                                 (errmsg("recovery stopping at restore point \"%s\", time %s",
5801                                                 recoveryStopName,
5802                                                 timestamptz_to_str(recoveryStopTime))));
5803                 }
5804
5805                 /*
5806                  * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
5807                  * restore point since they are timestamped, though the latest
5808                  * transaction time is not updated.
5809                  */
5810                 if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
5811                         SetLatestXTime(recordXtime);
5812         }
5813         else if (record->xl_rmid == RM_XACT_ID)
5814                 SetLatestXTime(recordXtime);
5815
5816         return stopsHere;
5817 }
5818
5819 /*
5820  * Wait until shared recoveryPause flag is cleared.
5821  *
5822  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5823  * Probably not worth the trouble though.  This state shouldn't be one that
5824  * anyone cares about server power consumption in.
5825  */
5826 static void
5827 recoveryPausesHere(void)
5828 {
5829         /* Don't pause unless users can connect! */
5830         if (!LocalHotStandbyActive)
5831                 return;
5832
5833         ereport(LOG,
5834                         (errmsg("recovery has paused"),
5835                          errhint("Execute pg_xlog_replay_resume() to continue.")));
5836
5837         while (RecoveryIsPaused())
5838         {
5839                 pg_usleep(1000000L);    /* 1000 ms */
5840                 HandleStartupProcInterrupts();
5841         }
5842 }
5843
5844 bool
5845 RecoveryIsPaused(void)
5846 {
5847         /* use volatile pointer to prevent code rearrangement */
5848         volatile XLogCtlData *xlogctl = XLogCtl;
5849         bool            recoveryPause;
5850
5851         SpinLockAcquire(&xlogctl->info_lck);
5852         recoveryPause = xlogctl->recoveryPause;
5853         SpinLockRelease(&xlogctl->info_lck);
5854
5855         return recoveryPause;
5856 }
5857
5858 void
5859 SetRecoveryPause(bool recoveryPause)
5860 {
5861         /* use volatile pointer to prevent code rearrangement */
5862         volatile XLogCtlData *xlogctl = XLogCtl;
5863
5864         SpinLockAcquire(&xlogctl->info_lck);
5865         xlogctl->recoveryPause = recoveryPause;
5866         SpinLockRelease(&xlogctl->info_lck);
5867 }
5868
5869 static bool
5870 SetRecoveryDelayUntilTime(TimestampTz xtime)
5871 {
5872         if (min_recovery_apply_delay != 0)
5873         {
5874                 recoveryDelayUntilTime =
5875                         TimestampTzPlusMilliseconds(xtime, min_recovery_apply_delay);
5876
5877                 return true;
5878         }
5879
5880         return false;
5881 }
5882 /*
5883  * When min_recovery_apply_delay is set, we wait long enough to make sure
5884  * certain record types are applied at least that interval behind the master.
5885  * See recoveryStopsHere().
5886  *
5887  * Note that the delay is calculated between the WAL record log time and
5888  * the current time on standby. We would prefer to keep track of when this
5889  * standby received each WAL record, which would allow a more consistent
5890  * approach and one not affected by time synchronisation issues, but that
5891  * is significantly more effort and complexity for little actual gain in
5892  * usability.
5893  */
5894 static void
5895 recoveryApplyDelay(void)
5896 {
5897         while (true)
5898         {
5899                 long    secs;
5900                 int             microsecs;
5901
5902                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
5903
5904                 /* might change the trigger file's location */
5905                 HandleStartupProcInterrupts();
5906
5907                 if (CheckForStandbyTrigger())
5908                         break;
5909
5910                 /*
5911                  * Wait for difference between GetCurrentTimestamp() and
5912                  * recoveryDelayUntilTime
5913                  */
5914                 TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
5915                                                         &secs, &microsecs);
5916
5917                 if (secs <= 0 && microsecs <=0)
5918                         break;
5919
5920                 elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
5921                         secs, microsecs / 1000);
5922
5923                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
5924                                         WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
5925                                         secs * 1000L + microsecs / 1000);
5926         }
5927 }
5928
5929 /*
5930  * Save timestamp of latest processed commit/abort record.
5931  *
5932  * We keep this in XLogCtl, not a simple static variable, so that it can be
5933  * seen by processes other than the startup process.  Note in particular
5934  * that CreateRestartPoint is executed in the checkpointer.
5935  */
5936 static void
5937 SetLatestXTime(TimestampTz xtime)
5938 {
5939         /* use volatile pointer to prevent code rearrangement */
5940         volatile XLogCtlData *xlogctl = XLogCtl;
5941
5942         SpinLockAcquire(&xlogctl->info_lck);
5943         xlogctl->recoveryLastXTime = xtime;
5944         SpinLockRelease(&xlogctl->info_lck);
5945 }
5946
5947 /*
5948  * Fetch timestamp of latest processed commit/abort record.
5949  */
5950 TimestampTz
5951 GetLatestXTime(void)
5952 {
5953         /* use volatile pointer to prevent code rearrangement */
5954         volatile XLogCtlData *xlogctl = XLogCtl;
5955         TimestampTz xtime;
5956
5957         SpinLockAcquire(&xlogctl->info_lck);
5958         xtime = xlogctl->recoveryLastXTime;
5959         SpinLockRelease(&xlogctl->info_lck);
5960
5961         return xtime;
5962 }
5963
5964 /*
5965  * Save timestamp of the next chunk of WAL records to apply.
5966  *
5967  * We keep this in XLogCtl, not a simple static variable, so that it can be
5968  * seen by all backends.
5969  */
5970 static void
5971 SetCurrentChunkStartTime(TimestampTz xtime)
5972 {
5973         /* use volatile pointer to prevent code rearrangement */
5974         volatile XLogCtlData *xlogctl = XLogCtl;
5975
5976         SpinLockAcquire(&xlogctl->info_lck);
5977         xlogctl->currentChunkStartTime = xtime;
5978         SpinLockRelease(&xlogctl->info_lck);
5979 }
5980
5981 /*
5982  * Fetch timestamp of latest processed commit/abort record.
5983  * Startup process maintains an accurate local copy in XLogReceiptTime
5984  */
5985 TimestampTz
5986 GetCurrentChunkReplayStartTime(void)
5987 {
5988         /* use volatile pointer to prevent code rearrangement */
5989         volatile XLogCtlData *xlogctl = XLogCtl;
5990         TimestampTz xtime;
5991
5992         SpinLockAcquire(&xlogctl->info_lck);
5993         xtime = xlogctl->currentChunkStartTime;
5994         SpinLockRelease(&xlogctl->info_lck);
5995
5996         return xtime;
5997 }
5998
5999 /*
6000  * Returns time of receipt of current chunk of XLOG data, as well as
6001  * whether it was received from streaming replication or from archives.
6002  */
6003 void
6004 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
6005 {
6006         /*
6007          * This must be executed in the startup process, since we don't export the
6008          * relevant state to shared memory.
6009          */
6010         Assert(InRecovery);
6011
6012         *rtime = XLogReceiptTime;
6013         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
6014 }
6015
/*
 * Raise an ERROR if an integer setting on this server (second argument) is
 * lower than the required minimum (third argument).  Both values are
 * expanded twice, once in the comparison and once in the message, so pass
 * side-effect-free expressions.
 *
 * Note that text field supplied is a parameter name and does not require
 * translation
 */
#define RecoveryRequiresIntParameter(settingName, localValue, requiredValue) \
do { \
        if ((localValue) < (requiredValue)) \
                ereport(ERROR, \
                                (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
                                 errmsg("hot standby is not possible because " \
                                                "%s = %d is a lower setting than on the master server " \
                                                "(its value was %d)", \
                                                settingName, \
                                                localValue, \
                                                requiredValue))); \
} while(0)
6032
6033 /*
6034  * Check to see if required parameters are set high enough on this server
6035  * for various aspects of recovery operation.
6036  */
6037 static void
6038 CheckRequiredParameterValues(void)
6039 {
6040         /*
6041          * For archive recovery, the WAL must be generated with at least 'archive'
6042          * wal_level.
6043          */
6044         if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
6045         {
6046                 ereport(WARNING,
6047                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
6048                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
6049         }
6050
6051         /*
6052          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
6053          * we must have at least as many backend slots as the primary.
6054          */
6055         if (InArchiveRecovery && EnableHotStandby)
6056         {
6057                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
6058                         ereport(ERROR,
6059                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" or higher on the master server"),
6060                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
6061
6062                 /* We ignore autovacuum_max_workers when we make this test. */
6063                 RecoveryRequiresIntParameter("max_connections",
6064                                                                          MaxConnections,
6065                                                                          ControlFile->MaxConnections);
6066                 RecoveryRequiresIntParameter("max_worker_processes",
6067                                                                          max_worker_processes,
6068                                                                          ControlFile->max_worker_processes);
6069                 RecoveryRequiresIntParameter("max_prepared_transactions",
6070                                                                          max_prepared_xacts,
6071                                                                          ControlFile->max_prepared_xacts);
6072                 RecoveryRequiresIntParameter("max_locks_per_transaction",
6073                                                                          max_locks_per_xact,
6074                                                                          ControlFile->max_locks_per_xact);
6075         }
6076 }
6077
6078 /*
6079  * This must be called ONCE during postmaster or standalone-backend startup
6080  */
6081 void
6082 StartupXLOG(void)
6083 {
6084         XLogCtlInsert *Insert;
6085         CheckPoint      checkPoint;
6086         bool            wasShutdown;
6087         bool            reachedStopPoint = false;
6088         bool            haveBackupLabel = false;
6089         XLogRecPtr      RecPtr,
6090                                 checkPointLoc,
6091                                 EndOfLog;
6092         XLogSegNo       endLogSegNo;
6093         TimeLineID      PrevTimeLineID;
6094         XLogRecord *record;
6095         TransactionId oldestActiveXID;
6096         bool            backupEndRequired = false;
6097         bool            backupFromStandby = false;
6098         DBState         dbstate_at_startup;
6099         XLogReaderState *xlogreader;
6100         XLogPageReadPrivate private;
6101         bool            fast_promoted = false;
6102
6103         /*
6104          * Read control file and check XLOG status looks valid.
6105          *
6106          * Note: in most control paths, *ControlFile is already valid and we need
6107          * not do ReadControlFile() here, but might as well do it to be sure.
6108          */
6109         ReadControlFile();
6110
6111         if (ControlFile->state < DB_SHUTDOWNED ||
6112                 ControlFile->state > DB_IN_PRODUCTION ||
6113                 !XRecOffIsValid(ControlFile->checkPoint))
6114                 ereport(FATAL,
6115                                 (errmsg("control file contains invalid data")));
6116
6117         if (ControlFile->state == DB_SHUTDOWNED)
6118         {
6119                 /* This is the expected case, so don't be chatty in standalone mode */
6120                 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6121                                 (errmsg("database system was shut down at %s",
6122                                                 str_time(ControlFile->time))));
6123         }
6124         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6125                 ereport(LOG,
6126                                 (errmsg("database system was shut down in recovery at %s",
6127                                                 str_time(ControlFile->time))));
6128         else if (ControlFile->state == DB_SHUTDOWNING)
6129                 ereport(LOG,
6130                                 (errmsg("database system shutdown was interrupted; last known up at %s",
6131                                                 str_time(ControlFile->time))));
6132         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6133                 ereport(LOG,
6134                    (errmsg("database system was interrupted while in recovery at %s",
6135                                    str_time(ControlFile->time)),
6136                         errhint("This probably means that some data is corrupted and"
6137                                         " you will have to use the last backup for recovery.")));
6138         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6139                 ereport(LOG,
6140                                 (errmsg("database system was interrupted while in recovery at log time %s",
6141                                                 str_time(ControlFile->checkPointCopy.time)),
6142                                  errhint("If this has occurred more than once some data might be corrupted"
6143                           " and you might need to choose an earlier recovery target.")));
6144         else if (ControlFile->state == DB_IN_PRODUCTION)
6145                 ereport(LOG,
6146                           (errmsg("database system was interrupted; last known up at %s",
6147                                           str_time(ControlFile->time))));
6148
6149         /* This is just to allow attaching to startup process with a debugger */
6150 #ifdef XLOG_REPLAY_DELAY
6151         if (ControlFile->state != DB_SHUTDOWNED)
6152                 pg_usleep(60000000L);
6153 #endif
6154
6155         /*
6156          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
6157          * someone has performed a copy for PITR, these directories may have been
6158          * excluded and need to be re-created.
6159          */
6160         ValidateXLOGDirectoryStructure();
6161
6162         /*
6163          * Clear out any old relcache cache files.      This is *necessary* if we do
6164          * any WAL replay, since that would probably result in the cache files
6165          * being out of sync with database reality.  In theory we could leave them
6166          * in place if the database had been cleanly shut down, but it seems
6167          * safest to just remove them always and let them be rebuilt during the
6168          * first backend startup.
6169          */
6170         RelationCacheInitFileRemove();
6171
6172         /*
6173          * Initialize on the assumption we want to recover to the latest timeline
6174          * that's active according to pg_control.
6175          */
6176         if (ControlFile->minRecoveryPointTLI >
6177                 ControlFile->checkPointCopy.ThisTimeLineID)
6178                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6179         else
6180                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6181
6182         /*
6183          * Check for recovery control file, and if so set up state for offline
6184          * recovery
6185          */
6186         readRecoveryCommandFile();
6187
6188         /*
6189          * Save archive_cleanup_command in shared memory so that other processes
6190          * can see it.
6191          */
6192         strncpy(XLogCtl->archiveCleanupCommand,
6193                         archiveCleanupCommand ? archiveCleanupCommand : "",
6194                         sizeof(XLogCtl->archiveCleanupCommand));
6195
6196         if (ArchiveRecoveryRequested)
6197         {
6198                 if (StandbyModeRequested)
6199                         ereport(LOG,
6200                                         (errmsg("entering standby mode")));
6201                 else if (recoveryTarget == RECOVERY_TARGET_XID)
6202                         ereport(LOG,
6203                                         (errmsg("starting point-in-time recovery to XID %u",
6204                                                         recoveryTargetXid)));
6205                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6206                         ereport(LOG,
6207                                         (errmsg("starting point-in-time recovery to %s",
6208                                                         timestamptz_to_str(recoveryTargetTime))));
6209                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6210                         ereport(LOG,
6211                                         (errmsg("starting point-in-time recovery to \"%s\"",
6212                                                         recoveryTargetName)));
6213                 else
6214                         ereport(LOG,
6215                                         (errmsg("starting archive recovery")));
6216         }
6217
6218         /*
6219          * Take ownership of the wakeup latch if we're going to sleep during
6220          * recovery.
6221          */
6222         if (StandbyModeRequested)
6223                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6224
6225         /* Set up XLOG reader facility */
6226         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6227         xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
6228         if (!xlogreader)
6229                 ereport(ERROR,
6230                                 (errcode(ERRCODE_OUT_OF_MEMORY),
6231                                  errmsg("out of memory"),
6232                         errdetail("Failed while allocating an XLog reading processor.")));
6233         xlogreader->system_identifier = ControlFile->system_identifier;
6234
6235         if (read_backup_label(&checkPointLoc, &backupEndRequired,
6236                                                   &backupFromStandby))
6237         {
6238                 /*
6239                  * Archive recovery was requested, and thanks to the backup label
6240                  * file, we know how far we need to replay to reach consistency. Enter
6241                  * archive recovery directly.
6242                  */
6243                 InArchiveRecovery = true;
6244                 if (StandbyModeRequested)
6245                         StandbyMode = true;
6246
6247                 /*
6248                  * When a backup_label file is present, we want to roll forward from
6249                  * the checkpoint it identifies, rather than using pg_control.
6250                  */
6251                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6252                 if (record != NULL)
6253                 {
6254                         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6255                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6256                         ereport(DEBUG1,
6257                                         (errmsg("checkpoint record is at %X/%X",
6258                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6259                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
6260
6261                         /*
6262                          * Make sure that REDO location exists. This may not be the case
6263                          * if there was a crash during an online backup, which left a
6264                          * backup_label around that references a WAL segment that's
6265                          * already been archived.
6266                          */
6267                         if (checkPoint.redo < checkPointLoc)
6268                         {
6269                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6270                                         ereport(FATAL,
6271                                                         (errmsg("could not find redo location referenced by checkpoint record"),
6272                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6273                         }
6274                 }
6275                 else
6276                 {
6277                         ereport(FATAL,
6278                                         (errmsg("could not locate required checkpoint record"),
6279                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6280                         wasShutdown = false;    /* keep compiler quiet */
6281                 }
6282                 /* set flag to delete it later */
6283                 haveBackupLabel = true;
6284         }
6285         else
6286         {
6287                 /*
6288                  * It's possible that archive recovery was requested, but we don't
6289                  * know how far we need to replay the WAL before we reach consistency.
6290                  * This can happen for example if a base backup is taken from a
6291                  * running server using an atomic filesystem snapshot, without calling
6292                  * pg_start/stop_backup. Or if you just kill a running master server
6293                  * and put it into archive recovery by creating a recovery.conf file.
6294                  *
6295                  * Our strategy in that case is to perform crash recovery first,
6296                  * replaying all the WAL present in pg_xlog, and only enter archive
6297                  * recovery after that.
6298                  *
6299                  * But usually we already know how far we need to replay the WAL (up
6300                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
6301                  * end-of-backup record), and we can enter archive recovery directly.
6302                  */
6303                 if (ArchiveRecoveryRequested &&
6304                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6305                          ControlFile->backupEndRequired ||
6306                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6307                          ControlFile->state == DB_SHUTDOWNED))
6308                 {
6309                         InArchiveRecovery = true;
6310                         if (StandbyModeRequested)
6311                                 StandbyMode = true;
6312                 }
6313
6314                 /*
6315                  * Get the last valid checkpoint record.  If the latest one according
6316                  * to pg_control is broken, try the next-to-last one.
6317                  */
6318                 checkPointLoc = ControlFile->checkPoint;
6319                 RedoStartLSN = ControlFile->checkPointCopy.redo;
6320                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6321                 if (record != NULL)
6322                 {
6323                         ereport(DEBUG1,
6324                                         (errmsg("checkpoint record is at %X/%X",
6325                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6326                 }
6327                 else if (StandbyMode)
6328                 {
6329                         /*
6330                          * The last valid checkpoint record required for a streaming
6331                          * recovery exists in neither standby nor the primary.
6332                          */
6333                         ereport(PANIC,
6334                                         (errmsg("could not locate a valid checkpoint record")));
6335                 }
6336                 else
6337                 {
6338                         checkPointLoc = ControlFile->prevCheckPoint;
6339                         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
6340                         if (record != NULL)
6341                         {
6342                                 ereport(LOG,
6343                                                 (errmsg("using previous checkpoint record at %X/%X",
6344                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6345                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
6346                         }
6347                         else
6348                                 ereport(PANIC,
6349                                          (errmsg("could not locate a valid checkpoint record")));
6350                 }
6351                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6352                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6353         }
6354
6355         /*
6356          * If the location of the checkpoint record is not on the expected
6357          * timeline in the history of the requested timeline, we cannot proceed:
6358          * the backup is not part of the history of the requested timeline.
6359          */
6360         Assert(expectedTLEs);           /* was initialized by reading checkpoint
6361                                                                  * record */
6362         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6363                 checkPoint.ThisTimeLineID)
6364         {
6365                 XLogRecPtr      switchpoint;
6366
6367                 /*
6368                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
6369                  * not in expectedTLEs at all.
6370                  */
6371                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6372                 ereport(FATAL,
6373                                 (errmsg("requested timeline %u is not a child of this server's history",
6374                                                 recoveryTargetTLI),
6375                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6376                                                    (uint32) (ControlFile->checkPoint >> 32),
6377                                                    (uint32) ControlFile->checkPoint,
6378                                                    ControlFile->checkPointCopy.ThisTimeLineID,
6379                                                    (uint32) (switchpoint >> 32),
6380                                                    (uint32) switchpoint)));
6381         }
6382
6383         /*
6384          * The min recovery point should be part of the requested timeline's
6385          * history, too.
6386          */
6387         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6388           tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6389                 ControlFile->minRecoveryPointTLI)
6390                 ereport(FATAL,
6391                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6392                                                 recoveryTargetTLI,
6393                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
6394                                                 (uint32) ControlFile->minRecoveryPoint,
6395                                                 ControlFile->minRecoveryPointTLI)));
6396
6397         LastRec = RecPtr = checkPointLoc;
6398
6399         ereport(DEBUG1,
6400                         (errmsg("redo record is at %X/%X; shutdown %s",
6401                                   (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6402                                         wasShutdown ? "TRUE" : "FALSE")));
6403         ereport(DEBUG1,
6404                         (errmsg("next transaction ID: %u/%u; next OID: %u",
6405                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
6406                                         checkPoint.nextOid)));
6407         ereport(DEBUG1,
6408                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
6409                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6410         ereport(DEBUG1,
6411                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
6412                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
6413         ereport(DEBUG1,
6414                         (errmsg("oldest MultiXactId: %u, in database %u",
6415                                         checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6416         if (!TransactionIdIsNormal(checkPoint.nextXid))
6417                 ereport(PANIC,
6418                                 (errmsg("invalid next transaction ID")));
6419
6420         /* initialize shared memory variables from the checkpoint record */
6421         ShmemVariableCache->nextXid = checkPoint.nextXid;
6422         ShmemVariableCache->nextOid = checkPoint.nextOid;
6423         ShmemVariableCache->oidCount = 0;
6424         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6425         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6426         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
6427         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6428         XLogCtl->ckptXid = checkPoint.nextXid;
6429
6430         /*
6431          * Startup MultiXact.  We need to do this early for two reasons: one
6432          * is that we might try to access multixacts when we do tuple freezing,
6433          * and the other is we need its state initialized because we attempt
6434          * truncation during restartpoints.
6435          */
6436         StartupMultiXact();
6437
6438         /*
6439          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6440          * control file. On recovery, all unlogged relations are blown away, so
6441          * the unlogged LSN counter can be reset too.
6442          */
6443         if (ControlFile->state == DB_SHUTDOWNED)
6444                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6445         else
6446                 XLogCtl->unloggedLSN = 1;
6447
6448         /*
6449          * We must replay WAL entries using the same TimeLineID they were created
6450          * under, so temporarily adopt the TLI indicated by the checkpoint (see
6451          * also xlog_redo()).
6452          */
6453         ThisTimeLineID = checkPoint.ThisTimeLineID;
6454
6455         /*
6456          * Copy any missing timeline history files between 'now' and the recovery
6457          * target timeline from archive to pg_xlog. While we don't need those
6458          * files ourselves - the history file of the recovery target timeline
6459          * covers all the previous timelines in the history too - a cascading
6460          * standby server might be interested in them. Or, if you archive the WAL
6461          * from this server to a different archive than the master, it'd be good
6462          * for all the history files to get archived there after failover, so that
6463          * you can use one of the old timelines as a PITR target. Timeline history
6464          * files are small, so it's better to copy them unnecessarily than not
6465          * copy them and regret it later.
6466          */
6467         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6468
6469         lastFullPageWrites = checkPoint.fullPageWrites;
6470
6471         RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6472
6473         if (RecPtr < checkPoint.redo)
6474                 ereport(PANIC,
6475                                 (errmsg("invalid redo in checkpoint record")));
6476
6477         /*
6478          * Check whether we need to force recovery from WAL.  If it appears to
6479          * have been a clean shutdown and we did not have a recovery.conf file,
6480          * then assume no recovery needed.
6481          */
6482         if (checkPoint.redo < RecPtr)
6483         {
6484                 if (wasShutdown)
6485                         ereport(PANIC,
6486                                         (errmsg("invalid redo record in shutdown checkpoint")));
6487                 InRecovery = true;
6488         }
6489         else if (ControlFile->state != DB_SHUTDOWNED)
6490                 InRecovery = true;
6491         else if (ArchiveRecoveryRequested)
6492         {
6493                 /* force recovery due to presence of recovery.conf */
6494                 InRecovery = true;
6495         }
6496
6497         /* REDO */
6498         if (InRecovery)
6499         {
6500                 int                     rmid;
6501
6502                 /* use volatile pointer to prevent code rearrangement */
6503                 volatile XLogCtlData *xlogctl = XLogCtl;
6504
6505                 /*
6506                  * Update pg_control to show that we are recovering and to show the
6507                  * selected checkpoint as the place we are starting from. We also mark
6508                  * pg_control with any minimum recovery stop point obtained from a
6509                  * backup history file.
6510                  */
6511                 dbstate_at_startup = ControlFile->state;
6512                 if (InArchiveRecovery)
6513                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6514                 else
6515                 {
6516                         ereport(LOG,
6517                                         (errmsg("database system was not properly shut down; "
6518                                                         "automatic recovery in progress")));
6519                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6520                                 ereport(LOG,
6521                                                 (errmsg("crash recovery starts in timeline %u "
6522                                                                 "and has target timeline %u",
6523                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
6524                                                                 recoveryTargetTLI)));
6525                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6526                 }
6527                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6528                 ControlFile->checkPoint = checkPointLoc;
6529                 ControlFile->checkPointCopy = checkPoint;
6530                 if (InArchiveRecovery)
6531                 {
6532                         /* initialize minRecoveryPoint if not set yet */
6533                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
6534                         {
6535                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6536                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6537                         }
6538                 }
6539
6540                 /*
6541                  * Set backupStartPoint if we're starting recovery from a base backup.
6542                  *
6543                  * Set backupEndPoint and use minRecoveryPoint as the backup end
6544                  * location if we're starting recovery from a base backup which was
6545                  * taken from the standby. In this case, the database system status in
6546                  * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If not, the
6547                  * backup is corrupted, so we cancel recovery.
6548                  */
6549                 if (haveBackupLabel)
6550                 {
6551                         ControlFile->backupStartPoint = checkPoint.redo;
6552                         ControlFile->backupEndRequired = backupEndRequired;
6553
6554                         if (backupFromStandby)
6555                         {
6556                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
6557                                         ereport(FATAL,
6558                                                         (errmsg("backup_label contains data inconsistent with control file"),
6559                                                          errhint("This means that the backup is corrupted and you will "
6560                                                            "have to use another backup for recovery.")));
6561                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6562                         }
6563                 }
6564                 ControlFile->time = (pg_time_t) time(NULL);
6565                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6566                 UpdateControlFile();
6567
6568                 /* initialize our local copy of minRecoveryPoint */
6569                 minRecoveryPoint = ControlFile->minRecoveryPoint;
6570                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6571
6572                 /*
6573                  * Reset pgstat data, because it may be invalid after recovery.
6574                  */
6575                 pgstat_reset_all();
6576
6577                 /*
6578                  * If there was a backup label file, it's done its job and the info
6579                  * has now been propagated into pg_control.  We must get rid of the
6580                  * label file so that if we crash during recovery, we'll pick up at
6581                  * the latest recovery restartpoint instead of going all the way back
6582                  * to the backup start point.  It seems prudent though to just rename
6583                  * the file out of the way rather than delete it completely.
6584                  */
6585                 if (haveBackupLabel)
6586                 {
6587                         unlink(BACKUP_LABEL_OLD);
6588                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
6589                                 ereport(FATAL,
6590                                                 (errcode_for_file_access(),
6591                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
6592                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
6593                 }
6594
6595                 /* Check that the GUCs used to generate the WAL allow recovery */
6596                 CheckRequiredParameterValues();
6597
6598                 /*
6599                  * We're in recovery, so unlogged relations may be trashed and must be
6600                  * reset.  This should be done BEFORE allowing Hot Standby
6601                  * connections, so that read-only backends don't try to read whatever
6602                  * garbage is left over from before.
6603                  */
6604                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6605
6606                 /*
6607                  * Likewise, delete any saved transaction snapshot files that got left
6608                  * behind by crashed backends.
6609                  */
6610                 DeleteAllExportedSnapshotFiles();
6611
6612                 /*
6613                  * Initialize for Hot Standby, if enabled. We won't let backends in
6614                  * yet, not until we've reached the min recovery point specified in
6615                  * control file and we've established a recovery snapshot from a
6616                  * running-xacts WAL record.
6617                  */
6618                 if (ArchiveRecoveryRequested && EnableHotStandby)
6619                 {
6620                         TransactionId *xids;
6621                         int                     nxids;
6622
6623                         ereport(DEBUG1,
6624                                         (errmsg("initializing for hot standby")));
6625
6626                         InitRecoveryTransactionEnvironment();
6627
6628                         if (wasShutdown)
6629                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6630                         else
6631                                 oldestActiveXID = checkPoint.oldestActiveXid;
6632                         Assert(TransactionIdIsValid(oldestActiveXID));
6633
6634                         /* Tell procarray about the range of xids it has to deal with */
6635                         ProcArrayInitRecovery(ShmemVariableCache->nextXid);
6636
6637                         /*
6638                          * Startup commit log and subtrans only. MultiXact has already
6639                          * been started up and other SLRUs are not maintained during
6640                          * recovery and need not be started yet.
6641                          */
6642                         StartupCLOG();
6643                         StartupSUBTRANS(oldestActiveXID);
6644
6645                         /*
6646                          * If we're beginning at a shutdown checkpoint, we know that
6647                          * nothing was running on the master at this point. So fake-up an
6648                          * empty running-xacts record and use that here and now. Recover
6649                          * additional standby state for prepared transactions.
6650                          */
6651                         if (wasShutdown)
6652                         {
6653                                 RunningTransactionsData running;
6654                                 TransactionId latestCompletedXid;
6655
6656                                 /*
6657                                  * Construct a RunningTransactions snapshot representing a
6658                                  * shut down server, with only prepared transactions still
6659                                  * alive. We're never overflowed at this point because all
6660                                  * subxids are listed with their parent prepared transactions.
6661                                  */
6662                                 running.xcnt = nxids;
6663                                 running.subxcnt = 0;
6664                                 running.subxid_overflow = false;
6665                                 running.nextXid = checkPoint.nextXid;
6666                                 running.oldestRunningXid = oldestActiveXID;
6667                                 latestCompletedXid = checkPoint.nextXid;
6668                                 TransactionIdRetreat(latestCompletedXid);
6669                                 Assert(TransactionIdIsNormal(latestCompletedXid));
6670                                 running.latestCompletedXid = latestCompletedXid;
6671                                 running.xids = xids;
6672
6673                                 ProcArrayApplyRecoveryInfo(&running);
6674
6675                                 StandbyRecoverPreparedTransactions(false);
6676                         }
6677                 }
6678
6679                 /* Initialize resource managers */
6680                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6681                 {
6682                         if (RmgrTable[rmid].rm_startup != NULL)
6683                                 RmgrTable[rmid].rm_startup();
6684                 }
6685
6686                 /*
6687                  * Initialize shared replayEndRecPtr, lastReplayedEndRecPtr, and
6688                  * recoveryLastXTime.
6689                  *
6690                  * This is slightly confusing if we're starting from an online
6691                  * checkpoint; we've just read and replayed the checkpoint record, but
6692                  * we're going to start replay from its redo pointer, which precedes
6693                  * the location of the checkpoint record itself. So even though the
6694                  * last record we've replayed is indeed ReadRecPtr, we haven't
6695                  * replayed all the preceding records yet. That's OK for the current
6696                  * use of these variables.
6697                  */
6698                 SpinLockAcquire(&xlogctl->info_lck);
6699                 xlogctl->replayEndRecPtr = ReadRecPtr;
6700                 xlogctl->replayEndTLI = ThisTimeLineID;
6701                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
6702                 xlogctl->lastReplayedTLI = ThisTimeLineID;
6703                 xlogctl->recoveryLastXTime = 0;
6704                 xlogctl->currentChunkStartTime = 0;
6705                 xlogctl->recoveryPause = false;
6706                 SpinLockRelease(&xlogctl->info_lck);
6707
6708                 /* Also ensure XLogReceiptTime has a sane value */
6709                 XLogReceiptTime = GetCurrentTimestamp();
6710
6711                 /*
6712                  * Let postmaster know we've started redo now, so that it can launch
6713                  * checkpointer to perform restartpoints.  We don't bother during
6714                  * crash recovery as restartpoints can only be performed during
6715                  * archive recovery.  And we'd like to keep crash recovery simple, to
6716                  * avoid introducing bugs that could affect you when recovering after
6717                  * crash.
6718                  *
6719                  * After this point, we can no longer assume that we're the only
6720                  * process in addition to postmaster!  Also, fsync requests are
6721                  * subsequently to be handled by the checkpointer, not locally.
6722                  */
6723                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
6724                 {
6725                         PublishStartupProcessInformation();
6726                         SetForwardFsyncRequests();
6727                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6728                         bgwriterLaunched = true;
6729                 }
6730
6731                 /*
6732                  * Allow read-only connections immediately if we're consistent
6733                  * already.
6734                  */
6735                 CheckRecoveryConsistency();
6736
6737                 /*
6738                  * Find the first record that logically follows the checkpoint --- it
6739                  * might physically precede it, though.
6740                  */
6741                 if (checkPoint.redo < RecPtr)
6742                 {
6743                         /* back up to find the record */
6744                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
6745                 }
6746                 else
6747                 {
6748                         /* just have to read next record after CheckPoint */
6749                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6750                 }
6751
6752                 if (record != NULL)
6753                 {
6754                         bool            recoveryContinue = true;
6755                         bool            recoveryApply = true;
6756                         bool            recoveryDelay = false;
6757                         ErrorContextCallback errcallback;
6758                         TimestampTz xtime;
6759
6760                         InRedo = true;
6761
6762                         ereport(LOG,
6763                                         (errmsg("redo starts at %X/%X",
6764                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6765
6766                         /*
6767                          * main redo apply loop
6768                          */
6769                         do
6770                         {
6771                                 bool            switchedTLI = false;
6772
6773 #ifdef WAL_DEBUG
6774                                 if (XLOG_DEBUG ||
6775                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6776                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6777                                 {
6778                                         StringInfoData buf;
6779
6780                                         initStringInfo(&buf);
6781                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6782                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
6783                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
6784                                         xlog_outrec(&buf, record);
6785                                         appendStringInfoString(&buf, " - ");
6786                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
6787                                                                                                            record->xl_info,
6788                                                                                                          XLogRecGetData(record));
6789                                         elog(LOG, "%s", buf.data);
6790                                         pfree(buf.data);
6791                                 }
6792 #endif
6793
6794                                 /* Handle interrupt signals of startup process */
6795                                 HandleStartupProcInterrupts();
6796
6797                                 /*
6798                                  * Pause WAL replay, if requested by a hot-standby session via
6799                                  * SetRecoveryPause().
6800                                  *
6801                                  * Note that we intentionally don't take the info_lck spinlock
6802                                  * here.  We might therefore read a slightly stale value of
6803                                  * the recoveryPause flag, but it can't be very stale (no
6804                                  * worse than the last spinlock we did acquire).  Since a
6805                                  * pause request is a pretty asynchronous thing anyway,
6806                                  * possibly responding to it one WAL record later than we
6807                                  * otherwise would is a minor issue, so it doesn't seem worth
6808                                  * adding another spinlock cycle to prevent that.
6809                                  */
6810                                 if (xlogctl->recoveryPause)
6811                                         recoveryPausesHere();
6812
6813                                 /*
6814                                  * Have we reached our recovery target?
6815                                  */
6816                                 if (recoveryStopsHere(record, &recoveryApply, &recoveryDelay))
6817                                 {
6818                                         if (recoveryPauseAtTarget)
6819                                         {
6820                                                 SetRecoveryPause(true);
6821                                                 recoveryPausesHere();
6822                                         }
6823                                         reachedStopPoint = true;        /* see below */
6824                                         recoveryContinue = false;
6825
6826                                         /* Exit loop if we reached non-inclusive recovery target */
6827                                         if (!recoveryApply)
6828                                                 break;
6829                                 }
6830
6831                                 /*
6832                                  * If we've been asked to lag the master, wait on
6833                                  * latch until enough time has passed.
6834                                  */
6835                                 if (recoveryDelay)
6836                                 {
6837                                         recoveryApplyDelay();
6838
6839                                         /*
6840                                          * We test for paused recovery again here. If the
6841                                          * user sets delayed apply, it may be because
6842                                          * they expect to pause recovery in case of
6843                                          * problems, so we must test again here; otherwise
6844                                          * pausing during the delay-wait wouldn't work.
6845                                          */
6846                                         if (xlogctl->recoveryPause)
6847                                                 recoveryPausesHere();
6848                                 }
6849
6850                                 /* Setup error traceback support for ereport() */
6851                                 errcallback.callback = rm_redo_error_callback;
6852                                 errcallback.arg = (void *) record;
6853                                 errcallback.previous = error_context_stack;
6854                                 error_context_stack = &errcallback;
6855
6856                                 /*
6857                                  * ShmemVariableCache->nextXid must be beyond record's xid.
6858                                  *
6859                                  * We don't expect anyone else to modify nextXid, hence we
6860                                  * don't need to hold a lock while examining it.  We still
6861                                  * acquire the lock to modify it, though.
6862                                  */
6863                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
6864                                                                                                  ShmemVariableCache->nextXid))
6865                                 {
6866                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
6867                                         ShmemVariableCache->nextXid = record->xl_xid;
6868                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
6869                                         LWLockRelease(XidGenLock);
6870                                 }
6871
6872                                 /*
6873                                  * Before replaying this record, check if this record causes
6874                                  * the current timeline to change. The record is already
6875                                  * considered to be part of the new timeline, so we update
6876                                  * ThisTimeLineID before replaying it. That's important so
6877                                  * that replayEndTLI, which is recorded as the minimum
6878                                  * recovery point's TLI if recovery stops after this record,
6879                                  * is set correctly.
6880                                  */
6881                                 if (record->xl_rmid == RM_XLOG_ID)
6882                                 {
6883                                         TimeLineID      newTLI = ThisTimeLineID;
6884                                         TimeLineID      prevTLI = ThisTimeLineID;
6885                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
6886
6887                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
6888                                         {
6889                                                 CheckPoint      checkPoint;
6890
6891                                                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6892                                                 newTLI = checkPoint.ThisTimeLineID;
6893                                                 prevTLI = checkPoint.PrevTimeLineID;
6894                                         }
6895                                         else if (info == XLOG_END_OF_RECOVERY)
6896                                         {
6897                                                 xl_end_of_recovery xlrec;
6898
6899                                                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
6900                                                 newTLI = xlrec.ThisTimeLineID;
6901                                                 prevTLI = xlrec.PrevTimeLineID;
6902                                         }
6903
6904                                         if (newTLI != ThisTimeLineID)
6905                                         {
6906                                                 /* Check that it's OK to switch to this TLI */
6907                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
6908
6909                                                 /* Following WAL records should be run with new TLI */
6910                                                 ThisTimeLineID = newTLI;
6911                                                 switchedTLI = true;
6912                                         }
6913                                 }
6914
6915                                 /*
6916                                  * Update shared replayEndRecPtr before replaying this record,
6917                                  * so that XLogFlush will update minRecoveryPoint correctly.
6918                                  */
6919                                 SpinLockAcquire(&xlogctl->info_lck);
6920                                 xlogctl->replayEndRecPtr = EndRecPtr;
6921                                 xlogctl->replayEndTLI = ThisTimeLineID;
6922                                 SpinLockRelease(&xlogctl->info_lck);
6923
6924                                 /*
6925                                  * If we are attempting to enter Hot Standby mode, process
6926                                  * XIDs we see
6927                                  */
6928                                 if (standbyState >= STANDBY_INITIALIZED &&
6929                                         TransactionIdIsValid(record->xl_xid))
6930                                         RecordKnownAssignedTransactionIds(record->xl_xid);
6931
6932                                 /* Now apply the WAL record itself */
6933                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
6934
6935                                 /* Pop the error context stack */
6936                                 error_context_stack = errcallback.previous;
6937
6938                                 /*
6939                                  * Update lastReplayedEndRecPtr after this record has been
6940                                  * successfully replayed.
6941                                  */
6942                                 SpinLockAcquire(&xlogctl->info_lck);
6943                                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
6944                                 xlogctl->lastReplayedTLI = ThisTimeLineID;
6945                                 SpinLockRelease(&xlogctl->info_lck);
6946
6947                                 /* Remember this record as the last-applied one */
6948                                 LastRec = ReadRecPtr;
6949
6950                                 /* Allow read-only connections if we're consistent now */
6951                                 CheckRecoveryConsistency();
6952
6953                                 /*
6954                                  * If this record was a timeline switch, wake up any
6955                                  * walsenders to notice that we are on a new timeline.
6956                                  */
6957                                 if (switchedTLI && AllowCascadeReplication())
6958                                         WalSndWakeup();
6959
6960                                 /* Exit loop if we reached inclusive recovery target */
6961                                 if (!recoveryContinue)
6962                                         break;
6963
6964                                 /* Else, try to fetch the next WAL record */
6965                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6966                         } while (record != NULL);
6967
6968                         /*
6969                          * end of main redo apply loop
6970                          */
6971
6972                         ereport(LOG,
6973                                         (errmsg("redo done at %X/%X",
6974                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6975                         xtime = GetLatestXTime();
6976                         if (xtime)
6977                                 ereport(LOG,
6978                                          (errmsg("last completed transaction was at log time %s",
6979                                                          timestamptz_to_str(xtime))));
6980                         InRedo = false;
6981                 }
6982                 else
6983                 {
6984                         /* there are no WAL records following the checkpoint */
6985                         ereport(LOG,
6986                                         (errmsg("redo is not required")));
6987                 }
6988         }
6989
6990         /*
6991          * Kill WAL receiver, if it's still running, before we continue to write
6992          * the startup checkpoint record. It will trump over the checkpoint and
6993          * subsequent records if it's still alive when we start writing WAL.
6994          */
6995         ShutdownWalRcv();
6996
6997         /*
6998          * We don't need the latch anymore. It's not strictly necessary to disown
6999          * it, but let's do it for the sake of tidiness.
7000          */
7001         if (StandbyModeRequested)
7002                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
7003
7004         /*
7005          * We are now done reading the xlog from stream. Turn off streaming
7006          * recovery to force fetching the files (which would be required at end of
7007          * recovery, e.g., timeline history file) from archive or pg_xlog.
7008          */
7009         StandbyMode = false;
7010
7011         /*
7012          * Re-fetch the last valid or last applied record, so we can identify the
7013          * exact endpoint of what we consider the valid portion of WAL.
7014          */
7015         record = ReadRecord(xlogreader, LastRec, PANIC, false);
7016         EndOfLog = EndRecPtr;
7017         XLByteToPrevSeg(EndOfLog, endLogSegNo);
7018
7019         /*
7020          * Complain if we did not roll forward far enough to render the backup
7021          * dump consistent.  Note: it is indeed okay to look at the local variable
7022          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7023          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7024          * advanced beyond the WAL we processed.
7025          */
7026         if (InRecovery &&
7027                 (EndOfLog < minRecoveryPoint ||
7028                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7029         {
7030                 if (reachedStopPoint)
7031                 {
7032                         /* stopped because of stop request */
7033                         ereport(FATAL,
7034                                         (errmsg("requested recovery stop point is before consistent recovery point")));
7035                 }
7036
7037                 /*
7038                  * Ran off end of WAL before reaching end-of-backup WAL record, or
7039                  * minRecoveryPoint. That's usually a bad sign, indicating that you
7040                  * tried to recover from an online backup but never called
7041                  * pg_stop_backup(), or you didn't archive all the WAL up to that
7042                  * point. However, this also happens in crash recovery, if the system
7043                  * crashes while an online backup is in progress. We must not treat
7044                  * that as an error, or the database will refuse to start up.
7045                  */
7046                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7047                 {
7048                         if (ControlFile->backupEndRequired)
7049                                 ereport(FATAL,
7050                                                 (errmsg("WAL ends before end of online backup"),
7051                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
7052                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7053                                 ereport(FATAL,
7054                                                 (errmsg("WAL ends before end of online backup"),
7055                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7056                         else
7057                                 ereport(FATAL,
7058                                           (errmsg("WAL ends before consistent recovery point")));
7059                 }
7060         }
7061
7062         /*
7063          * Consider whether we need to assign a new timeline ID.
7064          *
7065          * If we are doing an archive recovery, we always assign a new ID.      This
7066          * handles a couple of issues.  If we stopped short of the end of WAL
7067          * during recovery, then we are clearly generating a new timeline and must
7068          * assign it a unique new ID.  Even if we ran to the end, modifying the
7069          * current last segment is problematic because it may result in trying to
7070          * overwrite an already-archived copy of that segment, and we encourage
7071          * DBAs to make their archive_commands reject that.  We can dodge the
7072          * problem by making the new active segment have a new timeline ID.
7073          *
7074          * In a normal crash recovery, we can just extend the timeline we were in.
7075          */
7076         PrevTimeLineID = ThisTimeLineID;
7077         if (ArchiveRecoveryRequested)
7078         {
7079                 char            reason[200];
7080
7081                 Assert(InArchiveRecovery);
7082
7083                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7084                 ereport(LOG,
7085                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7086
7087                 /*
7088                  * Create a comment for the history file to explain why and where
7089                  * timeline changed.
7090                  */
7091                 if (recoveryTarget == RECOVERY_TARGET_XID)
7092                         snprintf(reason, sizeof(reason),
7093                                          "%s transaction %u",
7094                                          recoveryStopAfter ? "after" : "before",
7095                                          recoveryStopXid);
7096                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
7097                         snprintf(reason, sizeof(reason),
7098                                          "%s %s\n",
7099                                          recoveryStopAfter ? "after" : "before",
7100                                          timestamptz_to_str(recoveryStopTime));
7101                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
7102                         snprintf(reason, sizeof(reason),
7103                                          "at restore point \"%s\"",
7104                                          recoveryStopName);
7105                 else
7106                         snprintf(reason, sizeof(reason), "no recovery target specified");
7107
7108                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7109                                                          EndRecPtr, reason);
7110         }
7111
7112         /* Save the selected TimeLineID in shared memory, too */
7113         XLogCtl->ThisTimeLineID = ThisTimeLineID;
7114         XLogCtl->PrevTimeLineID = PrevTimeLineID;
7115
7116         /*
7117          * We are now done reading the old WAL.  Turn off archive fetching if it
7118          * was active, and make a writable copy of the last WAL segment. (Note
7119          * that we also have a copy of the last block of the old WAL in readBuf;
7120          * we will use that below.)
7121          */
7122         if (ArchiveRecoveryRequested)
7123                 exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
7124
7125         /*
7126          * Prepare to write WAL starting at EndOfLog position, and init xlog
7127          * buffer cache using the block containing the last record from the
7128          * previous incarnation.
7129          */
7130         openLogSegNo = endLogSegNo;
7131         openLogFile = XLogFileOpen(openLogSegNo);
7132         openLogOff = 0;
7133         Insert = &XLogCtl->Insert;
7134         Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7135         Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7136
7137         /*
7138          * Tricky point here: readBuf contains the *last* block that the LastRec
7139          * record spans, not the one it starts in.      The last block is indeed the
7140          * one we want to use.
7141          */
7142         if (EndOfLog % XLOG_BLCKSZ != 0)
7143         {
7144                 char       *page;
7145                 int                     len;
7146                 int                     firstIdx;
7147                 XLogRecPtr      pageBeginPtr;
7148
7149                 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7150                 Assert(readOff == pageBeginPtr % XLogSegSize);
7151
7152                 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7153
7154                 /* Copy the valid part of the last block, and zero the rest */
7155                 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7156                 len = EndOfLog % XLOG_BLCKSZ;
7157                 memcpy(page, xlogreader->readBuf, len);
7158                 memset(page + len, 0, XLOG_BLCKSZ - len);
7159
7160                 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7161                 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7162         }
7163         else
7164         {
7165                 /*
7166                  * There is no partial block to copy. Just set InitializedUpTo,
7167                  * and let the first attempt to insert a log record initialize
7168                  * the next buffer.
7169                  */
7170                 XLogCtl->InitializedUpTo = EndOfLog;
7171         }
7172
7173         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7174
7175         XLogCtl->LogwrtResult = LogwrtResult;
7176
7177         XLogCtl->LogwrtRqst.Write = EndOfLog;
7178         XLogCtl->LogwrtRqst.Flush = EndOfLog;
7179
7180         /* Pre-scan prepared transactions to find out the range of XIDs present */
7181         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7182
7183         /*
7184          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7185          * record before resource manager writes cleanup WAL records or checkpoint
7186          * record is written.
7187          */
7188         Insert->fullPageWrites = lastFullPageWrites;
7189         LocalSetXLogInsertAllowed();
7190         UpdateFullPageWrites();
7191         LocalXLogInsertAllowed = -1;
7192
7193         if (InRecovery)
7194         {
7195                 int                     rmid;
7196
7197                 /*
7198                  * Resource managers might need to write WAL records, eg, to record
7199                  * index cleanup actions.  So temporarily enable XLogInsertAllowed in
7200                  * this process only.
7201                  */
7202                 LocalSetXLogInsertAllowed();
7203
7204                 /*
7205                  * Allow resource managers to do any required cleanup.
7206                  */
7207                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7208                 {
7209                         if (RmgrTable[rmid].rm_cleanup != NULL)
7210                                 RmgrTable[rmid].rm_cleanup();
7211                 }
7212
7213                 /* Disallow XLogInsert again */
7214                 LocalXLogInsertAllowed = -1;
7215
7216                 /*
7217                  * Perform a checkpoint to update all our recovery activity to disk.
7218                  *
7219                  * Note that we write a shutdown checkpoint rather than an on-line
7220                  * one. This is not particularly critical, but since we may be
7221                  * assigning a new TLI, using a shutdown checkpoint allows us to have
7222                  * the rule that TLI only changes in shutdown checkpoints, which
7223                  * allows some extra error checking in xlog_redo.
7224                  *
7225                  * In fast promotion, only create a lightweight end-of-recovery record
7226                  * instead of a full checkpoint. A checkpoint is requested later,
7227                  * after we're fully out of recovery mode and already accepting
7228                  * queries.
7229                  */
7230                 if (bgwriterLaunched)
7231                 {
7232                         if (fast_promote)
7233                         {
7234                                 checkPointLoc = ControlFile->prevCheckPoint;
7235
7236                                 /*
7237                                  * Confirm the last checkpoint is available for us to recover
7238                                  * from if we fail. Note that we don't check for the secondary
7239                                  * checkpoint since that isn't available in most base backups.
7240                                  */
7241                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7242                                 if (record != NULL)
7243                                 {
7244                                         fast_promoted = true;
7245
7246                                         /*
7247                                          * Insert a special WAL record to mark the end of
7248                                          * recovery, since we aren't doing a checkpoint. That
7249                                          * means that the checkpointer process may well be in
7250                                          * the middle of a time-smoothed restartpoint and could
7251                                          * continue to be for minutes after this. That sounds
7252                                          * strange, but the effect is roughly the same and it
7253                                          * would be stranger to try to come out of the
7254                                          * restartpoint and then checkpoint. We request a
7255                                          * checkpoint later anyway, just for safety.
7256                                          */
7257                                         CreateEndOfRecoveryRecord();
7258                                 }
7259                         }
7260
7261                         if (!fast_promoted)
7262                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7263                                                                   CHECKPOINT_IMMEDIATE |
7264                                                                   CHECKPOINT_WAIT);
7265                 }
7266                 else
7267                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7268
7269                 /*
7270                  * And finally, execute the recovery_end_command, if any.
7271                  */
7272                 if (recoveryEndCommand)
7273                         ExecuteRecoveryCommand(recoveryEndCommand,
7274                                                                    "recovery_end_command",
7275                                                                    true);
7276         }
7277
7278         /*
7279          * Preallocate additional log files, if wanted.
7280          */
7281         PreallocXlogFiles(EndOfLog);
7282
7283         /*
7284          * Reset initial contents of unlogged relations.  This has to be done
7285          * AFTER recovery is complete so that any unlogged relations created
7286          * during recovery also get picked up.
7287          */
7288         if (InRecovery)
7289                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7290
7291         /*
7292          * Okay, we're officially UP.
7293          */
7294         InRecovery = false;
7295
7296         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7297         ControlFile->state = DB_IN_PRODUCTION;
7298         ControlFile->time = (pg_time_t) time(NULL);
7299         UpdateControlFile();
7300         LWLockRelease(ControlFileLock);
7301
7302         /* start the archive_timeout timer running */
7303         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7304
7305         /* also initialize latestCompletedXid, to nextXid - 1 */
7306         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7307         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7308         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7309         LWLockRelease(ProcArrayLock);
7310
7311         /*
7312          * Start up the commit log and subtrans, if not already done for hot
7313          * standby.
7314          */
7315         if (standbyState == STANDBY_DISABLED)
7316         {
7317                 StartupCLOG();
7318                 StartupSUBTRANS(oldestActiveXID);
7319         }
7320
7321         /*
7322          * Perform end of recovery actions for any SLRUs that need it.
7323          */
7324         TrimCLOG();
7325         TrimMultiXact();
7326
7327         /* Reload shared-memory state for prepared transactions */
7328         RecoverPreparedTransactions();
7329
7330         /*
7331          * Shut down the recovery environment. This must occur after
7332          * RecoverPreparedTransactions(); see notes for lock_twophase_recover().
7333          */
7334         if (standbyState != STANDBY_DISABLED)
7335                 ShutdownRecoveryTransactionEnvironment();
7336
7337         /* Shut down xlogreader */
7338         if (readFile >= 0)
7339         {
7340                 close(readFile);
7341                 readFile = -1;
7342         }
7343         XLogReaderFree(xlogreader);
7344
7345         /*
7346          * If any of the critical GUCs have changed, log them before we allow
7347          * backends to write WAL.
7348          */
7349         LocalSetXLogInsertAllowed();
7350         XLogReportParameters();
7351
7352         /*
7353          * All done.  Allow backends to write WAL.      (Although the bool flag is
7354          * probably atomic in itself, we use the info_lck here to ensure that
7355          * there are no race conditions concerning visibility of other recent
7356          * updates to shared memory.)
7357          */
7358         {
7359                 /* use volatile pointer to prevent code rearrangement */
7360                 volatile XLogCtlData *xlogctl = XLogCtl;
7361
7362                 SpinLockAcquire(&xlogctl->info_lck);
7363                 xlogctl->SharedRecoveryInProgress = false;
7364                 SpinLockRelease(&xlogctl->info_lck);
7365         }
7366
7367         /*
7368          * If there were cascading standby servers connected to us, nudge any wal
7369          * sender processes to notice that we've been promoted.
7370          */
7371         WalSndWakeup();
7372
7373         /*
7374          * If this was a fast promotion, request an (online) checkpoint now. This
7375          * isn't required for consistency, but the last restartpoint might be far
7376          * back, and in case of a crash, recovering from it might take longer
7377          * than is appropriate now that we're not in standby mode anymore.
7378          */
7379         if (fast_promoted)
7380                 RequestCheckpoint(CHECKPOINT_FORCE);
7381 }
7382
7383 /*
7384  * Checks if recovery has reached a consistent state. When consistency is
7385  * reached and we have a valid starting standby snapshot, tell postmaster
7386  * that it can start accepting read-only connections.
7387  */
7388 static void
7389 CheckRecoveryConsistency(void)
7390 {
7391         /*
7392          * During crash recovery, we don't reach a consistent state until we've
7393          * replayed all the WAL.
7394          */
7395         if (XLogRecPtrIsInvalid(minRecoveryPoint))
7396                 return;
7397
7398         /*
7399          * Have we reached the point where our base backup was completed?
7400          */
7401         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7402                 ControlFile->backupEndPoint <= EndRecPtr)
7403         {
7404                 /*
7405                  * We have reached the end of base backup, as indicated by pg_control.
7406                  * The data on disk is now consistent. Reset backupStartPoint and
7407                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
7408                  * allow starting up at an earlier point even if recovery is stopped
7409                  * and restarted soon after this.
7410                  */
7411                 elog(DEBUG1, "end of backup reached");
7412
7413                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7414
7415                 if (ControlFile->minRecoveryPoint < EndRecPtr)
7416                         ControlFile->minRecoveryPoint = EndRecPtr;
7417
7418                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
7419                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
7420                 ControlFile->backupEndRequired = false;
7421                 UpdateControlFile();
7422
7423                 LWLockRelease(ControlFileLock);
7424         }
7425
7426         /*
7427          * Have we passed our safe starting point? Note that minRecoveryPoint is
7428          * known to be incorrectly set if ControlFile->backupEndRequired, until
7429          * the XLOG_BACKUP_RECORD arrives to advise us of the correct
7430          * minRecoveryPoint. All we know prior to that is that we're not
7431          * consistent yet.
7432          */
7433         if (!reachedConsistency && !ControlFile->backupEndRequired &&
7434                 minRecoveryPoint <= XLogCtl->lastReplayedEndRecPtr &&
7435                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7436         {
7437                 /*
7438                  * Check to see if the XLOG sequence contained any unresolved
7439                  * references to uninitialized pages.
7440                  */
7441                 XLogCheckInvalidPages();
7442
7443                 reachedConsistency = true;
7444                 ereport(LOG,
7445                                 (errmsg("consistent recovery state reached at %X/%X",
7446                                                 (uint32) (XLogCtl->lastReplayedEndRecPtr >> 32),
7447                                                 (uint32) XLogCtl->lastReplayedEndRecPtr)));
7448         }
7449
7450         /*
7451          * Have we got a valid starting snapshot that will allow queries to be
7452          * run? If so, we can tell postmaster that the database is consistent now,
7453          * enabling connections.
7454          */
7455         if (standbyState == STANDBY_SNAPSHOT_READY &&
7456                 !LocalHotStandbyActive &&
7457                 reachedConsistency &&
7458                 IsUnderPostmaster)
7459         {
7460                 /* use volatile pointer to prevent code rearrangement */
7461                 volatile XLogCtlData *xlogctl = XLogCtl;
7462
7463                 SpinLockAcquire(&xlogctl->info_lck);
7464                 xlogctl->SharedHotStandbyActive = true;
7465                 SpinLockRelease(&xlogctl->info_lck);
7466
7467                 LocalHotStandbyActive = true;
7468
7469                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7470         }
7471 }
7472
7473 /*
7474  * Is the system still in recovery?
7475  *
7476  * Unlike testing InRecovery, this works in any process that's connected to
7477  * shared memory.
7478  *
7479  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7480  * variables the first time we see that recovery is finished.
7481  */
7482 bool
7483 RecoveryInProgress(void)
7484 {
7485         /*
7486          * We check shared state each time only until we leave recovery mode. We
7487          * can't re-enter recovery, so there's no need to keep checking after the
7488          * shared variable has once been seen false.
7489          */
7490         if (!LocalRecoveryInProgress)
7491                 return false;
7492         else
7493         {
7494                 /*
7495                  * use volatile pointer to make sure we make a fresh read of the
7496                  * shared variable.
7497                  */
7498                 volatile XLogCtlData *xlogctl = XLogCtl;
7499
7500                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7501
7502                 /*
7503                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
7504                  * is finished. InitPostgres() relies upon this behaviour to ensure
7505                  * that InitXLOGAccess() is called at backend startup.  (If you change
7506                  * this, see also LocalSetXLogInsertAllowed.)
7507                  */
7508                 if (!LocalRecoveryInProgress)
7509                 {
7510                         /*
7511                          * If we just exited recovery, make sure we read TimeLineID and
7512                          * RedoRecPtr after SharedRecoveryInProgress (for machines with
7513                          * weak memory ordering).
7514                          */
7515                         pg_memory_barrier();
7516                         InitXLOGAccess();
7517                 }
7518                 /*
7519                  * Note: We don't need a memory barrier when we're still in recovery.
7520                  * We might exit recovery immediately after return, so the caller
7521                  * can't rely on 'true' meaning that we're still in recovery anyway.
7522                  */
7523
7524                 return LocalRecoveryInProgress;
7525         }
7526 }
7527
7528 /*
7529  * Is HotStandby active yet? This is only important in special backends
7530  * since normal backends won't ever be able to connect until this returns
7531  * true. Postmaster knows this by way of signal, not via shared memory.
7532  *
7533  * Unlike testing standbyState, this works in any process that's connected to
7534  * shared memory.
7535  */
7536 bool
7537 HotStandbyActive(void)
7538 {
7539         /*
7540          * We check shared state each time only until Hot Standby is active. We
7541          * can't de-activate Hot Standby, so there's no need to keep checking
7542          * after the shared variable has once been seen true.
7543          */
7544         if (LocalHotStandbyActive)
7545                 return true;
7546         else
7547         {
7548                 /* use volatile pointer to prevent code rearrangement */
7549                 volatile XLogCtlData *xlogctl = XLogCtl;
7550
7551                 /* spinlock is essential on machines with weak memory ordering! */
7552                 SpinLockAcquire(&xlogctl->info_lck);
7553                 LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
7554                 SpinLockRelease(&xlogctl->info_lck);
7555
7556                 return LocalHotStandbyActive;
7557         }
7558 }
7559
7560 /*
7561  * Is this process allowed to insert new WAL records?
7562  *
7563  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7564  * But we also have provisions for forcing the result "true" or "false"
7565  * within specific processes regardless of the global state.
7566  */
7567 bool
7568 XLogInsertAllowed(void)
7569 {
7570         /*
7571          * If value is "unconditionally true" or "unconditionally false", just
7572          * return it.  This provides the normal fast path once recovery is known
7573          * done.
7574          */
7575         if (LocalXLogInsertAllowed >= 0)
7576                 return (bool) LocalXLogInsertAllowed;
7577
7578         /*
7579          * Else, must check to see if we're still in recovery.
7580          */
7581         if (RecoveryInProgress())
7582                 return false;
7583
7584         /*
7585          * On exit from recovery, reset to "unconditionally true", since there is
7586          * no need to keep checking.
7587          */
7588         LocalXLogInsertAllowed = 1;
7589         return true;
7590 }
7591
7592 /*
7593  * Make XLogInsertAllowed() return true in the current process only.
7594  *
7595  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
7596  * and even call LocalSetXLogInsertAllowed() again after that.
7597  */
7598 static void
7599 LocalSetXLogInsertAllowed(void)
7600 {
7601         Assert(LocalXLogInsertAllowed == -1);
7602         LocalXLogInsertAllowed = 1;
7603
7604         /* Initialize as RecoveryInProgress() would do when switching state */
7605         InitXLOGAccess();
7606 }
7607
7608 /*
7609  * Subroutine to try to fetch and validate a prior checkpoint record.
7610  *
7611  * whichChkpt identifies the checkpoint (merely for reporting purposes).
7612  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7613  */
7614 static XLogRecord *
7615 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
7616                                          int whichChkpt, bool report)
7617 {
7618         XLogRecord *record;
7619
7620         if (!XRecOffIsValid(RecPtr))
7621         {
7622                 if (!report)
7623                         return NULL;
7624
7625                 switch (whichChkpt)
7626                 {
7627                         case 1:
7628                                 ereport(LOG,
7629                                 (errmsg("invalid primary checkpoint link in control file")));
7630                                 break;
7631                         case 2:
7632                                 ereport(LOG,
7633                                                 (errmsg("invalid secondary checkpoint link in control file")));
7634                                 break;
7635                         default:
7636                                 ereport(LOG,
7637                                    (errmsg("invalid checkpoint link in backup_label file")));
7638                                 break;
7639                 }
7640                 return NULL;
7641         }
7642
7643         record = ReadRecord(xlogreader, RecPtr, LOG, true);
7644
7645         if (record == NULL)
7646         {
7647                 if (!report)
7648                         return NULL;
7649
7650                 switch (whichChkpt)
7651                 {
7652                         case 1:
7653                                 ereport(LOG,
7654                                                 (errmsg("invalid primary checkpoint record")));
7655                                 break;
7656                         case 2:
7657                                 ereport(LOG,
7658                                                 (errmsg("invalid secondary checkpoint record")));
7659                                 break;
7660                         default:
7661                                 ereport(LOG,
7662                                                 (errmsg("invalid checkpoint record")));
7663                                 break;
7664                 }
7665                 return NULL;
7666         }
7667         if (record->xl_rmid != RM_XLOG_ID)
7668         {
7669                 switch (whichChkpt)
7670                 {
7671                         case 1:
7672                                 ereport(LOG,
7673                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
7674                                 break;
7675                         case 2:
7676                                 ereport(LOG,
7677                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
7678                                 break;
7679                         default:
7680                                 ereport(LOG,
7681                                 (errmsg("invalid resource manager ID in checkpoint record")));
7682                                 break;
7683                 }
7684                 return NULL;
7685         }
7686         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
7687                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
7688         {
7689                 switch (whichChkpt)
7690                 {
7691                         case 1:
7692                                 ereport(LOG,
7693                                    (errmsg("invalid xl_info in primary checkpoint record")));
7694                                 break;
7695                         case 2:
7696                                 ereport(LOG,
7697                                  (errmsg("invalid xl_info in secondary checkpoint record")));
7698                                 break;
7699                         default:
7700                                 ereport(LOG,
7701                                                 (errmsg("invalid xl_info in checkpoint record")));
7702                                 break;
7703                 }
7704                 return NULL;
7705         }
7706         if (record->xl_len != sizeof(CheckPoint) ||
7707                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
7708         {
7709                 switch (whichChkpt)
7710                 {
7711                         case 1:
7712                                 ereport(LOG,
7713                                         (errmsg("invalid length of primary checkpoint record")));
7714                                 break;
7715                         case 2:
7716                                 ereport(LOG,
7717                                   (errmsg("invalid length of secondary checkpoint record")));
7718                                 break;
7719                         default:
7720                                 ereport(LOG,
7721                                                 (errmsg("invalid length of checkpoint record")));
7722                                 break;
7723                 }
7724                 return NULL;
7725         }
7726         return record;
7727 }
7728
7729 /*
7730  * This must be called during startup of a backend process, except that
7731  * it need not be called in a standalone backend (which does StartupXLOG
7732  * instead).  We need to initialize the local copies of ThisTimeLineID and
7733  * RedoRecPtr.
7734  *
7735  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
7736  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
7737  * unnecessary however, since the postmaster itself never touches XLOG anyway.
7738  */
7739 void
7740 InitXLOGAccess(void)
7741 {
7742         /* ThisTimeLineID doesn't change so we need no lock to copy it */
7743         ThisTimeLineID = XLogCtl->ThisTimeLineID;
7744         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
7745
7746         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
7747         (void) GetRedoRecPtr();
7748 }
7749
7750 /*
7751  * Return the current Redo pointer from shared memory.
7752  *
7753  * As a side-effect, the local RedoRecPtr copy is updated.
7754  */
7755 XLogRecPtr
7756 GetRedoRecPtr(void)
7757 {
7758         /* use volatile pointer to prevent code rearrangement */
7759         volatile XLogCtlData *xlogctl = XLogCtl;
7760         XLogRecPtr ptr;
7761
7762         /*
7763          * The possibly not up-to-date copy in XlogCtl is enough. Even if we
7764          * grabbed a WAL insertion slot to read the master copy, someone might
7765          * update it just after we've released the lock.
7766          */
7767         SpinLockAcquire(&xlogctl->info_lck);
7768         ptr = xlogctl->RedoRecPtr;
7769         SpinLockRelease(&xlogctl->info_lck);
7770
7771         if (RedoRecPtr < ptr)
7772                 RedoRecPtr = ptr;
7773
7774         return RedoRecPtr;
7775 }
7776
7777 /*
7778  * GetInsertRecPtr -- Returns the current insert position.
7779  *
7780  * NOTE: The value *actually* returned is the position of the last full
7781  * xlog page. It lags behind the real insert position by at most 1 page.
7782  * For that, we don't need to scan through WAL insertion slots, and an
7783  * approximation is enough for the current usage of this function.
7784  */
7785 XLogRecPtr
7786 GetInsertRecPtr(void)
7787 {
7788         /* use volatile pointer to prevent code rearrangement */
7789         volatile XLogCtlData *xlogctl = XLogCtl;
7790         XLogRecPtr      recptr;
7791
7792         SpinLockAcquire(&xlogctl->info_lck);
7793         recptr = xlogctl->LogwrtRqst.Write;
7794         SpinLockRelease(&xlogctl->info_lck);
7795
7796         return recptr;
7797 }
7798
7799 /*
7800  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
7801  * position known to be fsync'd to disk.
7802  */
7803 XLogRecPtr
7804 GetFlushRecPtr(void)
7805 {
7806         /* use volatile pointer to prevent code rearrangement */
7807         volatile XLogCtlData *xlogctl = XLogCtl;
7808         XLogRecPtr      recptr;
7809
7810         SpinLockAcquire(&xlogctl->info_lck);
7811         recptr = xlogctl->LogwrtResult.Flush;
7812         SpinLockRelease(&xlogctl->info_lck);
7813
7814         return recptr;
7815 }
7816
7817 /*
7818  * Get the time of the last xlog segment switch
7819  */
7820 pg_time_t
7821 GetLastSegSwitchTime(void)
7822 {
7823         pg_time_t       result;
7824
7825         /* Need WALWriteLock, but shared lock is sufficient */
7826         LWLockAcquire(WALWriteLock, LW_SHARED);
7827         result = XLogCtl->lastSegSwitchTime;
7828         LWLockRelease(WALWriteLock);
7829
7830         return result;
7831 }
7832
7833 /*
7834  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
7835  *
7836  * This is exported for use by code that would like to have 64-bit XIDs.
7837  * We don't really support such things, but all XIDs within the system
7838  * can be presumed "close to" the result, and thus the epoch associated
7839  * with them can be determined.
7840  */
7841 void
7842 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
7843 {
7844         uint32          ckptXidEpoch;
7845         TransactionId ckptXid;
7846         TransactionId nextXid;
7847
7848         /* Must read checkpoint info first, else have race condition */
7849         {
7850                 /* use volatile pointer to prevent code rearrangement */
7851                 volatile XLogCtlData *xlogctl = XLogCtl;
7852
7853                 SpinLockAcquire(&xlogctl->info_lck);
7854                 ckptXidEpoch = xlogctl->ckptXidEpoch;
7855                 ckptXid = xlogctl->ckptXid;
7856                 SpinLockRelease(&xlogctl->info_lck);
7857         }
7858
7859         /* Now fetch current nextXid */
7860         nextXid = ReadNewTransactionId();
7861
7862         /*
7863          * nextXid is certainly logically later than ckptXid.  So if it's
7864          * numerically less, it must have wrapped into the next epoch.
7865          */
7866         if (nextXid < ckptXid)
7867                 ckptXidEpoch++;
7868
7869         *xid = nextXid;
7870         *epoch = ckptXidEpoch;
7871 }
7872
7873 /*
7874  * This must be called ONCE during postmaster or standalone-backend shutdown
7875  */
7876 void
7877 ShutdownXLOG(int code, Datum arg)
7878 {
7879         /* Don't be chatty in standalone mode */
7880         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
7881                         (errmsg("shutting down")));
7882
7883         if (RecoveryInProgress())
7884                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7885         else
7886         {
7887                 /*
7888                  * If archiving is enabled, rotate the last XLOG file so that all the
7889                  * remaining records are archived (postmaster wakes up the archiver
7890                  * process one more time at the end of shutdown). The checkpoint
7891                  * record will go to the next XLOG file and won't be archived (yet).
7892                  */
7893                 if (XLogArchivingActive() && XLogArchiveCommandSet())
7894                         RequestXLogSwitch();
7895
7896                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7897         }
7898         ShutdownCLOG();
7899         ShutdownSUBTRANS();
7900         ShutdownMultiXact();
7901
7902         /* Don't be chatty in standalone mode */
7903         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
7904                         (errmsg("database system is shut down")));
7905 }
7906
7907 /*
7908  * Log start of a checkpoint.
7909  */
7910 static void
7911 LogCheckpointStart(int flags, bool restartpoint)
7912 {
7913         const char *msg;
7914
7915         /*
7916          * XXX: This is hopelessly untranslatable. We could call gettext_noop for
7917          * the main message, but what about all the flags?
7918          */
7919         if (restartpoint)
7920                 msg = "restartpoint starting:%s%s%s%s%s%s%s";
7921         else
7922                 msg = "checkpoint starting:%s%s%s%s%s%s%s";
7923
7924         elog(LOG, msg,
7925                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
7926                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
7927                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
7928                  (flags & CHECKPOINT_FORCE) ? " force" : "",
7929                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
7930                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
7931                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
7932 }
7933
7934 /*
7935  * Log end of a checkpoint.
7936  */
7937 static void
7938 LogCheckpointEnd(bool restartpoint)
7939 {
7940         long            write_secs,
7941                                 sync_secs,
7942                                 total_secs,
7943                                 longest_secs,
7944                                 average_secs;
7945         int                     write_usecs,
7946                                 sync_usecs,
7947                                 total_usecs,
7948                                 longest_usecs,
7949                                 average_usecs;
7950         uint64          average_sync_time;
7951
7952         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
7953
7954         TimestampDifference(CheckpointStats.ckpt_write_t,
7955                                                 CheckpointStats.ckpt_sync_t,
7956                                                 &write_secs, &write_usecs);
7957
7958         TimestampDifference(CheckpointStats.ckpt_sync_t,
7959                                                 CheckpointStats.ckpt_sync_end_t,
7960                                                 &sync_secs, &sync_usecs);
7961
7962         /* Accumulate checkpoint timing summary data, in milliseconds. */
7963         BgWriterStats.m_checkpoint_write_time +=
7964                 write_secs * 1000 + write_usecs / 1000;
7965         BgWriterStats.m_checkpoint_sync_time +=
7966                 sync_secs * 1000 + sync_usecs / 1000;
7967
7968         /*
7969          * All of the published timing statistics are accounted for.  Only
7970          * continue if a log message is to be written.
7971          */
7972         if (!log_checkpoints)
7973                 return;
7974
7975         TimestampDifference(CheckpointStats.ckpt_start_t,
7976                                                 CheckpointStats.ckpt_end_t,
7977                                                 &total_secs, &total_usecs);
7978
7979         /*
7980          * Timing values returned from CheckpointStats are in microseconds.
7981          * Convert to the second plus microsecond form that TimestampDifference
7982          * returns for homogeneous printing.
7983          */
7984         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
7985         longest_usecs = CheckpointStats.ckpt_longest_sync -
7986                 (uint64) longest_secs *1000000;
7987
7988         average_sync_time = 0;
7989         if (CheckpointStats.ckpt_sync_rels > 0)
7990                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
7991                         CheckpointStats.ckpt_sync_rels;
7992         average_secs = (long) (average_sync_time / 1000000);
7993         average_usecs = average_sync_time - (uint64) average_secs *1000000;
7994
7995         if (restartpoint)
7996                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
7997                          "%d transaction log file(s) added, %d removed, %d recycled; "
7998                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
7999                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
8000                          CheckpointStats.ckpt_bufs_written,
8001                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8002                          CheckpointStats.ckpt_segs_added,
8003                          CheckpointStats.ckpt_segs_removed,
8004                          CheckpointStats.ckpt_segs_recycled,
8005                          write_secs, write_usecs / 1000,
8006                          sync_secs, sync_usecs / 1000,
8007                          total_secs, total_usecs / 1000,
8008                          CheckpointStats.ckpt_sync_rels,
8009                          longest_secs, longest_usecs / 1000,
8010                          average_secs, average_usecs / 1000);
8011         else
8012                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
8013                          "%d transaction log file(s) added, %d removed, %d recycled; "
8014                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8015                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
8016                          CheckpointStats.ckpt_bufs_written,
8017                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8018                          CheckpointStats.ckpt_segs_added,
8019                          CheckpointStats.ckpt_segs_removed,
8020                          CheckpointStats.ckpt_segs_recycled,
8021                          write_secs, write_usecs / 1000,
8022                          sync_secs, sync_usecs / 1000,
8023                          total_secs, total_usecs / 1000,
8024                          CheckpointStats.ckpt_sync_rels,
8025                          longest_secs, longest_usecs / 1000,
8026                          average_secs, average_usecs / 1000);
8027 }
8028
8029 /*
8030  * Perform a checkpoint --- either during shutdown, or on-the-fly
8031  *
8032  * flags is a bitwise OR of the following:
8033  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8034  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8035  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8036  *              ignoring checkpoint_completion_target parameter.
8037  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8038  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8039  *              CHECKPOINT_END_OF_RECOVERY).
8040  *
8041  * Note: flags contains other bits, of interest here only for logging purposes.
8042  * In particular note that this routine is synchronous and does not pay
8043  * attention to CHECKPOINT_WAIT.
8044  *
8045  * If !shutdown then we are writing an online checkpoint. This is a very special
8046  * kind of operation and WAL record because the checkpoint action occurs over
8047  * a period of time yet logically occurs at just a single LSN. The logical
8048  * position of the WAL record (redo ptr) is the same or earlier than the
8049  * physical position. When we replay WAL we locate the checkpoint via its
8050  * physical position then read the redo ptr and actually start replay at the
8051  * earlier logical position. Note that we don't write *anything* to WAL at
8052  * the logical position, so that location could be any other kind of WAL record.
8053  * All of this mechanism allows us to continue working while we checkpoint.
8054  * As a result, timing of actions is critical here and be careful to note that
8055  * this function will likely take minutes to execute on a busy system.
8056  */
8057 void
8058 CreateCheckPoint(int flags)
8059 {
8060         /* use volatile pointer to prevent code rearrangement */
8061         volatile XLogCtlData *xlogctl = XLogCtl;
8062         bool            shutdown;
8063         CheckPoint      checkPoint;
8064         XLogRecPtr      recptr;
8065         XLogCtlInsert *Insert = &XLogCtl->Insert;
8066         XLogRecData rdata;
8067         uint32          freespace;
8068         XLogSegNo       _logSegNo;
8069         XLogRecPtr      curInsert;
8070         VirtualTransactionId *vxids;
8071         int                     nvxids;
8072
8073         /*
8074          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
8075          * issued at a different time.
8076          */
8077         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
8078                 shutdown = true;
8079         else
8080                 shutdown = false;
8081
8082         /* sanity check */
8083         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
8084                 elog(ERROR, "can't create a checkpoint during recovery");
8085
8086         /*
8087          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
8088          * (This is just pro forma, since in the present system structure there is
8089          * only one process that is allowed to issue checkpoints at any given
8090          * time.)
8091          */
8092         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8093
8094         /*
8095          * Prepare to accumulate statistics.
8096          *
8097          * Note: because it is possible for log_checkpoints to change while a
8098          * checkpoint proceeds, we always accumulate stats, even if
8099          * log_checkpoints is currently off.
8100          */
8101         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8102         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8103
8104         /*
8105          * Use a critical section to force system panic if we have trouble.
8106          */
8107         START_CRIT_SECTION();
8108
8109         if (shutdown)
8110         {
8111                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8112                 ControlFile->state = DB_SHUTDOWNING;
8113                 ControlFile->time = (pg_time_t) time(NULL);
8114                 UpdateControlFile();
8115                 LWLockRelease(ControlFileLock);
8116         }
8117
8118         /*
8119          * Let smgr prepare for checkpoint; this has to happen before we determine
8120          * the REDO pointer.  Note that smgr must not do anything that'd have to
8121          * be undone if we decide no checkpoint is needed.
8122          */
8123         smgrpreckpt();
8124
8125         /* Begin filling in the checkpoint WAL record */
8126         MemSet(&checkPoint, 0, sizeof(checkPoint));
8127         checkPoint.time = (pg_time_t) time(NULL);
8128
8129         /*
8130          * For Hot Standby, derive the oldestActiveXid before we fix the redo
8131          * pointer. This allows us to begin accumulating changes to assemble our
8132          * starting snapshot of locks and transactions.
8133          */
8134         if (!shutdown && XLogStandbyInfoActive())
8135                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8136         else
8137                 checkPoint.oldestActiveXid = InvalidTransactionId;
8138
8139         /*
8140          * We must block concurrent insertions while examining insert state to
8141          * determine the checkpoint REDO pointer.
8142          */
8143         WALInsertSlotAcquire(true);
8144         curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8145
8146         /*
8147          * If this isn't a shutdown or forced checkpoint, and we have not inserted
8148          * any XLOG records since the start of the last checkpoint, skip the
8149          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
8150          * when the system is idle. That wastes log space, and more importantly it
8151          * exposes us to possible loss of both current and previous checkpoint
8152          * records if the machine crashes just as we're writing the update.
8153          * (Perhaps it'd make even more sense to checkpoint only when the previous
8154          * checkpoint record is in a different xlog page?)
8155          *
8156          * We have to make two tests to determine that nothing has happened since
8157          * the start of the last checkpoint: current insertion point must match
8158          * the end of the last checkpoint record, and its redo pointer must point
8159          * to itself.
8160          */
8161         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8162                                   CHECKPOINT_FORCE)) == 0)
8163         {
8164                 if (curInsert == ControlFile->checkPoint +
8165                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
8166                         ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
8167                 {
8168                         WALInsertSlotRelease();
8169                         LWLockRelease(CheckpointLock);
8170                         END_CRIT_SECTION();
8171                         return;
8172                 }
8173         }
8174
8175         /*
8176          * An end-of-recovery checkpoint is created before anyone is allowed to
8177          * write WAL. To allow us to write the checkpoint record, temporarily
8178          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
8179          * initialized, which we need here and in AdvanceXLInsertBuffer.)
8180          */
8181         if (flags & CHECKPOINT_END_OF_RECOVERY)
8182                 LocalSetXLogInsertAllowed();
8183
8184         checkPoint.ThisTimeLineID = ThisTimeLineID;
8185         if (flags & CHECKPOINT_END_OF_RECOVERY)
8186                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8187         else
8188                 checkPoint.PrevTimeLineID = ThisTimeLineID;
8189
8190         checkPoint.fullPageWrites = Insert->fullPageWrites;
8191
8192         /*
8193          * Compute new REDO record ptr = location of next XLOG record.
8194          *
8195          * NB: this is NOT necessarily where the checkpoint record itself will be,
8196          * since other backends may insert more XLOG records while we're off doing
8197          * the buffer flush work.  Those XLOG records are logically after the
8198          * checkpoint, even though physically before it.  Got that?
8199          */
8200         freespace = INSERT_FREESPACE(curInsert);
8201         if (freespace == 0)
8202         {
8203                 if (curInsert % XLogSegSize == 0)
8204                         curInsert += SizeOfXLogLongPHD;
8205                 else
8206                         curInsert += SizeOfXLogShortPHD;
8207         }
8208         checkPoint.redo = curInsert;
8209
8210         /*
8211          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8212          * must be done while holding the insertion slots.
8213          *
8214          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8215          * pointing past where it really needs to point.  This is okay; the only
8216          * consequence is that XLogInsert might back up whole buffers that it
8217          * didn't really need to.  We can't postpone advancing RedoRecPtr because
8218          * XLogInserts that happen while we are dumping buffers must assume that
8219          * their buffer changes are not included in the checkpoint.
8220          */
8221         RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
8222
8223         /*
8224          * Now we can release the WAL insertion slots, allowing other xacts to
8225          * proceed while we are flushing disk buffers.
8226          */
8227         WALInsertSlotRelease();
8228
8229         /* Update the info_lck-protected copy of RedoRecPtr as well */
8230         SpinLockAcquire(&xlogctl->info_lck);
8231         xlogctl->RedoRecPtr = checkPoint.redo;
8232         SpinLockRelease(&xlogctl->info_lck);
8233
8234         /*
8235          * If enabled, log checkpoint start.  We postpone this until now so as not
8236          * to log anything if we decided to skip the checkpoint.
8237          */
8238         if (log_checkpoints)
8239                 LogCheckpointStart(flags, false);
8240
8241         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8242
8243         /*
8244          * In some cases there are groups of actions that must all occur on one
8245          * side or the other of a checkpoint record. Before flushing the
8246          * checkpoint record we must explicitly wait for any backend currently
8247          * performing those groups of actions.
8248          *
8249          * One example is end of transaction, so we must wait for any transactions
8250          * that are currently in commit critical sections.      If an xact inserted
8251          * its commit record into XLOG just before the REDO point, then a crash
8252          * restart from the REDO point would not replay that record, which means
8253          * that our flushing had better include the xact's update of pg_clog.  So
8254          * we wait till he's out of his commit critical section before proceeding.
8255          * See notes in RecordTransactionCommit().
8256          *
8257          * Because we've already released the insertion slots, this test is a bit
8258          * fuzzy: it is possible that we will wait for xacts we didn't really need
8259          * to wait for.  But the delay should be short and it seems better to make
8260          * checkpoint take a bit longer than to hold off insertions longer than
8261          * necessary.
8262          * (In fact, the whole reason we have this issue is that xact.c does
8263          * commit record XLOG insertion and clog update as two separate steps
8264          * protected by different locks, but again that seems best on grounds of
8265          * minimizing lock contention.)
8266          *
8267          * A transaction that has not yet set delayChkpt when we look cannot be at
8268          * risk, since he's not inserted his commit record yet; and one that's
8269          * already cleared it is not at risk either, since he's done fixing clog
8270          * and we will correctly flush the update below.  So we cannot miss any
8271          * xacts we need to wait for.
8272          */
8273         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8274         if (nvxids > 0)
8275         {
8276                 do
8277                 {
8278                         pg_usleep(10000L);      /* wait for 10 msec */
8279                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8280         }
8281         pfree(vxids);
8282
8283         /*
8284          * Get the other info we need for the checkpoint record.
8285          */
8286         LWLockAcquire(XidGenLock, LW_SHARED);
8287         checkPoint.nextXid = ShmemVariableCache->nextXid;
8288         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8289         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8290         LWLockRelease(XidGenLock);
8291
8292         /* Increase XID epoch if we've wrapped around since last checkpoint */
8293         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8294         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8295                 checkPoint.nextXidEpoch++;
8296
8297         LWLockAcquire(OidGenLock, LW_SHARED);
8298         checkPoint.nextOid = ShmemVariableCache->nextOid;
8299         if (!shutdown)
8300                 checkPoint.nextOid += ShmemVariableCache->oidCount;
8301         LWLockRelease(OidGenLock);
8302
8303         MultiXactGetCheckptMulti(shutdown,
8304                                                          &checkPoint.nextMulti,
8305                                                          &checkPoint.nextMultiOffset,
8306                                                          &checkPoint.oldestMulti,
8307                                                          &checkPoint.oldestMultiDB);
8308
8309         /*
8310          * Having constructed the checkpoint record, ensure all shmem disk buffers
8311          * and commit-log buffers are flushed to disk.
8312          *
8313          * This I/O could fail for various reasons.  If so, we will fail to
8314          * complete the checkpoint, but there is no reason to force a system
8315          * panic. Accordingly, exit critical section while doing it.
8316          */
8317         END_CRIT_SECTION();
8318
8319         CheckPointGuts(checkPoint.redo, flags);
8320
8321         /*
8322          * Take a snapshot of running transactions and write this to WAL. This
8323          * allows us to reconstruct the state of running transactions during
8324          * archive recovery, if required. Skip, if this info disabled.
8325          *
8326          * If we are shutting down, or Startup process is completing crash
8327          * recovery we don't need to write running xact data.
8328          */
8329         if (!shutdown && XLogStandbyInfoActive())
8330                 LogStandbySnapshot();
8331
8332         START_CRIT_SECTION();
8333
8334         /*
8335          * Now insert the checkpoint record into XLOG.
8336          */
8337         rdata.data = (char *) (&checkPoint);
8338         rdata.len = sizeof(checkPoint);
8339         rdata.buffer = InvalidBuffer;
8340         rdata.next = NULL;
8341
8342         recptr = XLogInsert(RM_XLOG_ID,
8343                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
8344                                                 XLOG_CHECKPOINT_ONLINE,
8345                                                 &rdata);
8346
8347         XLogFlush(recptr);
8348
8349         /*
8350          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
8351          * overwritten at next startup.  No-one should even try, this just allows
8352          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
8353          * to just temporarily disable writing until the system has exited
8354          * recovery.
8355          */
8356         if (shutdown)
8357         {
8358                 if (flags & CHECKPOINT_END_OF_RECOVERY)
8359                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
8360                 else
8361                         LocalXLogInsertAllowed = 0; /* never again write WAL */
8362         }
8363
8364         /*
8365          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
8366          * = end of actual checkpoint record.
8367          */
8368         if (shutdown && checkPoint.redo != ProcLastRecPtr)
8369                 ereport(PANIC,
8370                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
8371
8372         /*
8373          * Select point at which we can truncate the log, which we base on the
8374          * prior checkpoint's earliest info.
8375          */
8376         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8377
8378         /*
8379          * Update the control file.
8380          */
8381         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8382         if (shutdown)
8383                 ControlFile->state = DB_SHUTDOWNED;
8384         ControlFile->prevCheckPoint = ControlFile->checkPoint;
8385         ControlFile->checkPoint = ProcLastRecPtr;
8386         ControlFile->checkPointCopy = checkPoint;
8387         ControlFile->time = (pg_time_t) time(NULL);
8388         /* crash recovery should always recover to the end of WAL */
8389         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
8390         ControlFile->minRecoveryPointTLI = 0;
8391
8392         /*
8393          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
8394          * unused on non-shutdown checkpoints, but seems useful to store it always
8395          * for debugging purposes.
8396          */
8397         SpinLockAcquire(&XLogCtl->ulsn_lck);
8398         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
8399         SpinLockRelease(&XLogCtl->ulsn_lck);
8400
8401         UpdateControlFile();
8402         LWLockRelease(ControlFileLock);
8403
8404         /* Update shared-memory copy of checkpoint XID/epoch */
8405         {
8406                 /* use volatile pointer to prevent code rearrangement */
8407                 volatile XLogCtlData *xlogctl = XLogCtl;
8408
8409                 SpinLockAcquire(&xlogctl->info_lck);
8410                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
8411                 xlogctl->ckptXid = checkPoint.nextXid;
8412                 SpinLockRelease(&xlogctl->info_lck);
8413         }
8414
8415         /*
8416          * We are now done with critical updates; no need for system panic if we
8417          * have trouble while fooling with old log segments.
8418          */
8419         END_CRIT_SECTION();
8420
8421         /*
8422          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
8423          */
8424         smgrpostckpt();
8425
8426         /*
8427          * Delete old log files (those no longer needed even for previous
8428          * checkpoint or the standbys in XLOG streaming).
8429          */
8430         if (_logSegNo)
8431         {
8432                 KeepLogSeg(recptr, &_logSegNo);
8433                 _logSegNo--;
8434                 RemoveOldXlogFiles(_logSegNo, recptr);
8435         }
8436
8437         /*
8438          * Make more log segments if needed.  (Do this after recycling old log
8439          * segments, since that may supply some of the needed files.)
8440          */
8441         if (!shutdown)
8442                 PreallocXlogFiles(recptr);
8443
8444         /*
8445          * Truncate pg_subtrans if possible.  We can throw away all data before
8446          * the oldest XMIN of any running transaction.  No future transaction will
8447          * attempt to reference any pg_subtrans entry older than that (see Asserts
8448          * in subtrans.c).      During recovery, though, we mustn't do this because
8449          * StartupSUBTRANS hasn't been called yet.
8450          */
8451         if (!RecoveryInProgress())
8452                 TruncateSUBTRANS(GetOldestXmin(true, false));
8453
8454         /* Real work is done, but log and update stats before releasing lock. */
8455         LogCheckpointEnd(false);
8456
8457         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
8458                                                                          NBuffers,
8459                                                                          CheckpointStats.ckpt_segs_added,
8460                                                                          CheckpointStats.ckpt_segs_removed,
8461                                                                          CheckpointStats.ckpt_segs_recycled);
8462
8463         LWLockRelease(CheckpointLock);
8464 }
8465
8466 /*
8467  * Mark the end of recovery in WAL though without running a full checkpoint.
8468  * We can expect that a restartpoint is likely to be in progress as we
8469  * do this, though we are unwilling to wait for it to complete. So be
8470  * careful to avoid taking the CheckpointLock anywhere here.
8471  *
8472  * CreateRestartPoint() allows for the case where recovery may end before
8473  * the restartpoint completes so there is no concern of concurrent behaviour.
8474  */
8475 void
8476 CreateEndOfRecoveryRecord(void)
8477 {
8478         xl_end_of_recovery xlrec;
8479         XLogRecData rdata;
8480         XLogRecPtr      recptr;
8481
8482         /* sanity check */
8483         if (!RecoveryInProgress())
8484                 elog(ERROR, "can only be used to end recovery");
8485
8486         xlrec.end_time = time(NULL);
8487
8488         WALInsertSlotAcquire(true);
8489         xlrec.ThisTimeLineID = ThisTimeLineID;
8490         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8491         WALInsertSlotRelease();
8492
8493         LocalSetXLogInsertAllowed();
8494
8495         START_CRIT_SECTION();
8496
8497         rdata.data = (char *) &xlrec;
8498         rdata.len = sizeof(xl_end_of_recovery);
8499         rdata.buffer = InvalidBuffer;
8500         rdata.next = NULL;
8501
8502         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
8503
8504         XLogFlush(recptr);
8505
8506         /*
8507          * Update the control file so that crash recovery can follow the timeline
8508          * changes to this point.
8509          */
8510         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8511         ControlFile->time = (pg_time_t) xlrec.end_time;
8512         ControlFile->minRecoveryPoint = recptr;
8513         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8514         UpdateControlFile();
8515         LWLockRelease(ControlFileLock);
8516
8517         END_CRIT_SECTION();
8518
8519         LocalXLogInsertAllowed = -1;    /* return to "check" state */
8520 }
8521
8522 /*
8523  * Flush all data in shared memory to disk, and fsync
8524  *
8525  * This is the common code shared between regular checkpoints and
8526  * recovery restartpoints.
8527  */
8528 static void
8529 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
8530 {
8531         CheckPointCLOG();
8532         CheckPointSUBTRANS();
8533         CheckPointMultiXact();
8534         CheckPointPredicate();
8535         CheckPointRelationMap();
8536         CheckPointBuffers(flags);       /* performs all required fsyncs */
8537         /* We deliberately delay 2PC checkpointing as long as possible */
8538         CheckPointTwoPhase(checkPointRedo);
8539 }
8540
8541 /*
8542  * Save a checkpoint for recovery restart if appropriate
8543  *
8544  * This function is called each time a checkpoint record is read from XLOG.
8545  * It must determine whether the checkpoint represents a safe restartpoint or
8546  * not.  If so, the checkpoint record is stashed in shared memory so that
8547  * CreateRestartPoint can consult it.  (Note that the latter function is
8548  * executed by the checkpointer, while this one will be executed by the
8549  * startup process.)
8550  */
8551 static void
8552 RecoveryRestartPoint(const CheckPoint *checkPoint)
8553 {
8554         int                     rmid;
8555
8556         /* use volatile pointer to prevent code rearrangement */
8557         volatile XLogCtlData *xlogctl = XLogCtl;
8558
8559         /*
8560          * Is it safe to restartpoint?  We must ask each of the resource managers
8561          * whether they have any partial state information that might prevent a
8562          * correct restart from this point.  If so, we skip this opportunity, but
8563          * return at the next checkpoint record for another try.
8564          */
8565         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
8566         {
8567                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
8568                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
8569                         {
8570                                 elog(trace_recovery(DEBUG2),
8571                                          "RM %d not safe to record restart point at %X/%X",
8572                                          rmid,
8573                                          (uint32) (checkPoint->redo >> 32),
8574                                          (uint32) checkPoint->redo);
8575                                 return;
8576                         }
8577         }
8578
8579         /*
8580          * Also refrain from creating a restartpoint if we have seen any
8581          * references to non-existent pages. Restarting recovery from the
8582          * restartpoint would not see the references, so we would lose the
8583          * cross-check that the pages belonged to a relation that was dropped
8584          * later.
8585          */
8586         if (XLogHaveInvalidPages())
8587         {
8588                 elog(trace_recovery(DEBUG2),
8589                          "could not record restart point at %X/%X because there "
8590                          "are unresolved references to invalid pages",
8591                          (uint32) (checkPoint->redo >> 32),
8592                          (uint32) checkPoint->redo);
8593                 return;
8594         }
8595
8596         /*
8597          * Copy the checkpoint record to shared memory, so that checkpointer can
8598          * work out the next time it wants to perform a restartpoint.
8599          */
8600         SpinLockAcquire(&xlogctl->info_lck);
8601         xlogctl->lastCheckPointRecPtr = ReadRecPtr;
8602         xlogctl->lastCheckPoint = *checkPoint;
8603         SpinLockRelease(&xlogctl->info_lck);
8604 }
8605
8606 /*
8607  * Establish a restartpoint if possible.
8608  *
8609  * This is similar to CreateCheckPoint, but is used during WAL recovery
8610  * to establish a point from which recovery can roll forward without
8611  * replaying the entire recovery log.
8612  *
8613  * Returns true if a new restartpoint was established. We can only establish
8614  * a restartpoint if we have replayed a safe checkpoint record since last
8615  * restartpoint.
8616  */
8617 bool
8618 CreateRestartPoint(int flags)
8619 {
8620         XLogRecPtr      lastCheckPointRecPtr;
8621         CheckPoint      lastCheckPoint;
8622         XLogSegNo       _logSegNo;
8623         TimestampTz xtime;
8624
8625         /* use volatile pointer to prevent code rearrangement */
8626         volatile XLogCtlData *xlogctl = XLogCtl;
8627
8628         /*
8629          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
8630          * happens at a time.
8631          */
8632         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8633
8634         /* Get a local copy of the last safe checkpoint record. */
8635         SpinLockAcquire(&xlogctl->info_lck);
8636         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
8637         lastCheckPoint = xlogctl->lastCheckPoint;
8638         SpinLockRelease(&xlogctl->info_lck);
8639
8640         /*
8641          * Check that we're still in recovery mode. It's ok if we exit recovery
8642          * mode after this check, the restart point is valid anyway.
8643          */
8644         if (!RecoveryInProgress())
8645         {
8646                 ereport(DEBUG2,
8647                           (errmsg("skipping restartpoint, recovery has already ended")));
8648                 LWLockRelease(CheckpointLock);
8649                 return false;
8650         }
8651
8652         /*
8653          * If the last checkpoint record we've replayed is already our last
8654          * restartpoint, we can't perform a new restart point. We still update
8655          * minRecoveryPoint in that case, so that if this is a shutdown restart
8656          * point, we won't start up earlier than before. That's not strictly
8657          * necessary, but when hot standby is enabled, it would be rather weird if
8658          * the database opened up for read-only connections at a point-in-time
8659          * before the last shutdown. Such time travel is still possible in case of
8660          * immediate shutdown, though.
8661          *
8662          * We don't explicitly advance minRecoveryPoint when we do create a
8663          * restartpoint. It's assumed that flushing the buffers will do that as a
8664          * side-effect.
8665          */
8666         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
8667                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
8668         {
8669                 ereport(DEBUG2,
8670                                 (errmsg("skipping restartpoint, already performed at %X/%X",
8671                                                 (uint32) (lastCheckPoint.redo >> 32),
8672                                                 (uint32) lastCheckPoint.redo)));
8673
8674                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
8675                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8676                 {
8677                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8678                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8679                         ControlFile->time = (pg_time_t) time(NULL);
8680                         UpdateControlFile();
8681                         LWLockRelease(ControlFileLock);
8682                 }
8683                 LWLockRelease(CheckpointLock);
8684                 return false;
8685         }
8686
8687         /*
8688          * Update the shared RedoRecPtr so that the startup process can calculate
8689          * the number of segments replayed since last restartpoint, and request a
8690          * restartpoint if it exceeds checkpoint_segments.
8691          *
8692          * Like in CreateCheckPoint(), hold off insertions to update it, although
8693          * during recovery this is just pro forma, because no WAL insertions are
8694          * happening.
8695          */
8696         WALInsertSlotAcquire(true);
8697         xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
8698         WALInsertSlotRelease();
8699
8700         /* Also update the info_lck-protected copy */
8701         SpinLockAcquire(&xlogctl->info_lck);
8702         xlogctl->RedoRecPtr = lastCheckPoint.redo;
8703         SpinLockRelease(&xlogctl->info_lck);
8704
8705         /*
8706          * Prepare to accumulate statistics.
8707          *
8708          * Note: because it is possible for log_checkpoints to change while a
8709          * checkpoint proceeds, we always accumulate stats, even if
8710          * log_checkpoints is currently off.
8711          */
8712         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8713         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8714
8715         if (log_checkpoints)
8716                 LogCheckpointStart(flags, true);
8717
8718         CheckPointGuts(lastCheckPoint.redo, flags);
8719
8720         /*
8721          * Select point at which we can truncate the xlog, which we base on the
8722          * prior checkpoint's earliest info.
8723          */
8724         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8725
8726         /*
8727          * Update pg_control, using current time.  Check that it still shows
8728          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
8729          * this is a quick hack to make sure nothing really bad happens if somehow
8730          * we get here after the end-of-recovery checkpoint.
8731          */
8732         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8733         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
8734                 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
8735         {
8736                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
8737                 ControlFile->checkPoint = lastCheckPointRecPtr;
8738                 ControlFile->checkPointCopy = lastCheckPoint;
8739                 ControlFile->time = (pg_time_t) time(NULL);
8740                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8741                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8742                 UpdateControlFile();
8743         }
8744         LWLockRelease(ControlFileLock);
8745
8746         /*
8747          * Due to an historical accident multixact truncations are not WAL-logged,
8748          * but just performed every time the mxact horizon is increased. So, unless
8749          * we explicitly execute truncations on a standby it will never clean out
8750          * /pg_multixact which obviously is bad, both because it uses space and
8751          * because we can wrap around into pre-existing data...
8752          *
8753          * We can only do the truncation here, after the UpdateControlFile()
8754          * above, because we've now safely established a restart point, that
8755          * guarantees we will not need to access those multis.
8756          *
8757          * It's probably worth improving this.
8758          */
8759         TruncateMultiXact(lastCheckPoint.oldestMulti);
8760
8761         /*
8762          * Delete old log files (those no longer needed even for previous
8763          * checkpoint/restartpoint) to prevent the disk holding the xlog from
8764          * growing full.
8765          */
8766         if (_logSegNo)
8767         {
8768                 XLogRecPtr      receivePtr;
8769                 XLogRecPtr      replayPtr;
8770                 TimeLineID      replayTLI;
8771                 XLogRecPtr      endptr;
8772
8773                 /*
8774                  * Get the current end of xlog replayed or received, whichever is
8775                  * later.
8776                  */
8777                 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
8778                 replayPtr = GetXLogReplayRecPtr(&replayTLI);
8779                 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
8780
8781                 KeepLogSeg(endptr, &_logSegNo);
8782                 _logSegNo--;
8783
8784                 /*
8785                  * Try to recycle segments on a useful timeline. If we've been promoted
8786                  * since the beginning of this restartpoint, use the new timeline
8787                  * chosen at end of recovery (RecoveryInProgress() sets ThisTimeLineID
8788                  * in that case). If we're still in recovery, use the timeline we're
8789                  * currently replaying.
8790                  *
8791                  * There is no guarantee that the WAL segments will be useful on the
8792                  * current timeline; if recovery proceeds to a new timeline right
8793                  * after this, the pre-allocated WAL segments on this timeline will
8794                  * not be used, and will go wasted until recycled on the next
8795                  * restartpoint. We'll live with that.
8796                  */
8797                 if (RecoveryInProgress())
8798                         ThisTimeLineID = replayTLI;
8799
8800                 RemoveOldXlogFiles(_logSegNo, endptr);
8801
8802                 /*
8803                  * Make more log segments if needed.  (Do this after recycling old log
8804                  * segments, since that may supply some of the needed files.)
8805                  */
8806                 PreallocXlogFiles(endptr);
8807
8808                 /*
8809                  * ThisTimeLineID is normally not set when we're still in recovery.
8810                  * However, recycling/preallocating segments above needed
8811                  * ThisTimeLineID to determine which timeline to install the segments
8812                  * on. Reset it now, to restore the normal state of affairs for
8813                  * debugging purposes.
8814                  */
8815                 if (RecoveryInProgress())
8816                         ThisTimeLineID = 0;
8817         }
8818
8819         /*
8820          * Truncate pg_subtrans if possible.  We can throw away all data before
8821          * the oldest XMIN of any running transaction.  No future transaction will
8822          * attempt to reference any pg_subtrans entry older than that (see Asserts
8823          * in subtrans.c).      When hot standby is disabled, though, we mustn't do
8824          * this because StartupSUBTRANS hasn't been called yet.
8825          */
8826         if (EnableHotStandby)
8827                 TruncateSUBTRANS(GetOldestXmin(true, false));
8828
8829         /* Real work is done, but log and update before releasing lock. */
8830         LogCheckpointEnd(true);
8831
8832         xtime = GetLatestXTime();
8833         ereport((log_checkpoints ? LOG : DEBUG2),
8834                         (errmsg("recovery restart point at %X/%X",
8835                  (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
8836                    xtime ? errdetail("last completed transaction was at log time %s",
8837                                                          timestamptz_to_str(xtime)) : 0));
8838
8839         LWLockRelease(CheckpointLock);
8840
8841         /*
8842          * Finally, execute archive_cleanup_command, if any.
8843          */
8844         if (XLogCtl->archiveCleanupCommand[0])
8845                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
8846                                                            "archive_cleanup_command",
8847                                                            false);
8848
8849         return true;
8850 }
8851
8852 /*
8853  * Retreat *logSegNo to the last segment that we need to retain because of
8854  * wal_keep_segments. This is calculated by subtracting wal_keep_segments
8855  * from the given xlog location, recptr.
8856  */
8857 static void
8858 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
8859 {
8860         XLogSegNo       segno;
8861
8862         if (wal_keep_segments == 0)
8863                 return;
8864
8865         XLByteToSeg(recptr, segno);
8866
8867         /* avoid underflow, don't go below 1 */
8868         if (segno <= wal_keep_segments)
8869                 segno = 1;
8870         else
8871                 segno = segno - wal_keep_segments;
8872
8873         /* don't delete WAL segments newer than the calculated segment */
8874         if (segno < *logSegNo)
8875                 *logSegNo = segno;
8876 }
8877
8878 /*
8879  * Write a NEXTOID log record
8880  */
8881 void
8882 XLogPutNextOid(Oid nextOid)
8883 {
8884         XLogRecData rdata;
8885
8886         rdata.data = (char *) (&nextOid);
8887         rdata.len = sizeof(Oid);
8888         rdata.buffer = InvalidBuffer;
8889         rdata.next = NULL;
8890         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
8891
8892         /*
8893          * We need not flush the NEXTOID record immediately, because any of the
8894          * just-allocated OIDs could only reach disk as part of a tuple insert or
8895          * update that would have its own XLOG record that must follow the NEXTOID
8896          * record.      Therefore, the standard buffer LSN interlock applied to those
8897          * records will ensure no such OID reaches disk before the NEXTOID record
8898          * does.
8899          *
8900          * Note, however, that the above statement only covers state "within" the
8901          * database.  When we use a generated OID as a file or directory name, we
8902          * are in a sense violating the basic WAL rule, because that filesystem
8903          * change may reach disk before the NEXTOID WAL record does.  The impact
8904          * of this is that if a database crash occurs immediately afterward, we
8905          * might after restart re-generate the same OID and find that it conflicts
8906          * with the leftover file or directory.  But since for safety's sake we
8907          * always loop until finding a nonconflicting filename, this poses no real
8908          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
8909          */
8910 }
8911
8912 /*
8913  * Write an XLOG SWITCH record.
8914  *
8915  * Here we just blindly issue an XLogInsert request for the record.
8916  * All the magic happens inside XLogInsert.
8917  *
8918  * The return value is either the end+1 address of the switch record,
8919  * or the end+1 address of the prior segment if we did not need to
8920  * write a switch record because we are already at segment start.
8921  */
8922 XLogRecPtr
8923 RequestXLogSwitch(void)
8924 {
8925         XLogRecPtr      RecPtr;
8926         XLogRecData rdata;
8927
8928         /* XLOG SWITCH, alone among xlog record types, has no data */
8929         rdata.buffer = InvalidBuffer;
8930         rdata.data = NULL;
8931         rdata.len = 0;
8932         rdata.next = NULL;
8933
8934         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
8935
8936         return RecPtr;
8937 }
8938
8939 /*
8940  * Write a RESTORE POINT record
8941  */
8942 XLogRecPtr
8943 XLogRestorePoint(const char *rpName)
8944 {
8945         XLogRecPtr      RecPtr;
8946         XLogRecData rdata;
8947         xl_restore_point xlrec;
8948
8949         xlrec.rp_time = GetCurrentTimestamp();
8950         strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
8951
8952         rdata.buffer = InvalidBuffer;
8953         rdata.data = (char *) &xlrec;
8954         rdata.len = sizeof(xl_restore_point);
8955         rdata.next = NULL;
8956
8957         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
8958
8959         ereport(LOG,
8960                         (errmsg("restore point \"%s\" created at %X/%X",
8961                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
8962
8963         return RecPtr;
8964 }
8965
8966 /*
8967  * Write a backup block if needed when we are setting a hint. Note that
8968  * this may be called for a variety of page types, not just heaps.
8969  *
8970  * Callable while holding just share lock on the buffer content.
8971  *
8972  * We can't use the plain backup block mechanism since that relies on the
8973  * Buffer being exclusively locked. Since some modifications (setting LSN, hint
8974  * bits) are allowed in a sharelocked buffer that can lead to wal checksum
8975  * failures. So instead we copy the page and insert the copied data as normal
8976  * record data.
8977  *
8978  * We only need to do something if page has not yet been full page written in
8979  * this checkpoint round. The LSN of the inserted wal record is returned if we
8980  * had to write, InvalidXLogRecPtr otherwise.
8981  *
8982  * It is possible that multiple concurrent backends could attempt to write WAL
8983  * records. In that case, multiple copies of the same block would be recorded
8984  * in separate WAL records by different backends, though that is still OK from
8985  * a correctness perspective.
8986  */
8987 XLogRecPtr
8988 XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
8989 {
8990         XLogRecPtr      recptr = InvalidXLogRecPtr;
8991         XLogRecPtr      lsn;
8992         XLogRecData rdata[2];
8993         BkpBlock        bkpb;
8994
8995         /*
8996          * Ensure no checkpoint can change our view of RedoRecPtr.
8997          */
8998         Assert(MyPgXact->delayChkpt);
8999
9000         /*
9001          * Update RedoRecPtr so XLogCheckBuffer can make the right decision
9002          */
9003         GetRedoRecPtr();
9004
9005         /*
9006          * Setup phony rdata element for use within XLogCheckBuffer only. We reuse
9007          * and reset rdata for any actual WAL record insert.
9008          */
9009         rdata[0].buffer = buffer;
9010         rdata[0].buffer_std = buffer_std;
9011
9012         /*
9013          * Check buffer while not holding an exclusive lock.
9014          */
9015         if (XLogCheckBuffer(rdata, false, &lsn, &bkpb))
9016         {
9017                 char            copied_buffer[BLCKSZ];
9018                 char       *origdata = (char *) BufferGetBlock(buffer);
9019
9020                 /*
9021                  * Copy buffer so we don't have to worry about concurrent hint bit or
9022                  * lsn updates. We assume pd_lower/upper cannot be changed without an
9023                  * exclusive lock, so the contents bkp are not racy.
9024                  *
9025                  * With buffer_std set to false, XLogCheckBuffer() sets hole_length and
9026                  * hole_offset to 0; so the following code is safe for either case.
9027                  */
9028                 memcpy(copied_buffer, origdata, bkpb.hole_offset);
9029                 memcpy(copied_buffer + bkpb.hole_offset,
9030                            origdata + bkpb.hole_offset + bkpb.hole_length,
9031                            BLCKSZ - bkpb.hole_offset - bkpb.hole_length);
9032
9033                 /*
9034                  * Header for backup block.
9035                  */
9036                 rdata[0].data = (char *) &bkpb;
9037                 rdata[0].len = sizeof(BkpBlock);
9038                 rdata[0].buffer = InvalidBuffer;
9039                 rdata[0].next = &(rdata[1]);
9040
9041                 /*
9042                  * Save copy of the buffer.
9043                  */
9044                 rdata[1].data = copied_buffer;
9045                 rdata[1].len = BLCKSZ - bkpb.hole_length;
9046                 rdata[1].buffer = InvalidBuffer;
9047                 rdata[1].next = NULL;
9048
9049                 recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI, rdata);
9050         }
9051
9052         return recptr;
9053 }
9054
9055 /*
9056  * Check if any of the GUC parameters that are critical for hot standby
9057  * have changed, and update the value in pg_control file if necessary.
9058  */
9059 static void
9060 XLogReportParameters(void)
9061 {
9062         if (wal_level != ControlFile->wal_level ||
9063                 wal_log_hints != ControlFile->wal_log_hints ||
9064                 MaxConnections != ControlFile->MaxConnections ||
9065                 max_worker_processes != ControlFile->max_worker_processes ||
9066                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
9067                 max_locks_per_xact != ControlFile->max_locks_per_xact)
9068         {
9069                 /*
9070                  * The change in number of backend slots doesn't need to be WAL-logged
9071                  * if archiving is not enabled, as you can't start archive recovery
9072                  * with wal_level=minimal anyway. We don't really care about the
9073                  * values in pg_control either if wal_level=minimal, but seems better
9074                  * to keep them up-to-date to avoid confusion.
9075                  */
9076                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
9077                 {
9078                         XLogRecData rdata;
9079                         xl_parameter_change xlrec;
9080
9081                         xlrec.MaxConnections = MaxConnections;
9082                         xlrec.max_worker_processes = max_worker_processes;
9083                         xlrec.max_prepared_xacts = max_prepared_xacts;
9084                         xlrec.max_locks_per_xact = max_locks_per_xact;
9085                         xlrec.wal_level = wal_level;
9086                         xlrec.wal_log_hints = wal_log_hints;
9087
9088                         rdata.buffer = InvalidBuffer;
9089                         rdata.data = (char *) &xlrec;
9090                         rdata.len = sizeof(xlrec);
9091                         rdata.next = NULL;
9092
9093                         XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
9094                 }
9095
9096                 ControlFile->MaxConnections = MaxConnections;
9097                 ControlFile->max_worker_processes = max_worker_processes;
9098                 ControlFile->max_prepared_xacts = max_prepared_xacts;
9099                 ControlFile->max_locks_per_xact = max_locks_per_xact;
9100                 ControlFile->wal_level = wal_level;
9101                 ControlFile->wal_log_hints = wal_log_hints;
9102                 UpdateControlFile();
9103         }
9104 }
9105
9106 /*
9107  * Update full_page_writes in shared memory, and write an
9108  * XLOG_FPW_CHANGE record if necessary.
9109  *
9110  * Note: this function assumes there is no other process running
9111  * concurrently that could update it.
9112  */
9113 void
9114 UpdateFullPageWrites(void)
9115 {
9116         XLogCtlInsert *Insert = &XLogCtl->Insert;
9117
9118         /*
9119          * Do nothing if full_page_writes has not been changed.
9120          *
9121          * It's safe to check the shared full_page_writes without the lock,
9122          * because we assume that there is no concurrently running process which
9123          * can update it.
9124          */
9125         if (fullPageWrites == Insert->fullPageWrites)
9126                 return;
9127
9128         START_CRIT_SECTION();
9129
9130         /*
9131          * It's always safe to take full page images, even when not strictly
9132          * required, but not the other round. So if we're setting full_page_writes
9133          * to true, first set it true and then write the WAL record. If we're
9134          * setting it to false, first write the WAL record and then set the global
9135          * flag.
9136          */
9137         if (fullPageWrites)
9138         {
9139                 WALInsertSlotAcquire(true);
9140                 Insert->fullPageWrites = true;
9141                 WALInsertSlotRelease();
9142         }
9143
9144         /*
9145          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
9146          * full_page_writes during archive recovery, if required.
9147          */
9148         if (XLogStandbyInfoActive() && !RecoveryInProgress())
9149         {
9150                 XLogRecData rdata;
9151
9152                 rdata.data = (char *) (&fullPageWrites);
9153                 rdata.len = sizeof(bool);
9154                 rdata.buffer = InvalidBuffer;
9155                 rdata.next = NULL;
9156
9157                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
9158         }
9159
9160         if (!fullPageWrites)
9161         {
9162                 WALInsertSlotAcquire(true);
9163                 Insert->fullPageWrites = false;
9164                 WALInsertSlotRelease();
9165         }
9166         END_CRIT_SECTION();
9167 }
9168
9169 /*
9170  * Check that it's OK to switch to new timeline during recovery.
9171  *
9172  * 'lsn' is the address of the shutdown checkpoint record we're about to
9173  * replay. (Currently, timeline can only change at a shutdown checkpoint).
9174  */
9175 static void
9176 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9177 {
9178         /* Check that the record agrees on what the current (old) timeline is */
9179         if (prevTLI != ThisTimeLineID)
9180                 ereport(PANIC,
9181                                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9182                                                 prevTLI, ThisTimeLineID)));
9183
9184         /*
9185          * The new timeline better be in the list of timelines we expect to see,
9186          * according to the timeline history. It should also not decrease.
9187          */
9188         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9189                 ereport(PANIC,
9190                  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9191                                  newTLI, ThisTimeLineID)));
9192
9193         /*
9194          * If we have not yet reached min recovery point, and we're about to
9195          * switch to a timeline greater than the timeline of the min recovery
9196          * point: trouble. After switching to the new timeline, we could not
9197          * possibly visit the min recovery point on the correct timeline anymore.
9198          * This can happen if there is a newer timeline in the archive that
9199          * branched before the timeline the min recovery point is on, and you
9200          * attempt to do PITR to the new timeline.
9201          */
9202         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9203                 lsn < minRecoveryPoint &&
9204                 newTLI > minRecoveryPointTLI)
9205                 ereport(PANIC,
9206                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9207                                                 newTLI,
9208                                                 (uint32) (minRecoveryPoint >> 32),
9209                                                 (uint32) minRecoveryPoint,
9210                                                 minRecoveryPointTLI)));
9211
9212         /* Looks good */
9213 }
9214
9215 /*
9216  * XLOG resource manager's routines
9217  *
9218  * Definitions of info values are in include/catalog/pg_control.h, though
9219  * not all record types are related to control file updates.
9220  */
9221 void
9222 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
9223 {
9224         uint8           info = record->xl_info & ~XLR_INFO_MASK;
9225
9226         /* Backup blocks are not used by XLOG rmgr */
9227         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
9228
9229         if (info == XLOG_NEXTOID)
9230         {
9231                 Oid                     nextOid;
9232
9233                 /*
9234                  * We used to try to take the maximum of ShmemVariableCache->nextOid
9235                  * and the recorded nextOid, but that fails if the OID counter wraps
9236                  * around.      Since no OID allocation should be happening during replay
9237                  * anyway, better to just believe the record exactly.  We still take
9238                  * OidGenLock while setting the variable, just in case.
9239                  */
9240                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9241                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9242                 ShmemVariableCache->nextOid = nextOid;
9243                 ShmemVariableCache->oidCount = 0;
9244                 LWLockRelease(OidGenLock);
9245         }
9246         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9247         {
9248                 CheckPoint      checkPoint;
9249
9250                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9251                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
9252                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9253                 ShmemVariableCache->nextXid = checkPoint.nextXid;
9254                 LWLockRelease(XidGenLock);
9255                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9256                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9257                 ShmemVariableCache->oidCount = 0;
9258                 LWLockRelease(OidGenLock);
9259                 MultiXactSetNextMXact(checkPoint.nextMulti,
9260                                                           checkPoint.nextMultiOffset);
9261                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9262                 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
9263
9264                 /*
9265                  * If we see a shutdown checkpoint while waiting for an end-of-backup
9266                  * record, the backup was canceled and the end-of-backup record will
9267                  * never arrive.
9268                  */
9269                 if (ArchiveRecoveryRequested &&
9270                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9271                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9272                         ereport(PANIC,
9273                         (errmsg("online backup was canceled, recovery cannot continue")));
9274
9275                 /*
9276                  * If we see a shutdown checkpoint, we know that nothing was running
9277                  * on the master at this point. So fake-up an empty running-xacts
9278                  * record and use that here and now. Recover additional standby state
9279                  * for prepared transactions.
9280                  */
9281                 if (standbyState >= STANDBY_INITIALIZED)
9282                 {
9283                         TransactionId *xids;
9284                         int                     nxids;
9285                         TransactionId oldestActiveXID;
9286                         TransactionId latestCompletedXid;
9287                         RunningTransactionsData running;
9288
9289                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9290
9291                         /*
9292                          * Construct a RunningTransactions snapshot representing a shut
9293                          * down server, with only prepared transactions still alive. We're
9294                          * never overflowed at this point because all subxids are listed
9295                          * with their parent prepared transactions.
9296                          */
9297                         running.xcnt = nxids;
9298                         running.subxcnt = 0;
9299                         running.subxid_overflow = false;
9300                         running.nextXid = checkPoint.nextXid;
9301                         running.oldestRunningXid = oldestActiveXID;
9302                         latestCompletedXid = checkPoint.nextXid;
9303                         TransactionIdRetreat(latestCompletedXid);
9304                         Assert(TransactionIdIsNormal(latestCompletedXid));
9305                         running.latestCompletedXid = latestCompletedXid;
9306                         running.xids = xids;
9307
9308                         ProcArrayApplyRecoveryInfo(&running);
9309
9310                         StandbyRecoverPreparedTransactions(true);
9311                 }
9312
9313                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9314                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9315                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9316
9317                 /* Update shared-memory copy of checkpoint XID/epoch */
9318                 {
9319                         /* use volatile pointer to prevent code rearrangement */
9320                         volatile XLogCtlData *xlogctl = XLogCtl;
9321
9322                         SpinLockAcquire(&xlogctl->info_lck);
9323                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9324                         xlogctl->ckptXid = checkPoint.nextXid;
9325                         SpinLockRelease(&xlogctl->info_lck);
9326                 }
9327
9328                 /*
9329                  * We should've already switched to the new TLI before replaying this
9330                  * record.
9331                  */
9332                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9333                         ereport(PANIC,
9334                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9335                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9336
9337                 RecoveryRestartPoint(&checkPoint);
9338         }
9339         else if (info == XLOG_CHECKPOINT_ONLINE)
9340         {
9341                 CheckPoint      checkPoint;
9342
9343                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9344                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
9345                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9346                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
9347                                                                   checkPoint.nextXid))
9348                         ShmemVariableCache->nextXid = checkPoint.nextXid;
9349                 LWLockRelease(XidGenLock);
9350                 /* ... but still treat OID counter as exact */
9351                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9352                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9353                 ShmemVariableCache->oidCount = 0;
9354                 LWLockRelease(OidGenLock);
9355                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9356                                                                   checkPoint.nextMultiOffset);
9357                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9358                                                                   checkPoint.oldestXid))
9359                         SetTransactionIdLimit(checkPoint.oldestXid,
9360                                                                   checkPoint.oldestXidDB);
9361                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9362                                                            checkPoint.oldestMultiDB);
9363
9364                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9365                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9366                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9367
9368                 /* Update shared-memory copy of checkpoint XID/epoch */
9369                 {
9370                         /* use volatile pointer to prevent code rearrangement */
9371                         volatile XLogCtlData *xlogctl = XLogCtl;
9372
9373                         SpinLockAcquire(&xlogctl->info_lck);
9374                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9375                         xlogctl->ckptXid = checkPoint.nextXid;
9376                         SpinLockRelease(&xlogctl->info_lck);
9377                 }
9378
9379                 /* TLI should not change in an on-line checkpoint */
9380                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9381                         ereport(PANIC,
9382                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9383                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9384
9385                 RecoveryRestartPoint(&checkPoint);
9386         }
9387         else if (info == XLOG_END_OF_RECOVERY)
9388         {
9389                 xl_end_of_recovery xlrec;
9390
9391                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9392
9393                 /*
9394                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
9395                  * but this case is rarer and harder to test, so the benefit doesn't
9396                  * outweigh the potential extra cost of maintenance.
9397                  */
9398
9399                 /*
9400                  * We should've already switched to the new TLI before replaying this
9401                  * record.
9402                  */
9403                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
9404                         ereport(PANIC,
9405                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9406                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
9407         }
9408         else if (info == XLOG_NOOP)
9409         {
9410                 /* nothing to do here */
9411         }
9412         else if (info == XLOG_SWITCH)
9413         {
9414                 /* nothing to do here */
9415         }
9416         else if (info == XLOG_RESTORE_POINT)
9417         {
9418                 /* nothing to do here */
9419         }
9420         else if (info == XLOG_FPI)
9421         {
9422                 char       *data;
9423                 BkpBlock        bkpb;
9424
9425                 /*
9426                  * Full-page image (FPI) records contain a backup block stored "inline"
9427                  * in the normal data since the locking when writing hint records isn't
9428                  * sufficient to use the normal backup block mechanism, which assumes
9429                  * exclusive lock on the buffer supplied.
9430                  *
9431                  * Since the only change in these backup block are hint bits, there
9432                  * are no recovery conflicts generated.
9433                  *
9434                  * This also means there is no corresponding API call for this, so an
9435                  * smgr implementation has no need to implement anything. Which means
9436                  * nothing is needed in md.c etc
9437                  */
9438                 data = XLogRecGetData(record);
9439                 memcpy(&bkpb, data, sizeof(BkpBlock));
9440                 data += sizeof(BkpBlock);
9441
9442                 RestoreBackupBlockContents(lsn, bkpb, data, false, false);
9443         }
9444         else if (info == XLOG_BACKUP_END)
9445         {
9446                 XLogRecPtr      startpoint;
9447
9448                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
9449
9450                 if (ControlFile->backupStartPoint == startpoint)
9451                 {
9452                         /*
9453                          * We have reached the end of base backup, the point where
9454                          * pg_stop_backup() was done. The data on disk is now consistent.
9455                          * Reset backupStartPoint, and update minRecoveryPoint to make
9456                          * sure we don't allow starting up at an earlier point even if
9457                          * recovery is stopped and restarted soon after this.
9458                          */
9459                         elog(DEBUG1, "end of backup reached");
9460
9461                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9462
9463                         if (ControlFile->minRecoveryPoint < lsn)
9464                         {
9465                                 ControlFile->minRecoveryPoint = lsn;
9466                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9467                         }
9468                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
9469                         ControlFile->backupEndRequired = false;
9470                         UpdateControlFile();
9471
9472                         LWLockRelease(ControlFileLock);
9473                 }
9474         }
9475         else if (info == XLOG_PARAMETER_CHANGE)
9476         {
9477                 xl_parameter_change xlrec;
9478
9479                 /* Update our copy of the parameters in pg_control */
9480                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
9481
9482                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9483                 ControlFile->MaxConnections = xlrec.MaxConnections;
9484                 ControlFile->max_worker_processes = xlrec.max_worker_processes;
9485                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
9486                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
9487                 ControlFile->wal_level = xlrec.wal_level;
9488                 ControlFile->wal_log_hints = wal_log_hints;
9489
9490                 /*
9491                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
9492                  * recover back up to this point before allowing hot standby again.
9493                  * This is particularly important if wal_level was set to 'archive'
9494                  * before, and is now 'hot_standby', to ensure you don't run queries
9495                  * against the WAL preceding the wal_level change. Same applies to
9496                  * decreasing max_* settings.
9497                  */
9498                 minRecoveryPoint = ControlFile->minRecoveryPoint;
9499                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9500                 if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
9501                 {
9502                         ControlFile->minRecoveryPoint = lsn;
9503                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9504                 }
9505
9506                 UpdateControlFile();
9507                 LWLockRelease(ControlFileLock);
9508
9509                 /* Check to see if any changes to max_connections give problems */
9510                 CheckRequiredParameterValues();
9511         }
9512         else if (info == XLOG_FPW_CHANGE)
9513         {
9514                 /* use volatile pointer to prevent code rearrangement */
9515                 volatile XLogCtlData *xlogctl = XLogCtl;
9516                 bool            fpw;
9517
9518                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
9519
9520                 /*
9521                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
9522                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
9523                  * full_page_writes has been disabled during online backup.
9524                  */
9525                 if (!fpw)
9526                 {
9527                         SpinLockAcquire(&xlogctl->info_lck);
9528                         if (xlogctl->lastFpwDisableRecPtr < ReadRecPtr)
9529                                 xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
9530                         SpinLockRelease(&xlogctl->info_lck);
9531                 }
9532
9533                 /* Keep track of full_page_writes */
9534                 lastFullPageWrites = fpw;
9535         }
9536 }
9537
#ifdef WAL_DEBUG

/*
 * Append a short description of a WAL record's header to 'buf'.
 *
 * The text consists of the previous-record pointer, the xid, the record
 * length, a marker for every backup block flagged in xl_info, and finally
 * the name of the resource manager the record belongs to.
 *
 * Only compiled when WAL_DEBUG is defined.
 */
static void
xlog_outrec(StringInfo buf, XLogRecord *record)
{
	int			blk;

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 (uint32) (record->xl_prev >> 32),
					 (uint32) record->xl_prev,
					 record->xl_xid);
	appendStringInfo(buf, "; len %u", record->xl_len);

	/* One "bkpbN" marker per backup block slot that is set in xl_info */
	for (blk = 0; blk < XLR_MAX_BKP_BLOCKS; blk++)
	{
		if ((record->xl_info & XLR_BKP_BLOCK(blk)) == 0)
			continue;

		appendStringInfo(buf, "; bkpb%d", blk);
	}

	appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
}
#endif   /* WAL_DEBUG */
9562
9563
9564 /*
9565  * Return the (possible) sync flag used for opening a file, depending on the
9566  * value of the GUC wal_sync_method.
9567  */
9568 static int
9569 get_sync_bit(int method)
9570 {
9571         int                     o_direct_flag = 0;
9572
9573         /* If fsync is disabled, never open in sync mode */
9574         if (!enableFsync)
9575                 return 0;
9576
9577         /*
9578          * Optimize writes by bypassing kernel cache with O_DIRECT when using
9579          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
9580          * disabled, otherwise the archive command or walsender process will read
9581          * the WAL soon after writing it, which is guaranteed to cause a physical
9582          * read if we bypassed the kernel cache. We also skip the
9583          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
9584          * reason.
9585          *
9586          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
9587          * written by walreceiver is normally read by the startup process soon
9588          * after its written. Also, walreceiver performs unaligned writes, which
9589          * don't work with O_DIRECT, so it is required for correctness too.
9590          */
9591         if (!XLogIsNeeded() && !AmWalReceiverProcess())
9592                 o_direct_flag = PG_O_DIRECT;
9593
9594         switch (method)
9595         {
9596                         /*
9597                          * enum values for all sync options are defined even if they are
9598                          * not supported on the current platform.  But if not, they are
9599                          * not included in the enum option array, and therefore will never
9600                          * be seen here.
9601                          */
9602                 case SYNC_METHOD_FSYNC:
9603                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9604                 case SYNC_METHOD_FDATASYNC:
9605                         return 0;
9606 #ifdef OPEN_SYNC_FLAG
9607                 case SYNC_METHOD_OPEN:
9608                         return OPEN_SYNC_FLAG | o_direct_flag;
9609 #endif
9610 #ifdef OPEN_DATASYNC_FLAG
9611                 case SYNC_METHOD_OPEN_DSYNC:
9612                         return OPEN_DATASYNC_FLAG | o_direct_flag;
9613 #endif
9614                 default:
9615                         /* can't happen (unless we are out of sync with option array) */
9616                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
9617                         return 0;                       /* silence warning */
9618         }
9619 }
9620
9621 /*
9622  * GUC support
9623  */
9624 void
9625 assign_xlog_sync_method(int new_sync_method, void *extra)
9626 {
9627         if (sync_method != new_sync_method)
9628         {
9629                 /*
9630                  * To ensure that no blocks escape unsynced, force an fsync on the
9631                  * currently open log segment (if any).  Also, if the open flag is
9632                  * changing, close the log file so it will be reopened (with new flag
9633                  * bit) at next use.
9634                  */
9635                 if (openLogFile >= 0)
9636                 {
9637                         if (pg_fsync(openLogFile) != 0)
9638                                 ereport(PANIC,
9639                                                 (errcode_for_file_access(),
9640                                                  errmsg("could not fsync log segment %s: %m",
9641                                                           XLogFileNameP(ThisTimeLineID, openLogSegNo))));
9642                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
9643                                 XLogFileClose();
9644                 }
9645         }
9646 }
9647
9648
9649 /*
9650  * Issue appropriate kind of fsync (if any) for an XLOG output file.
9651  *
9652  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
9653  * 'log' and 'seg' are for error reporting purposes.
9654  */
9655 void
9656 issue_xlog_fsync(int fd, XLogSegNo segno)
9657 {
9658         switch (sync_method)
9659         {
9660                 case SYNC_METHOD_FSYNC:
9661                         if (pg_fsync_no_writethrough(fd) != 0)
9662                                 ereport(PANIC,
9663                                                 (errcode_for_file_access(),
9664                                                  errmsg("could not fsync log file %s: %m",
9665                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9666                         break;
9667 #ifdef HAVE_FSYNC_WRITETHROUGH
9668                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9669                         if (pg_fsync_writethrough(fd) != 0)
9670                                 ereport(PANIC,
9671                                                 (errcode_for_file_access(),
9672                                           errmsg("could not fsync write-through log file %s: %m",
9673                                                          XLogFileNameP(ThisTimeLineID, segno))));
9674                         break;
9675 #endif
9676 #ifdef HAVE_FDATASYNC
9677                 case SYNC_METHOD_FDATASYNC:
9678                         if (pg_fdatasync(fd) != 0)
9679                                 ereport(PANIC,
9680                                                 (errcode_for_file_access(),
9681                                                  errmsg("could not fdatasync log file %s: %m",
9682                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9683                         break;
9684 #endif
9685                 case SYNC_METHOD_OPEN:
9686                 case SYNC_METHOD_OPEN_DSYNC:
9687                         /* write synced it already */
9688                         break;
9689                 default:
9690                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
9691                         break;
9692         }
9693 }
9694
9695 /*
9696  * Return the filename of given log segment, as a palloc'd string.
9697  */
9698 char *
9699 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
9700 {
9701         char       *result = palloc(MAXFNAMELEN);
9702
9703         XLogFileName(result, tli, segno);
9704         return result;
9705 }
9706
9707 /*
9708  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
9709  * function. It creates the necessary starting checkpoint and constructs the
9710  * backup label file.
9711  *
9712  * There are two kind of backups: exclusive and non-exclusive. An exclusive
9713  * backup is started with pg_start_backup(), and there can be only one active
9714  * at a time. The backup label file of an exclusive backup is written to
9715  * $PGDATA/backup_label, and it is removed by pg_stop_backup().
9716  *
9717  * A non-exclusive backup is used for the streaming base backups (see
9718  * src/backend/replication/basebackup.c). The difference to exclusive backups
9719  * is that the backup label file is not written to disk. Instead, its would-be
9720  * contents are returned in *labelfile, and the caller is responsible for
9721  * including it in the backup archive as 'backup_label'. There can be many
9722  * non-exclusive backups active at the same time, and they don't conflict
9723  * with an exclusive backup either.
9724  *
9725  * Returns the minimum WAL position that must be present to restore from this
9726  * backup, and the corresponding timeline ID in *starttli_p.
9727  *
9728  * Every successfully started non-exclusive backup must be stopped by calling
9729  * do_pg_stop_backup() or do_pg_abort_backup().
9730  *
9731  * It is the responsibility of the caller of this function to verify the
9732  * permissions of the calling user!
9733  */
9734 XLogRecPtr
9735 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
9736                                    char **labelfile)
9737 {
9738         bool            exclusive = (labelfile == NULL);
9739         bool            backup_started_in_recovery = false;
9740         XLogRecPtr      checkpointloc;
9741         XLogRecPtr      startpoint;
9742         TimeLineID      starttli;
9743         pg_time_t       stamp_time;
9744         char            strfbuf[128];
9745         char            xlogfilename[MAXFNAMELEN];
9746         XLogSegNo       _logSegNo;
9747         struct stat stat_buf;
9748         FILE       *fp;
9749         StringInfoData labelfbuf;
9750
9751         backup_started_in_recovery = RecoveryInProgress();
9752
9753         /*
9754          * Currently only non-exclusive backup can be taken during recovery.
9755          */
9756         if (backup_started_in_recovery && exclusive)
9757                 ereport(ERROR,
9758                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9759                                  errmsg("recovery is in progress"),
9760                                  errhint("WAL control functions cannot be executed during recovery.")));
9761
9762         /*
9763          * During recovery, we don't need to check WAL level. Because, if WAL
9764          * level is not sufficient, it's impossible to get here during recovery.
9765          */
9766         if (!backup_started_in_recovery && !XLogIsNeeded())
9767                 ereport(ERROR,
9768                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9769                           errmsg("WAL level not sufficient for making an online backup"),
9770                                  errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
9771
9772         if (strlen(backupidstr) > MAXPGPATH)
9773                 ereport(ERROR,
9774                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
9775                                  errmsg("backup label too long (max %d bytes)",
9776                                                 MAXPGPATH)));
9777
9778         /*
9779          * Mark backup active in shared memory.  We must do full-page WAL writes
9780          * during an on-line backup even if not doing so at other times, because
9781          * it's quite possible for the backup dump to obtain a "torn" (partially
9782          * written) copy of a database page if it reads the page concurrently with
9783          * our write to the same page.  This can be fixed as long as the first
9784          * write to the page in the WAL sequence is a full-page write. Hence, we
9785          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
9786          * are no dirty pages in shared memory that might get dumped while the
9787          * backup is in progress without having a corresponding WAL record.  (Once
9788          * the backup is complete, we need not force full-page writes anymore,
9789          * since we expect that any pages not modified during the backup interval
9790          * must have been correctly captured by the backup.)
9791          *
9792          * Note that forcePageWrites has no effect during an online backup from
9793          * the standby.
9794          *
9795          * We must hold all the insertion slots to change the value of
9796          * forcePageWrites, to ensure adequate interlocking against XLogInsert().
9797          */
9798         WALInsertSlotAcquire(true);
9799         if (exclusive)
9800         {
9801                 if (XLogCtl->Insert.exclusiveBackup)
9802                 {
9803                         WALInsertSlotRelease();
9804                         ereport(ERROR,
9805                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9806                                          errmsg("a backup is already in progress"),
9807                                          errhint("Run pg_stop_backup() and try again.")));
9808                 }
9809                 XLogCtl->Insert.exclusiveBackup = true;
9810         }
9811         else
9812                 XLogCtl->Insert.nonExclusiveBackups++;
9813         XLogCtl->Insert.forcePageWrites = true;
9814         WALInsertSlotRelease();
9815
9816         /* Ensure we release forcePageWrites if fail below */
9817         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9818         {
9819                 bool            gotUniqueStartpoint = false;
9820
9821                 /*
9822                  * Force an XLOG file switch before the checkpoint, to ensure that the
9823                  * WAL segment the checkpoint is written to doesn't contain pages with
9824                  * old timeline IDs.  That would otherwise happen if you called
9825                  * pg_start_backup() right after restoring from a PITR archive: the
9826                  * first WAL segment containing the startup checkpoint has pages in
9827                  * the beginning with the old timeline ID.      That can cause trouble at
9828                  * recovery: we won't have a history file covering the old timeline if
9829                  * pg_xlog directory was not included in the base backup and the WAL
9830                  * archive was cleared too before starting the backup.
9831                  *
9832                  * This also ensures that we have emitted a WAL page header that has
9833                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
9834                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
9835                  * compress out removable backup blocks, it won't remove any that
9836                  * occur after this point.
9837                  *
9838                  * During recovery, we skip forcing XLOG file switch, which means that
9839                  * the backup taken during recovery is not available for the special
9840                  * recovery case described above.
9841                  */
9842                 if (!backup_started_in_recovery)
9843                         RequestXLogSwitch();
9844
9845                 do
9846                 {
9847                         bool            checkpointfpw;
9848
9849                         /*
9850                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
9851                          * page problems, this guarantees that two successive backup runs
9852                          * will have different checkpoint positions and hence different
9853                          * history file names, even if nothing happened in between.
9854                          *
9855                          * During recovery, establish a restartpoint if possible. We use
9856                          * the last restartpoint as the backup starting checkpoint. This
9857                          * means that two successive backup runs can have same checkpoint
9858                          * positions.
9859                          *
9860                          * Since the fact that we are executing do_pg_start_backup()
9861                          * during recovery means that checkpointer is running, we can use
9862                          * RequestCheckpoint() to establish a restartpoint.
9863                          *
9864                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
9865                          * passing fast = true).  Otherwise this can take awhile.
9866                          */
9867                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
9868                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
9869
9870                         /*
9871                          * Now we need to fetch the checkpoint record location, and also
9872                          * its REDO pointer.  The oldest point in WAL that would be needed
9873                          * to restore starting from the checkpoint is precisely the REDO
9874                          * pointer.
9875                          */
9876                         LWLockAcquire(ControlFileLock, LW_SHARED);
9877                         checkpointloc = ControlFile->checkPoint;
9878                         startpoint = ControlFile->checkPointCopy.redo;
9879                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
9880                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
9881                         LWLockRelease(ControlFileLock);
9882
9883                         if (backup_started_in_recovery)
9884                         {
9885                                 /* use volatile pointer to prevent code rearrangement */
9886                                 volatile XLogCtlData *xlogctl = XLogCtl;
9887                                 XLogRecPtr      recptr;
9888
9889                                 /*
9890                                  * Check to see if all WAL replayed during online backup
9891                                  * (i.e., since last restartpoint used as backup starting
9892                                  * checkpoint) contain full-page writes.
9893                                  */
9894                                 SpinLockAcquire(&xlogctl->info_lck);
9895                                 recptr = xlogctl->lastFpwDisableRecPtr;
9896                                 SpinLockRelease(&xlogctl->info_lck);
9897
9898                                 if (!checkpointfpw || startpoint <= recptr)
9899                                         ereport(ERROR,
9900                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9901                                                    errmsg("WAL generated with full_page_writes=off was replayed "
9902                                                                   "since last restartpoint"),
9903                                                    errhint("This means that the backup being taken on the standby "
9904                                                                    "is corrupt and should not be used. "
9905                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
9906                                                                    "and then try an online backup again.")));
9907
9908                                 /*
9909                                  * During recovery, since we don't use the end-of-backup WAL
9910                                  * record and don't write the backup history file, the
9911                                  * starting WAL location doesn't need to be unique. This means
9912                                  * that two base backups started at the same time might use
9913                                  * the same checkpoint as starting locations.
9914                                  */
9915                                 gotUniqueStartpoint = true;
9916                         }
9917
9918                         /*
9919                          * If two base backups are started at the same time (in WAL sender
9920                          * processes), we need to make sure that they use different
9921                          * checkpoints as starting locations, because we use the starting
9922                          * WAL location as a unique identifier for the base backup in the
9923                          * end-of-backup WAL record and when we write the backup history
9924                          * file. Perhaps it would be better generate a separate unique ID
9925                          * for each backup instead of forcing another checkpoint, but
9926                          * taking a checkpoint right after another is not that expensive
9927                          * either because only few buffers have been dirtied yet.
9928                          */
9929                         WALInsertSlotAcquire(true);
9930                         if (XLogCtl->Insert.lastBackupStart < startpoint)
9931                         {
9932                                 XLogCtl->Insert.lastBackupStart = startpoint;
9933                                 gotUniqueStartpoint = true;
9934                         }
9935                         WALInsertSlotRelease();
9936                 } while (!gotUniqueStartpoint);
9937
9938                 XLByteToSeg(startpoint, _logSegNo);
9939                 XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);
9940
9941                 /*
9942                  * Construct backup label file
9943                  */
9944                 initStringInfo(&labelfbuf);
9945
9946                 /* Use the log timezone here, not the session timezone */
9947                 stamp_time = (pg_time_t) time(NULL);
9948                 pg_strftime(strfbuf, sizeof(strfbuf),
9949                                         "%Y-%m-%d %H:%M:%S %Z",
9950                                         pg_localtime(&stamp_time, log_timezone));
9951                 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
9952                          (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
9953                 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
9954                                          (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
9955                 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
9956                                                  exclusive ? "pg_start_backup" : "streamed");
9957                 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
9958                                                  backup_started_in_recovery ? "standby" : "master");
9959                 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
9960                 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
9961
9962                 /*
9963                  * Okay, write the file, or return its contents to caller.
9964                  */
9965                 if (exclusive)
9966                 {
9967                         /*
9968                          * Check for existing backup label --- implies a backup is already
9969                          * running.  (XXX given that we checked exclusiveBackup above,
9970                          * maybe it would be OK to just unlink any such label file?)
9971                          */
9972                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
9973                         {
9974                                 if (errno != ENOENT)
9975                                         ereport(ERROR,
9976                                                         (errcode_for_file_access(),
9977                                                          errmsg("could not stat file \"%s\": %m",
9978                                                                         BACKUP_LABEL_FILE)));
9979                         }
9980                         else
9981                                 ereport(ERROR,
9982                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9983                                                  errmsg("a backup is already in progress"),
9984                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
9985                                                                  BACKUP_LABEL_FILE)));
9986
9987                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
9988
9989                         if (!fp)
9990                                 ereport(ERROR,
9991                                                 (errcode_for_file_access(),
9992                                                  errmsg("could not create file \"%s\": %m",
9993                                                                 BACKUP_LABEL_FILE)));
9994                         if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
9995                                 fflush(fp) != 0 ||
9996                                 pg_fsync(fileno(fp)) != 0 ||
9997                                 ferror(fp) ||
9998                                 FreeFile(fp))
9999                                 ereport(ERROR,
10000                                                 (errcode_for_file_access(),
10001                                                  errmsg("could not write file \"%s\": %m",
10002                                                                 BACKUP_LABEL_FILE)));
10003                         pfree(labelfbuf.data);
10004                 }
10005                 else
10006                         *labelfile = labelfbuf.data;
10007         }
10008         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10009
10010         /*
10011          * We're done.  As a convenience, return the starting WAL location.
10012          */
10013         if (starttli_p)
10014                 *starttli_p = starttli;
10015         return startpoint;
10016 }
10017
10018 /* Error cleanup callback for pg_start_backup */
10019 static void
10020 pg_start_backup_callback(int code, Datum arg)
10021 {
10022         bool            exclusive = DatumGetBool(arg);
10023
10024         /* Update backup counters and forcePageWrites on failure */
10025         WALInsertSlotAcquire(true);
10026         if (exclusive)
10027         {
10028                 Assert(XLogCtl->Insert.exclusiveBackup);
10029                 XLogCtl->Insert.exclusiveBackup = false;
10030         }
10031         else
10032         {
10033                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10034                 XLogCtl->Insert.nonExclusiveBackups--;
10035         }
10036
10037         if (!XLogCtl->Insert.exclusiveBackup &&
10038                 XLogCtl->Insert.nonExclusiveBackups == 0)
10039         {
10040                 XLogCtl->Insert.forcePageWrites = false;
10041         }
10042         WALInsertSlotRelease();
10043 }
10044
10045 /*
10046  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
10047  * function.
 *
10049  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
10050  * the non-exclusive backup specified by 'labelfile'.
10051  *
10052  * Returns the last WAL position that must be present to restore from this
10053  * backup, and the corresponding timeline ID in *stoptli_p.
10054  *
10055  * It is the responsibility of the caller of this function to verify the
10056  * permissions of the calling user!
10057  */
10058 XLogRecPtr
10059 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
10060 {
              /*
               * A NULL labelfile selects exclusive mode: the label contents are then
               * loaded from BACKUP_LABEL_FILE on disk further down.  Otherwise the
               * caller-supplied labelfile string is parsed directly.
               */
10061         bool            exclusive = (labelfile == NULL);
10062         bool            backup_started_in_recovery = false;
10063         XLogRecPtr      startpoint;
10064         XLogRecPtr      stoppoint;
10065         TimeLineID      stoptli;
10066         XLogRecData rdata;
10067         pg_time_t       stamp_time;
10068         char            strfbuf[128];
10069         char            histfilepath[MAXPGPATH];
10070         char            startxlogfilename[MAXFNAMELEN];
10071         char            stopxlogfilename[MAXFNAMELEN];
10072         char            lastxlogfilename[MAXFNAMELEN];
10073         char            histfilename[MAXFNAMELEN];
10074         char            backupfrom[20];
10075         XLogSegNo       _logSegNo;
10076         FILE       *lfp;
10077         FILE       *fp;
10078         char            ch;
10079         int                     seconds_before_warning;
10080         int                     waits = 0;
10081         bool            reported_waiting = false;
10082         char       *remaining;
10083         char       *ptr;
10084         uint32          hi,
10085                                 lo;
10086
              /* Sample the recovery state once; every later branch uses this value. */
10087         backup_started_in_recovery = RecoveryInProgress();
10088
10089         /*
10090          * Currently only non-exclusive backup can be taken during recovery.
10091          */
10092         if (backup_started_in_recovery && exclusive)
10093                 ereport(ERROR,
10094                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10095                                  errmsg("recovery is in progress"),
10096                                  errhint("WAL control functions cannot be executed during recovery.")));
10097
10098         /*
10099          * During recovery, we don't need to check WAL level. Because, if WAL
10100          * level is not sufficient, it's impossible to get here during recovery.
10101          */
10102         if (!backup_started_in_recovery && !XLogIsNeeded())
10103                 ereport(ERROR,
10104                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10105                           errmsg("WAL level not sufficient for making an online backup"),
10106                                  errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
10107
10108         /*
10109          * OK to update backup counters and forcePageWrites
10110          */
10111         WALInsertSlotAcquire(true);
10112         if (exclusive)
10113                 XLogCtl->Insert.exclusiveBackup = false;
10114         else
10115         {
10116                 /*
10117                  * The user-visible pg_start/stop_backup() functions that operate on
10118                  * exclusive backups can be called at any time, but for non-exclusive
10119                  * backups, it is expected that each do_pg_start_backup() call is
10120                  * matched by exactly one do_pg_stop_backup() call.
10121                  */
10122                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10123                 XLogCtl->Insert.nonExclusiveBackups--;
10124         }
10125
              /*
               * forcePageWrites is cleared only once no backup of either kind
               * remains registered in the shared counters.
               */
10126         if (!XLogCtl->Insert.exclusiveBackup &&
10127                 XLogCtl->Insert.nonExclusiveBackups == 0)
10128         {
10129                 XLogCtl->Insert.forcePageWrites = false;
10130         }
10131         WALInsertSlotRelease();
10132
10133         if (exclusive)
10134         {
10135                 /*
10136                  * Read the existing label file into memory.
10137                  */
10138                 struct stat statbuf;
10139                 int                     r;
10140
                      /*
                       * stat() supplies the file size used for the buffer below.  A
                       * missing file (ENOENT) is reported as "no backup in progress";
                       * any other stat() failure is reported as a file-access error.
                       */
10141                 if (stat(BACKUP_LABEL_FILE, &statbuf))
10142                 {
10143                         if (errno != ENOENT)
10144                                 ereport(ERROR,
10145                                                 (errcode_for_file_access(),
10146                                                  errmsg("could not stat file \"%s\": %m",
10147                                                                 BACKUP_LABEL_FILE)));
10148                         ereport(ERROR,
10149                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10150                                          errmsg("a backup is not in progress")));
10151                 }
10152
10153                 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10154                 if (!lfp)
10155                 {
10156                         ereport(ERROR,
10157                                         (errcode_for_file_access(),
10158                                          errmsg("could not read file \"%s\": %m",
10159                                                         BACKUP_LABEL_FILE)));
10160                 }
                      /*
                       * One extra byte so the buffer can be NUL-terminated and handed to
                       * sscanf()/strstr() below.  The file is read as a single item of
                       * st_size bytes, so r is 1 only if the whole file was read.
                       */
10161                 labelfile = palloc(statbuf.st_size + 1);
10162                 r = fread(labelfile, statbuf.st_size, 1, lfp);
10163                 labelfile[statbuf.st_size] = '\0';
10164
10165                 /*
10166                  * Close and remove the backup label file
10167                  */
10168                 if (r != 1 || ferror(lfp) || FreeFile(lfp))
10169                         ereport(ERROR,
10170                                         (errcode_for_file_access(),
10171                                          errmsg("could not read file \"%s\": %m",
10172                                                         BACKUP_LABEL_FILE)));
10173                 if (unlink(BACKUP_LABEL_FILE) != 0)
10174                         ereport(ERROR,
10175                                         (errcode_for_file_access(),
10176                                          errmsg("could not remove file \"%s\": %m",
10177                                                         BACKUP_LABEL_FILE)));
10178         }
10179
10180         /*
10181          * Read and parse the START WAL LOCATION line (this code is pretty crude,
10182          * but we are not expecting any variability in the file format).
10183          */
10184         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
10185                            &hi, &lo, startxlogfilename,
10186                            &ch) != 4 || ch != '\n')
10187                 ereport(ERROR,
10188                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10189                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10190         startpoint = ((uint64) hi) << 32 | lo;
              /*
               * The sscanf() above insisted on a '\n' directly after the first line,
               * so strchr() finds one here; 'remaining' points at the second line.
               */
10191         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
10192
10193         /*
10194          * Parse the BACKUP FROM line. If we are taking an online backup from the
10195          * standby, we confirm that the standby has not been promoted during the
10196          * backup.
10197          */
10198         ptr = strstr(remaining, "BACKUP FROM:");
10199         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
10200                 ereport(ERROR,
10201                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10202                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10203         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
10204                 ereport(ERROR,
10205                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10206                                  errmsg("the standby was promoted during online backup"),
10207                                  errhint("This means that the backup being taken is corrupt "
10208                                                  "and should not be used. "
10209                                                  "Try taking another online backup.")));
10210
10211         /*
10212          * During recovery, we don't write an end-of-backup record. We assume that
10213          * pg_control was backed up last and its minimum recovery point can be
10214          * available as the backup end location. Since we don't have an
10215          * end-of-backup record, we use the pg_control value to check whether
10216          * we've reached the end of backup when starting recovery from this
10217          * backup. We have no way of checking if pg_control wasn't backed up last
10218          * however.
10219          *
10220          * We don't force a switch to new WAL file and wait for all the required
10221          * files to be archived. This is okay if we use the backup to start the
10222          * standby. But, if it's for an archive recovery, to ensure all the
10223          * required files are available, a user should wait for them to be
10224          * archived, or include them into the backup.
10225          *
10226          * We return the current minimum recovery point as the backup end
10227          * location. Note that it can be greater than the exact backup end
10228          * location if the minimum recovery point is updated after the backup of
10229          * pg_control. This is harmless for current uses.
10230          *
10231          * XXX currently a backup history file is for informational and debug
10232          * purposes only. It's not essential for an online backup. Furthermore,
10233          * even if it's created, it will not be archived during recovery because
10234          * an archiver is not invoked. So it doesn't seem worthwhile to write a
10235          * backup history file during recovery.
10236          */
10237         if (backup_started_in_recovery)
10238         {
10239                 /* use volatile pointer to prevent code rearrangement */
10240                 volatile XLogCtlData *xlogctl = XLogCtl;
10241                 XLogRecPtr      recptr;
10242
10243                 /*
10244                  * Check to see if all WAL replayed during online backup contain
10245                  * full-page writes.
10246                  */
10247                 SpinLockAcquire(&xlogctl->info_lck);
10248                 recptr = xlogctl->lastFpwDisableRecPtr;
10249                 SpinLockRelease(&xlogctl->info_lck);
10250
10251                 if (startpoint <= recptr)
10252                         ereport(ERROR,
10253                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10254                            errmsg("WAL generated with full_page_writes=off was replayed "
10255                                           "during online backup"),
10256                          errhint("This means that the backup being taken on the standby "
10257                                          "is corrupt and should not be used. "
10258                                  "Enable full_page_writes and run CHECKPOINT on the master, "
10259                                          "and then try an online backup again.")));
10260
10261
10262                 LWLockAcquire(ControlFileLock, LW_SHARED);
10263                 stoppoint = ControlFile->minRecoveryPoint;
10264                 stoptli = ControlFile->minRecoveryPointTLI;
10265                 LWLockRelease(ControlFileLock);
10266
                      /*
                       * The recovery path ends here: none of the steps below (backup-end
                       * record, segment switch, history file, archive wait) are run.
                       */
10267                 if (stoptli_p)
10268                         *stoptli_p = stoptli;
10269                 return stoppoint;
10270         }
10271
10272         /*
10273          * Write the backup-end xlog record
10274          */
              /* The record's payload is the backup's start location. */
10275         rdata.data = (char *) (&startpoint);
10276         rdata.len = sizeof(startpoint);
10277         rdata.buffer = InvalidBuffer;
10278         rdata.next = NULL;
10279         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
10280         stoptli = ThisTimeLineID;
10281
10282         /*
10283          * Force a switch to a new xlog segment file, so that the backup is valid
10284          * as soon as archiver moves out the current segment file.
10285          */
10286         RequestXLogSwitch();
10287
              /*
               * NOTE(review): XLByteToPrevSeg presumably yields the segment holding
               * the byte just before stoppoint -- confirm against its definition.
               */
10288         XLByteToPrevSeg(stoppoint, _logSegNo);
10289         XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
10290
10291         /* Use the log timezone here, not the session timezone */
10292         stamp_time = (pg_time_t) time(NULL);
10293         pg_strftime(strfbuf, sizeof(strfbuf),
10294                                 "%Y-%m-%d %H:%M:%S %Z",
10295                                 pg_localtime(&stamp_time, log_timezone));
10296
10297         /*
10298          * Write the backup history file
10299          */
10300         XLByteToSeg(startpoint, _logSegNo);
10301         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
10302                                                   (uint32) (startpoint % XLogSegSize));
10303         fp = AllocateFile(histfilepath, "w");
10304         if (!fp)
10305                 ereport(ERROR,
10306                                 (errcode_for_file_access(),
10307                                  errmsg("could not create file \"%s\": %m",
10308                                                 histfilepath)));
10309         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
10310                 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
10311         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
10312                         (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
10313         /* transfer remaining lines from label to history file */
10314         fprintf(fp, "%s", remaining);
10315         fprintf(fp, "STOP TIME: %s\n", strfbuf);
10316         if (fflush(fp) || ferror(fp) || FreeFile(fp))
10317                 ereport(ERROR,
10318                                 (errcode_for_file_access(),
10319                                  errmsg("could not write file \"%s\": %m",
10320                                                 histfilepath)));
10321
10322         /*
10323          * Clean out any no-longer-needed history files.  As a side effect, this
10324          * will post a .ready file for the newly created history file, notifying
10325          * the archiver that history file may be archived immediately.
10326          */
10327         CleanupBackupHistory();
10328
10329         /*
10330          * If archiving is enabled, wait for all the required WAL files to be
10331          * archived before returning. If archiving isn't enabled, the required WAL
10332          * needs to be transported via streaming replication (hopefully with
10333          * wal_keep_segments set high enough), or some more exotic mechanism like
10334          * polling and copying files from pg_xlog with script. We have no
10335          * knowledge of those mechanisms, so it's up to the user to ensure that he
10336          * gets all the required WAL.
10337          *
10338          * We wait until both the last WAL file filled during backup and the
10339          * history file have been archived, and assume that the alphabetic sorting
10340          * property of the WAL files ensures any earlier WAL files are safely
10341          * archived as well.
10342          *
10343          * We wait forever, since archive_command is supposed to work and we
10344          * assume the admin wanted his backup to work completely. If you don't
10345          * wish to wait, you can set statement_timeout.  Also, some notices are
10346          * issued to clue in anyone who might be doing this interactively.
10347          */
10348         if (waitforarchive && XLogArchivingActive())
10349         {
                      /*
                       * Same inputs as were used for stopxlogfilename and histfilepath
                       * above; here BackupHistoryFileName() is used rather than
                       * BackupHistoryFilePath().
                       */
10350                 XLByteToPrevSeg(stoppoint, _logSegNo);
10351                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
10352
10353                 XLByteToSeg(startpoint, _logSegNo);
10354                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
10355                                                           (uint32) (startpoint % XLogSegSize));
10356
                      /* waits is already zero from its declaration; reset kept as is */
10357                 seconds_before_warning = 60;
10358                 waits = 0;
10359
                      /*
                       * Poll with a one-second sleep per iteration.  A single NOTICE is
                       * issued once more than five iterations have passed, and a WARNING
                       * each time waits reaches seconds_before_warning, which is then
                       * doubled (60, 120, 240, ...).
                       */
10360                 while (XLogArchiveIsBusy(lastxlogfilename) ||
10361                            XLogArchiveIsBusy(histfilename))
10362                 {
10363                         CHECK_FOR_INTERRUPTS();
10364
10365                         if (!reported_waiting && waits > 5)
10366                         {
10367                                 ereport(NOTICE,
10368                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
10369                                 reported_waiting = true;
10370                         }
10371
10372                         pg_usleep(1000000L);
10373
10374                         if (++waits >= seconds_before_warning)
10375                         {
10376                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
10377                                 ereport(WARNING,
10378                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
10379                                                                 waits),
10380                                                  errhint("Check that your archive_command is executing properly.  "
10381                                                                  "pg_stop_backup can be canceled safely, "
10382                                                                  "but the database backup will not be usable without all the WAL segments.")));
10383                         }
10384                 }
10385
10386                 ereport(NOTICE,
10387                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
10388         }
              /* The caller asked to wait, but archiving is not active: just say so. */
10389         else if (waitforarchive)
10390                 ereport(NOTICE,
10391                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
10392
10393         /*
10394          * We're done.  As a convenience, return the ending WAL location.
10395          */
10396         if (stoptli_p)
10397                 *stoptli_p = stoptli;
10398         return stoppoint;
10399 }
10400
10401
10402 /*
10403  * do_pg_abort_backup: abort a running backup
10404  *
10405  * This does just the most basic steps of do_pg_stop_backup(), by taking the
10406  * system out of backup mode, thus making it a lot more safe to call from
10407  * an error handler.
10408  *
10409  * NB: This is only for aborting a non-exclusive backup that doesn't write
10410  * backup_label. A backup started with pg_start_backup() needs to be finished
10411  * with pg_stop_backup().
10412  */
10413 void
10414 do_pg_abort_backup(void)
10415 {
10416         WALInsertSlotAcquire(true);
10417         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10418         XLogCtl->Insert.nonExclusiveBackups--;
10419
10420         if (!XLogCtl->Insert.exclusiveBackup &&
10421                 XLogCtl->Insert.nonExclusiveBackups == 0)
10422         {
10423                 XLogCtl->Insert.forcePageWrites = false;
10424         }
10425         WALInsertSlotRelease();
10426 }
10427
10428 /*
10429  * Get latest redo apply position.
10430  *
10431  * Exported to allow WALReceiver to read the pointer directly.
10432  */
10433 XLogRecPtr
10434 GetXLogReplayRecPtr(TimeLineID *replayTLI)
10435 {
10436         /* use volatile pointer to prevent code rearrangement */
10437         volatile XLogCtlData *xlogctl = XLogCtl;
10438         XLogRecPtr      recptr;
10439         TimeLineID      tli;
10440
10441         SpinLockAcquire(&xlogctl->info_lck);
10442         recptr = xlogctl->lastReplayedEndRecPtr;
10443         tli = xlogctl->lastReplayedTLI;
10444         SpinLockRelease(&xlogctl->info_lck);
10445
10446         if (replayTLI)
10447                 *replayTLI = tli;
10448         return recptr;
10449 }
10450
10451 /*
10452  * Get latest WAL insert pointer
10453  */
10454 XLogRecPtr
10455 GetXLogInsertRecPtr(void)
10456 {
10457         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
10458         uint64          current_bytepos;
10459
10460         SpinLockAcquire(&Insert->insertpos_lck);
10461         current_bytepos = Insert->CurrBytePos;
10462         SpinLockRelease(&Insert->insertpos_lck);
10463
10464         return XLogBytePosToRecPtr(current_bytepos);
10465 }
10466
10467 /*
10468  * Get latest WAL write pointer
10469  */
10470 XLogRecPtr
10471 GetXLogWriteRecPtr(void)
10472 {
10473         {
10474                 /* use volatile pointer to prevent code rearrangement */
10475                 volatile XLogCtlData *xlogctl = XLogCtl;
10476
10477                 SpinLockAcquire(&xlogctl->info_lck);
10478                 LogwrtResult = xlogctl->LogwrtResult;
10479                 SpinLockRelease(&xlogctl->info_lck);
10480         }
10481
10482         return LogwrtResult.Write;
10483 }
10484
10485 /*
10486  * Returns the redo pointer of the last checkpoint or restartpoint. This is
10487  * the oldest point in WAL that we still need, if we have to restart recovery.
10488  */
10489 void
10490 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
10491 {
10492         LWLockAcquire(ControlFileLock, LW_SHARED);
10493         *oldrecptr = ControlFile->checkPointCopy.redo;
10494         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
10495         LWLockRelease(ControlFileLock);
10496 }
10497
10498 /*
10499  * read_backup_label: check to see if a backup_label file is present
10500  *
10501  * If we see a backup_label during recovery, we assume that we are recovering
10502  * from a backup dump file, and we therefore roll forward from the checkpoint
10503  * identified by the label file, NOT what pg_control says.      This avoids the
10504  * problem that pg_control might have been archived one or more checkpoints
10505  * later than the start of the dump, and so if we rely on it as the start
10506  * point, we will fail to restore a consistent database state.
10507  *
10508  * Returns TRUE if a backup_label was found (and fills the checkpoint
10509  * location and its REDO location into *checkPointLoc and RedoStartLSN,
10510  * respectively); returns FALSE if not. If this backup_label came from a
10511  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
10512  * was created during recovery, *backupFromStandby is set to TRUE.
10513  */
10514 static bool
10515 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
10516                                   bool *backupFromStandby)
10517 {
10518         char            startxlogfilename[MAXFNAMELEN];
10519         TimeLineID      tli;
10520         FILE       *lfp;
10521         char            ch;
10522         char            backuptype[20];
10523         char            backupfrom[20];
10524         uint32          hi,
10525                                 lo;
10526
10527         *backupEndRequired = false;
10528         *backupFromStandby = false;
10529
10530         /*
10531          * See if label file is present
10532          */
10533         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10534         if (!lfp)
10535         {
10536                 if (errno != ENOENT)
10537                         ereport(FATAL,
10538                                         (errcode_for_file_access(),
10539                                          errmsg("could not read file \"%s\": %m",
10540                                                         BACKUP_LABEL_FILE)));
10541                 return false;                   /* it's not there, all is fine */
10542         }
10543
10544         /*
10545          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
10546          * is pretty crude, but we are not expecting any variability in the file
10547          * format).
10548          */
10549         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
10550                            &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
10551                 ereport(FATAL,
10552                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10553                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10554         RedoStartLSN = ((uint64) hi) << 32 | lo;
10555         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
10556                            &hi, &lo, &ch) != 3 || ch != '\n')
10557                 ereport(FATAL,
10558                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10559                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10560         *checkPointLoc = ((uint64) hi) << 32 | lo;
10561
10562         /*
10563          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
10564          * from an older backup anyway, but since the information on it is not
10565          * strictly required, don't error out if it's missing for some reason.
10566          */
10567         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
10568         {
10569                 if (strcmp(backuptype, "streamed") == 0)
10570                         *backupEndRequired = true;
10571         }
10572
10573         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
10574         {
10575                 if (strcmp(backupfrom, "standby") == 0)
10576                         *backupFromStandby = true;
10577         }
10578
10579         if (ferror(lfp) || FreeFile(lfp))
10580                 ereport(FATAL,
10581                                 (errcode_for_file_access(),
10582                                  errmsg("could not read file \"%s\": %m",
10583                                                 BACKUP_LABEL_FILE)));
10584
10585         return true;
10586 }
10587
10588 /*
10589  * Error context callback for errors occurring during rm_redo().
10590  */
10591 static void
10592 rm_redo_error_callback(void *arg)
10593 {
10594         XLogRecord *record = (XLogRecord *) arg;
10595         StringInfoData buf;
10596
10597         initStringInfo(&buf);
10598         RmgrTable[record->xl_rmid].rm_desc(&buf,
10599                                                                            record->xl_info,
10600                                                                            XLogRecGetData(record));
10601
10602         /* don't bother emitting empty description */
10603         if (buf.len > 0)
10604                 errcontext("xlog redo %s", buf.data);
10605
10606         pfree(buf.data);
10607 }
10608
10609 /*
10610  * BackupInProgress: check if online backup mode is active
10611  *
10612  * This is done by checking for existence of the "backup_label" file.
10613  */
10614 bool
10615 BackupInProgress(void)
10616 {
10617         struct stat stat_buf;
10618
10619         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
10620 }
10621
10622 /*
10623  * CancelBackup: rename the "backup_label" file to cancel backup mode
10624  *
10625  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
10626  * Note that this will render an online backup in progress useless.
10627  * To correctly finish an online backup, pg_stop_backup must be called.
10628  */
10629 void
10630 CancelBackup(void)
10631 {
10632         struct stat stat_buf;
10633
10634         /* if the file is not there, return */
10635         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
10636                 return;
10637
10638         /* remove leftover file from previously canceled backup if it exists */
10639         unlink(BACKUP_LABEL_OLD);
10640
10641         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
10642         {
10643                 ereport(LOG,
10644                                 (errmsg("online backup mode canceled"),
10645                                  errdetail("\"%s\" was renamed to \"%s\".",
10646                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10647         }
10648         else
10649         {
10650                 ereport(WARNING,
10651                                 (errcode_for_file_access(),
10652                                  errmsg("online backup mode was not canceled"),
10653                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
10654                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10655         }
10656 }
10657
10658 /*
10659  * Read the XLOG page containing RecPtr into readBuf (if not read already).
10660  * Returns number of bytes read, if the page is read successfully, or -1
10661  * in case of errors.  When errors occur, they are ereport'ed, but only
10662  * if they have not been previously reported.
10663  *
10664  * This is responsible for restoring files from archive as needed, as well
10665  * as for waiting for the requested WAL record to arrive in standby mode.
10666  *
10667  * 'emode' specifies the log level used for reporting "file not found" or
10668  * "end of WAL" situations in archive recovery, or in standby mode when a
10669  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
10670  * -1 in those situations, on higher log levels the ereport() won't
10671  * return.
10672  *
10673  * In standby mode, if after a successful return of XLogPageRead() the
10674  * caller finds the record it's interested in to be broken, it should
10675  * ereport the error with the level determined by
10676  * emode_for_corrupt_record(), and then set lastSourceFailed
10677  * and call XLogPageRead() again with the same arguments. This lets
10678  * XLogPageRead() try fetching the record from another source, or
10679  * sleep and retry.
10680  */
10681 static int
10682 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
10683                          XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
10684 {
10685         XLogPageReadPrivate *private =
10686         (XLogPageReadPrivate *) xlogreader->private_data;
10687         int                     emode = private->emode;
10688         uint32          targetPageOff;
10689         XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
10690
10691         XLByteToSeg(targetPagePtr, targetSegNo);
10692         targetPageOff = targetPagePtr % XLogSegSize;
10693
10694         /*
10695          * See if we need to switch to a new segment because the requested record
10696          * is not in the currently open one.
10697          */
10698         if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
10699         {
10700                 /*
10701                  * Request a restartpoint if we've replayed too much xlog since the
10702                  * last one.
10703                  */
10704                 if (StandbyModeRequested && bgwriterLaunched)
10705                 {
10706                         if (XLogCheckpointNeeded(readSegNo))
10707                         {
10708                                 (void) GetRedoRecPtr();
10709                                 if (XLogCheckpointNeeded(readSegNo))
10710                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
10711                         }
10712                 }
10713
10714                 close(readFile);
10715                 readFile = -1;
10716                 readSource = 0;
10717         }
10718
10719         XLByteToSeg(targetPagePtr, readSegNo);
10720
10721 retry:
10722         /* See if we need to retrieve more data */
10723         if (readFile < 0 ||
10724                 (readSource == XLOG_FROM_STREAM &&
10725                  receivedUpto < targetPagePtr + reqLen))
10726         {
10727                 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
10728                                                                                  private->randAccess,
10729                                                                                  private->fetching_ckpt,
10730                                                                                  targetRecPtr))
10731                 {
10732                         if (readFile >= 0)
10733                                 close(readFile);
10734                         readFile = -1;
10735                         readLen = 0;
10736                         readSource = 0;
10737
10738                         return -1;
10739                 }
10740         }
10741
10742         /*
10743          * At this point, we have the right segment open and if we're streaming we
10744          * know the requested record is in it.
10745          */
10746         Assert(readFile != -1);
10747
10748         /*
10749          * If the current segment is being streamed from master, calculate how
10750          * much of the current page we have received already. We know the
10751          * requested record has been received, but this is for the benefit of
10752          * future calls, to allow quick exit at the top of this function.
10753          */
10754         if (readSource == XLOG_FROM_STREAM)
10755         {
10756                 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
10757                         readLen = XLOG_BLCKSZ;
10758                 else
10759                         readLen = receivedUpto % XLogSegSize - targetPageOff;
10760         }
10761         else
10762                 readLen = XLOG_BLCKSZ;
10763
10764         /* Read the requested page */
10765         readOff = targetPageOff;
10766         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
10767         {
10768                 char            fname[MAXFNAMELEN];
10769
10770                 XLogFileName(fname, curFileTLI, readSegNo);
10771                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10772                                 (errcode_for_file_access(),
10773                                  errmsg("could not seek in log segment %s to offset %u: %m",
10774                                                 fname, readOff)));
10775                 goto next_record_is_invalid;
10776         }
10777
10778         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
10779         {
10780                 char            fname[MAXFNAMELEN];
10781
10782                 XLogFileName(fname, curFileTLI, readSegNo);
10783                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10784                                 (errcode_for_file_access(),
10785                                  errmsg("could not read from log segment %s, offset %u: %m",
10786                                                 fname, readOff)));
10787                 goto next_record_is_invalid;
10788         }
10789
10790         Assert(targetSegNo == readSegNo);
10791         Assert(targetPageOff == readOff);
10792         Assert(reqLen <= readLen);
10793
10794         *readTLI = curFileTLI;
10795         return readLen;
10796
10797 next_record_is_invalid:
10798         lastSourceFailed = true;
10799
10800         if (readFile >= 0)
10801                 close(readFile);
10802         readFile = -1;
10803         readLen = 0;
10804         readSource = 0;
10805
10806         /* In standby-mode, keep trying */
10807         if (StandbyMode)
10808                 goto retry;
10809         else
10810                 return -1;
10811 }
10812
10813 /*
10814  * Open the WAL segment containing WAL position 'RecPtr'.
10815  *
10816  * The segment can be fetched via restore_command, or via walreceiver having
10817  * streamed the record, or it can already be present in pg_xlog. Checking
10818  * pg_xlog is mainly for crash recovery, but it will be polled in standby mode
10819  * too, in case someone copies a new segment directly to pg_xlog. That is not
10820  * documented or recommended, though.
10821  *
10822  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
10823  * prepare to read WAL starting from RedoStartLSN after this.
10824  *
10825  * 'RecPtr' might not point to the beginning of the record we're interested
10826  * in, it might also point to the page or segment header. In that case,
10827  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
10828  * used to decide which timeline to stream the requested WAL from.
10829  *
10830  * If the record is not immediately available, the function returns false
10831  * if we're not in standby mode. In standby mode, waits for it to become
10832  * available.
10833  *
10834  * When the requested record becomes available, the function opens the file
10835  * containing it (if not open already), and returns true. When end of standby
10836  * mode is triggered by the user, and there is no more WAL available, returns
10837  * false.
10838  */
10839 static bool
10840 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
10841                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
10842 {
10843         static pg_time_t last_fail_time = 0;
10844         pg_time_t       now;
10845
10846         /*-------
10847          * Standby mode is implemented by a state machine:
10848          *
10849          * 1. Read from archive (XLOG_FROM_ARCHIVE)
10850          * 2. Read from pg_xlog (XLOG_FROM_PG_XLOG)
10851          * 3. Check trigger file
10852          * 4. Read from primary server via walreceiver (XLOG_FROM_STREAM)
10853          * 5. Rescan timelines
10854          * 6. Sleep 5 seconds, and loop back to 1.
10855          *
10856          * Failure to read from the current source advances the state machine to
10857          * the next state. In addition, successfully reading a file from pg_xlog
10858          * moves the state machine from state 2 back to state 1 (we always prefer
10859          * files in the archive over files in pg_xlog).
10860          *
10861          * 'currentSource' indicates the current state. There are no currentSource
10862          * values for "check trigger", "rescan timelines", and "sleep" states,
10863          * those actions are taken when reading from the previous source fails, as
10864          * part of advancing to the next state.
10865          *-------
10866          */
10867         if (!InArchiveRecovery)
10868                 currentSource = XLOG_FROM_PG_XLOG;
10869         else if (currentSource == 0)
10870                 currentSource = XLOG_FROM_ARCHIVE;
10871
10872         for (;;)
10873         {
10874                 int                     oldSource = currentSource;
10875
10876                 /*
10877                  * First check if we failed to read from the current source, and
10878                  * advance the state machine if so. The failure to read might've
10879                  * happened outside this function, e.g when a CRC check fails on a
10880                  * record, or within this loop.
10881                  */
10882                 if (lastSourceFailed)
10883                 {
10884                         switch (currentSource)
10885                         {
10886                                 case XLOG_FROM_ARCHIVE:
10887                                         currentSource = XLOG_FROM_PG_XLOG;
10888                                         break;
10889
10890                                 case XLOG_FROM_PG_XLOG:
10891
10892                                         /*
10893                                          * Check to see if the trigger file exists. Note that we
10894                                          * do this only after failure, so when you create the
10895                                          * trigger file, we still finish replaying as much as we
10896                                          * can from archive and pg_xlog before failover.
10897                                          */
10898                                         if (StandbyMode && CheckForStandbyTrigger())
10899                                         {
10900                                                 ShutdownWalRcv();
10901                                                 return false;
10902                                         }
10903
10904                                         /*
10905                                          * Not in standby mode, and we've now tried the archive
10906                                          * and pg_xlog.
10907                                          */
10908                                         if (!StandbyMode)
10909                                                 return false;
10910
10911                                         /*
10912                                          * If primary_conninfo is set, launch walreceiver to try
10913                                          * to stream the missing WAL.
10914                                          *
10915                                          * If fetching_ckpt is TRUE, RecPtr points to the initial
10916                                          * checkpoint location. In that case, we use RedoStartLSN
10917                                          * as the streaming start position instead of RecPtr, so
10918                                          * that when we later jump backwards to start redo at
10919                                          * RedoStartLSN, we will have the logs streamed already.
10920                                          */
10921                                         if (PrimaryConnInfo)
10922                                         {
10923                                                 XLogRecPtr      ptr;
10924                                                 TimeLineID      tli;
10925
10926                                                 if (fetching_ckpt)
10927                                                 {
10928                                                         ptr = RedoStartLSN;
10929                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
10930                                                 }
10931                                                 else
10932                                                 {
10933                                                         ptr = tliRecPtr;
10934                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
10935
10936                                                         if (curFileTLI > 0 && tli < curFileTLI)
10937                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
10938                                                                          (uint32) (ptr >> 32), (uint32) ptr,
10939                                                                          tli, curFileTLI);
10940                                                 }
10941                                                 curFileTLI = tli;
10942                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo);
10943                                                 receivedUpto = 0;
10944                                         }
10945
10946                                         /*
10947                                          * Move to XLOG_FROM_STREAM state in either case. We'll
10948                                          * get immediate failure if we didn't launch walreceiver,
10949                                          * and move on to the next state.
10950                                          */
10951                                         currentSource = XLOG_FROM_STREAM;
10952                                         break;
10953
10954                                 case XLOG_FROM_STREAM:
10955
10956                                         /*
10957                                          * Failure while streaming. Most likely, we got here
10958                                          * because streaming replication was terminated, or
10959                                          * promotion was triggered. But we also get here if we
10960                                          * find an invalid record in the WAL streamed from master,
10961                                          * in which case something is seriously wrong. There's
10962                                          * little chance that the problem will just go away, but
10963                                          * PANIC is not good for availability either, especially
10964                                          * in hot standby mode. So, we treat that the same as
10965                                          * disconnection, and retry from archive/pg_xlog again.
10966                                          * The WAL in the archive should be identical to what was
10967                                          * streamed, so it's unlikely that it helps, but one can
10968                                          * hope...
10969                                          */
10970
10971                                         /*
10972                                          * Before we leave XLOG_FROM_STREAM state, make sure that
10973                                          * walreceiver is not active, so that it won't overwrite
10974                                          * WAL that we restore from archive.
10975                                          */
10976                                         if (WalRcvStreaming())
10977                                                 ShutdownWalRcv();
10978
10979                                         /*
10980                                          * Before we sleep, re-scan for possible new timelines if
10981                                          * we were requested to recover to the latest timeline.
10982                                          */
10983                                         if (recoveryTargetIsLatest)
10984                                         {
10985                                                 if (rescanLatestTimeLine())
10986                                                 {
10987                                                         currentSource = XLOG_FROM_ARCHIVE;
10988                                                         break;
10989                                                 }
10990                                         }
10991
10992                                         /*
10993                                          * XLOG_FROM_STREAM is the last state in our state
10994                                          * machine, so we've exhausted all the options for
10995                                          * obtaining the requested WAL. We're going to loop back
10996                                          * and retry from the archive, but if it hasn't been long
10997                                          * since last attempt, sleep 5 seconds to avoid
10998                                          * busy-waiting.
10999                                          */
11000                                         now = (pg_time_t) time(NULL);
11001                                         if ((now - last_fail_time) < 5)
11002                                         {
11003                                                 pg_usleep(1000000L * (5 - (now - last_fail_time)));
11004                                                 now = (pg_time_t) time(NULL);
11005                                         }
11006                                         last_fail_time = now;
11007                                         currentSource = XLOG_FROM_ARCHIVE;
11008                                         break;
11009
11010                                 default:
11011                                         elog(ERROR, "unexpected WAL source %d", currentSource);
11012                         }
11013                 }
11014                 else if (currentSource == XLOG_FROM_PG_XLOG)
11015                 {
11016                         /*
11017                          * We just successfully read a file in pg_xlog. We prefer files in
11018                          * the archive over ones in pg_xlog, so try the next file again
11019                          * from the archive first.
11020                          */
11021                         if (InArchiveRecovery)
11022                                 currentSource = XLOG_FROM_ARCHIVE;
11023                 }
11024
11025                 if (currentSource != oldSource)
11026                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
11027                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
11028                                  lastSourceFailed ? "failure" : "success");
11029
11030                 /*
11031                  * We've now handled possible failure. Try to read from the chosen
11032                  * source.
11033                  */
11034                 lastSourceFailed = false;
11035
11036                 switch (currentSource)
11037                 {
11038                         case XLOG_FROM_ARCHIVE:
11039                         case XLOG_FROM_PG_XLOG:
11040                                 /* Close any old file we might have open. */
11041                                 if (readFile >= 0)
11042                                 {
11043                                         close(readFile);
11044                                         readFile = -1;
11045                                 }
11046                                 /* Reset curFileTLI if random fetch. */
11047                                 if (randAccess)
11048                                         curFileTLI = 0;
11049
11050                                 /*
11051                                  * Try to restore the file from archive, or read an existing
11052                                  * file from pg_xlog.
11053                                  */
11054                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, currentSource);
11055                                 if (readFile >= 0)
11056                                         return true;    /* success! */
11057
11058                                 /*
11059                                  * Nope, not found in archive or pg_xlog.
11060                                  */
11061                                 lastSourceFailed = true;
11062                                 break;
11063
11064                         case XLOG_FROM_STREAM:
11065                                 {
11066                                         bool            havedata;
11067
11068                                         /*
11069                                          * Check if WAL receiver is still active.
11070                                          */
11071                                         if (!WalRcvStreaming())
11072                                         {
11073                                                 lastSourceFailed = true;
11074                                                 break;
11075                                         }
11076
11077                                         /*
11078                                          * Walreceiver is active, so see if new data has arrived.
11079                                          *
11080                                          * We only advance XLogReceiptTime when we obtain fresh
11081                                          * WAL from walreceiver and observe that we had already
11082                                          * processed everything before the most recent "chunk"
11083                                          * that it flushed to disk.  In steady state where we are
11084                                          * keeping up with the incoming data, XLogReceiptTime will
11085                                          * be updated on each cycle. When we are behind,
11086                                          * XLogReceiptTime will not advance, so the grace time
11087                                          * allotted to conflicting queries will decrease.
11088                                          */
11089                                         if (RecPtr < receivedUpto)
11090                                                 havedata = true;
11091                                         else
11092                                         {
11093                                                 XLogRecPtr      latestChunkStart;
11094
11095                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
11096                                                 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
11097                                                 {
11098                                                         havedata = true;
11099                                                         if (latestChunkStart <= RecPtr)
11100                                                         {
11101                                                                 XLogReceiptTime = GetCurrentTimestamp();
11102                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
11103                                                         }
11104                                                 }
11105                                                 else
11106                                                         havedata = false;
11107                                         }
11108                                         if (havedata)
11109                                         {
11110                                                 /*
11111                                                  * Great, streamed far enough.  Open the file if it's
11112                                                  * not open already.  Also read the timeline history
11113                                                  * file if we haven't initialized timeline history
11114                                                  * yet; it should be streamed over and present in
11115                                                  * pg_xlog by now.      Use XLOG_FROM_STREAM so that
11116                                                  * source info is set correctly and XLogReceiptTime
11117                                                  * isn't changed.
11118                                                  */
11119                                                 if (readFile < 0)
11120                                                 {
11121                                                         if (!expectedTLEs)
11122                                                                 expectedTLEs = readTimeLineHistory(receiveTLI);
11123                                                         readFile = XLogFileRead(readSegNo, PANIC,
11124                                                                                                         receiveTLI,
11125                                                                                                         XLOG_FROM_STREAM, false);
11126                                                         Assert(readFile >= 0);
11127                                                 }
11128                                                 else
11129                                                 {
11130                                                         /* just make sure source info is correct... */
11131                                                         readSource = XLOG_FROM_STREAM;
11132                                                         XLogReceiptSource = XLOG_FROM_STREAM;
11133                                                         return true;
11134                                                 }
11135                                                 break;
11136                                         }
11137
11138                                         /*
11139                                          * Data not here yet. Check for trigger, then wait for
11140                                          * walreceiver to wake us up when new WAL arrives.
11141                                          */
11142                                         if (CheckForStandbyTrigger())
11143                                         {
11144                                                 /*
11145                                                  * Note that we don't "return false" immediately here.
11146                                                  * After being triggered, we still want to replay all
11147                                                  * the WAL that was already streamed. It's in pg_xlog
11148                                                  * now, so we just treat this as a failure, and the
11149                                                  * state machine will move on to replay the streamed
11150                                                  * WAL from pg_xlog, and then recheck the trigger and
11151                                                  * exit replay.
11152                                                  */
11153                                                 lastSourceFailed = true;
11154                                                 break;
11155                                         }
11156
11157                                         /*
11158                                          * Wait for more WAL to arrive. Time out after 5 seconds,
11159                                          * like when polling the archive, to react to a trigger
11160                                          * file promptly.
11161                                          */
11162                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
11163                                                           WL_LATCH_SET | WL_TIMEOUT,
11164                                                           5000L);
11165                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
11166                                         break;
11167                                 }
11168
11169                         default:
11170                                 elog(ERROR, "unexpected WAL source %d", currentSource);
11171                 }
11172
11173                 /*
11174                  * This possibly-long loop needs to handle interrupts of startup
11175                  * process.
11176                  */
11177                 HandleStartupProcInterrupts();
11178         } while (StandbyMode);
11179
11180         return false;
11181 }
11182
11183 /*
11184  * Determine what log level should be used to report a corrupt WAL record
11185  * in the current WAL page, previously read by XLogPageRead().
11186  *
11187  * 'emode' is the error mode that would be used to report a file-not-found
11188  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
11189  * we're retrying the exact same record that we've tried previously, only
11190  * complain the first time to keep the noise down.      However, we only do when
11191  * reading from pg_xlog, because we don't expect any invalid records in archive
11192  * or in records streamed from master. Files in the archive should be complete,
11193  * and we should never hit the end of WAL because we stop and wait for more WAL
11194  * to arrive before replaying it.
11195  *
11196  * NOTE: This function remembers the RecPtr value it was last called with,
11197  * to suppress repeated messages about the same record. Only call this when
11198  * you are about to ereport(), or you might cause a later message to be
11199  * erroneously suppressed.
11200  */
11201 static int
11202 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
11203 {
11204         static XLogRecPtr lastComplaint = 0;
11205
11206         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
11207         {
11208                 if (RecPtr == lastComplaint)
11209                         emode = DEBUG1;
11210                 else
11211                         lastComplaint = RecPtr;
11212         }
11213         return emode;
11214 }
11215
11216 /*
11217  * Check to see whether the user-specified trigger file exists and whether a
11218  * promote request has arrived.  If either condition holds, return true.
11219  */
11220 static bool
11221 CheckForStandbyTrigger(void)
11222 {
11223         struct stat stat_buf;
11224         static bool triggered = false;
11225
11226         if (triggered)
11227                 return true;
11228
11229         if (IsPromoteTriggered())
11230         {
11231                 /*
11232                  * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
11233                  * signal handler. It now leaves the file in place and lets the
11234                  * Startup process do the unlink. This allows Startup to know whether
11235                  * it should create a full checkpoint before starting up (fallback
11236                  * mode). Fast promotion takes precedence.
11237                  */
11238                 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11239                 {
11240                         unlink(PROMOTE_SIGNAL_FILE);
11241                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11242                         fast_promote = true;
11243                 }
11244                 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11245                 {
11246                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11247                         fast_promote = false;
11248                 }
11249
11250                 ereport(LOG, (errmsg("received promote request")));
11251
11252                 ResetPromoteTriggered();
11253                 triggered = true;
11254                 return true;
11255         }
11256
11257         if (TriggerFile == NULL)
11258                 return false;
11259
11260         if (stat(TriggerFile, &stat_buf) == 0)
11261         {
11262                 ereport(LOG,
11263                                 (errmsg("trigger file found: %s", TriggerFile)));
11264                 unlink(TriggerFile);
11265                 triggered = true;
11266                 fast_promote = true;
11267                 return true;
11268         }
11269         return false;
11270 }
11271
11272 /*
11273  * Check to see if a promote request has arrived. Should be
11274  * called by postmaster after receiving SIGUSR1.
11275  */
11276 bool
11277 CheckPromoteSignal(void)
11278 {
11279         struct stat stat_buf;
11280
11281         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
11282                 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11283                 return true;
11284
11285         return false;
11286 }
11287
11288 /*
11289  * Wake up startup process to replay newly arrived WAL, or to notice that
11290  * failover has been requested.
11291  */
11292 void
11293 WakeupRecovery(void)
11294 {
11295         SetLatch(&XLogCtl->recoveryWakeupLatch);
11296 }
11297
11298 /*
11299  * Update the WalWriterSleeping flag.
11300  */
11301 void
11302 SetWalWriterSleeping(bool sleeping)
11303 {
11304         /* use volatile pointer to prevent code rearrangement */
11305         volatile XLogCtlData *xlogctl = XLogCtl;
11306
11307         SpinLockAcquire(&xlogctl->info_lck);
11308         xlogctl->WalWriterSleeping = sleeping;
11309         SpinLockRelease(&xlogctl->info_lck);
11310 }