granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlog.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <time.h>
  19 #include <fcntl.h>
  20 #include <sys/stat.h>
  21 #include <sys/time.h>
  22 #include <unistd.h>
  23
  24 #include "access/clog.h"
  25 #include "access/multixact.h"
  26 #include "access/subtrans.h"
  27 #include "access/timeline.h"
  28 #include "access/transam.h"
  29 #include "access/tuptoaster.h"
  30 #include "access/twophase.h"
  31 #include "access/xact.h"
  32 #include "access/xlog_internal.h"
  33 #include "access/xlogreader.h"
  34 #include "access/xlogutils.h"
  35 #include "catalog/catversion.h"
  36 #include "catalog/pg_control.h"
  37 #include "catalog/pg_database.h"
  38 #include "miscadmin.h"
  39 #include "pgstat.h"
  40 #include "postmaster/bgwriter.h"
  41 #include "postmaster/startup.h"
  42 #include "replication/walreceiver.h"
  43 #include "replication/walsender.h"
  44 #include "storage/barrier.h"
  45 #include "storage/bufmgr.h"
  46 #include "storage/fd.h"
  47 #include "storage/ipc.h"
  48 #include "storage/latch.h"
  49 #include "storage/pmsignal.h"
  50 #include "storage/predicate.h"
  51 #include "storage/proc.h"
  52 #include "storage/procarray.h"
  53 #include "storage/reinit.h"
  54 #include "storage/smgr.h"
  55 #include "storage/spin.h"
  56 #include "utils/builtins.h"
  57 #include "utils/guc.h"
  58 #include "utils/ps_status.h"
  59 #include "utils/relmapper.h"
  60 #include "utils/snapmgr.h"
  61 #include "utils/timestamp.h"
  62 #include "pg_trace.h"
  63
   64 extern uint32 bootstrap_data_checksum_version;  /* NOTE(review): extern declared locally; confirm where it is defined */
   65
   66 /* File path names (all relative to $PGDATA) */
   67 #define RECOVERY_COMMAND_FILE   "recovery.conf"
   68 #define RECOVERY_COMMAND_DONE   "recovery.done"
   69 #define PROMOTE_SIGNAL_FILE             "promote"
   70 #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
   71
   72
   73 /* User-settable parameters (the values assigned here are the C initializers) */
   74 int                     CheckPointSegments = 3;
   75 int                     wal_keep_segments = 0;
   76 int                     XLOGbuffers = -1;
   77 int                     XLogArchiveTimeout = 0;
   78 bool            XLogArchiveMode = false;
   79 char       *XLogArchiveCommand = NULL;
   80 bool            EnableHotStandby = false;
   81 bool            fullPageWrites = true;
   82 bool            log_checkpoints = false;
   83 int                     sync_method = DEFAULT_SYNC_METHOD;
   84 int                     wal_level = WAL_LEVEL_MINIMAL;
   85 int                     CommitDelay = 0;        /* precommit delay in microseconds */
   86 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
   87 int                     num_xloginsert_slots = 8;
   88
   89 #ifdef WAL_DEBUG
   90 bool            XLOG_DEBUG = false;     /* only exists when WAL_DEBUG is defined */
   91 #endif
  92
   93 /*
   94  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
   95  * When we are done with an old XLOG segment file, we will recycle it as a
   96  * future XLOG segment as long as there aren't already XLOGfileslop future
   97  * segments; else we'll delete it.  This could be made a separate GUC
   98  * variable, but at present I think it's sufficient to hardwire it as
   99  * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
  100  * no more than 2*CheckPointSegments log segments, and we want to recycle all
  101  * of them; the +1 allows boundary cases to happen without wasting a
  102  * delete/create-segment cycle.
       *
       * Because this is a macro over the CheckPointSegments variable, it is
       * re-evaluated at every use and so follows any change to that variable.
  103  */
  104 #define XLOGfileslop    (2*CheckPointSegments + 1)
 105
 106
  107 /*
  108  * GUC support
       *
       * sync_method_options pairs each accepted option name with its
       * SYNC_METHOD_* code.  Entries wrapped in #ifdef are present only when
       * the corresponding macro is defined at build time.
       * NOTE(review): the final {NULL, 0, false} entry looks like the
       * end-of-list marker -- confirm against the code that consumes
       * config_enum_entry arrays.
  109  */
  110 const struct config_enum_entry sync_method_options[] = {
  111         {"fsync", SYNC_METHOD_FSYNC, false},
  112 #ifdef HAVE_FSYNC_WRITETHROUGH
  113         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
  114 #endif
  115 #ifdef HAVE_FDATASYNC
  116         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
  117 #endif
  118 #ifdef OPEN_SYNC_FLAG
  119         {"open_sync", SYNC_METHOD_OPEN, false},
  120 #endif
  121 #ifdef OPEN_DATASYNC_FLAG
  122         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
  123 #endif
  124         {NULL, 0, false}
  125 };
 126
  127 /*
  128  * Statistics for current checkpoint are collected in this global struct.
  129  * Because only the checkpointer or a stand-alone backend can perform
  130  * checkpoints, this will be unused in normal backends.
  131  */
  132 CheckpointStatsData CheckpointStats;
  133
  134 /*
  135  * ThisTimeLineID will be same in all backends --- it identifies current
  136  * WAL timeline for the database system.
  137  */
  138 TimeLineID      ThisTimeLineID = 0;
  139
  140 /*
  141  * Are we doing recovery from XLOG?
  142  *
  143  * This is only ever true in the startup process; it should be read as meaning
  144  * "this process is replaying WAL records", rather than "the system is in
  145  * recovery mode".  It should be examined primarily by functions that need
  146  * to act differently when called from a WAL redo function (e.g., to skip WAL
  147  * logging).  To check whether the system is in recovery regardless of which
  148  * process you're running in, use RecoveryInProgress() but only after shared
  149  * memory startup and lock initialization.
  150  */
  151 bool            InRecovery = false;
  152
  153 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
  154 HotStandbyState standbyState = STANDBY_DISABLED;
  155
  156 static XLogRecPtr LastRec;
  157
  158 /* Local copy of WalRcv->receivedUpto */
  159 static XLogRecPtr receivedUpto = 0;
  160 static TimeLineID receiveTLI = 0;
 160 static TimeLineID receiveTLI = 0;
 161
  162 /*
  163  * During recovery, lastFullPageWrites keeps track of full_page_writes that
  164  * the replayed WAL records indicate. It's initialized with full_page_writes
  165  * that the recovery starting checkpoint record indicates, and then updated
  166  * each time an XLOG_FPW_CHANGE record is replayed.
  167  */
  168 static bool lastFullPageWrites;
  169
  170 /*
  171  * Local copy of SharedRecoveryInProgress variable. True actually means "not
  172  * known, need to check the shared state".
  173  */
  174 static bool LocalRecoveryInProgress = true;
  175
  176 /*
  177  * Local copy of SharedHotStandbyActive variable. False actually means "not
  178  * known, need to check the shared state".
  179  */
  180 static bool LocalHotStandbyActive = false;
  181
  182 /*
  183  * Local state for XLogInsertAllowed():
  184  *              1: unconditionally allowed to insert XLOG
  185  *              0: unconditionally not allowed to insert XLOG
  186  *              -1: must check RecoveryInProgress(); disallow until it is false
  187  * Most processes start with -1 and transition to 1 after seeing that recovery
  188  * is not in progress.  But we can also force the value for special cases.
  189  * The coding in XLogInsertAllowed() depends on the first two of these states
  190  * being numerically the same as bool true and false.
  191  */
  192 static int      LocalXLogInsertAllowed = -1;
  193
  194 /*
  195  * When ArchiveRecoveryRequested is set, archive recovery was requested,
  196  * i.e. recovery.conf file was present. When InArchiveRecovery is set, we are
  197  * currently recovering using offline XLOG archives. These variables are only
  198  * valid in the startup process.
  199  *
  200  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
  201  * currently performing crash recovery using only XLOG files in pg_xlog, but
  202  * will switch to using offline XLOG archives as soon as we reach the end of
  203  * WAL in pg_xlog.
  204  */
  205 bool            ArchiveRecoveryRequested = false;
  206 bool            InArchiveRecovery = false;
  207
  208 /* Was the last xlog file restored from archive, or local? */
  209 static bool restoredFromArchive = false;
  210
  211 /* options taken from recovery.conf for archive recovery */
  212 char       *recoveryRestoreCommand = NULL;
  213 static char *recoveryEndCommand = NULL;
  214 static char *archiveCleanupCommand = NULL;
  215 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
  216 static bool recoveryTargetInclusive = true;
  217 static bool recoveryPauseAtTarget = true;
  218 static TransactionId recoveryTargetXid;
  219 static TimestampTz recoveryTargetTime;
  220 static char *recoveryTargetName;
  221
  222 /* options taken from recovery.conf for XLOG streaming */
  223 static bool StandbyModeRequested = false;
  224 static char *PrimaryConnInfo = NULL;
  225 static char *TriggerFile = NULL;
  226
  227 /* are we currently in standby mode? */
  228 bool            StandbyMode = false;
  229
  230 /* whether request for fast promotion has been made yet */
  231 static bool fast_promote = false;
  232
  233 /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
  234 static TransactionId recoveryStopXid;
  235 static TimestampTz recoveryStopTime;
  236 static char recoveryStopName[MAXFNAMELEN];
  237 static bool recoveryStopAfter;
 238
  239 /*
  240  * During normal operation, the only timeline we care about is ThisTimeLineID.
  241  * During recovery, however, things are more complicated.  To simplify life
  242  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
  243  * scan through the WAL history (that is, it is the line that was active when
  244  * the currently-scanned WAL record was generated).  We also need these
  245  * timeline values:
  246  *
  247  * recoveryTargetTLI: the desired timeline that we want to end in.
  248  *
  249  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
  250  *
  251  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
  252  * the timelines of its known parents, newest first (so recoveryTargetTLI is
  253  * always the first list member).  Only these TLIs are expected to be seen in
  254  * the WAL segments we read, and indeed only these TLIs will be considered as
  255  * candidate WAL files to open at all.
  256  *
  257  * curFileTLI: the TLI appearing in the name of the current input WAL file.
  258  * (This is not necessarily the same as ThisTimeLineID, because we could
  259  * be scanning data that was copied from an ancestor timeline when the current
  260  * file was created.)  During a sequential scan we do not allow this value
  261  * to decrease.
  262  */
  263 static TimeLineID recoveryTargetTLI;
  264 static bool recoveryTargetIsLatest = false;
  265 static List *expectedTLEs;
  266 static TimeLineID curFileTLI;
  267
  268 /*
  269  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
  270  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
  271  * end+1 of the last record, and is reset when we end a top-level transaction,
  272  * or start a new one; so it can be used to tell if the current transaction has
  273  * created any XLOG records.
  274  */
  275 static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
  276
  277 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
  278
  279 /*
  280  * RedoRecPtr is this backend's local copy of the REDO record pointer
  281  * (which is almost but not quite the same as a pointer to the most recent
  282  * CHECKPOINT record).  We update this from the shared-memory copy,
  283  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
  284  * hold an insertion slot).  See XLogInsert for details.  We are also allowed
  285  * to update from XLogCtl->RedoRecPtr if we hold the info_lck;
  286  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
  287  * InitXLOGAccess.
  288  */
  289 static XLogRecPtr RedoRecPtr;
  290
  291 /*
  292  * RedoStartLSN points to the checkpoint's REDO location which is specified
  293  * in a backup label file, backup history file or control file. In standby
  294  * mode, XLOG streaming usually starts from the position where an invalid
  295  * record was found. But if we fail to read even the initial checkpoint
  296  * record, we use the REDO location instead of the checkpoint location as
  297  * the start position of XLOG streaming. Otherwise we would have to jump
  298  * backwards to the REDO location after reading the checkpoint record,
  299  * because the REDO record can precede the checkpoint record.
  300  */
  301 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
 302
 303 /*----------
 304  * Shared-memory data structures for XLOG control
 305  *
 306  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 307  * the log up to (all records before that point must be written or fsynced).
 308  * LogwrtResult indicates the byte positions we have already written/fsynced.
 309  * These structs are identical but are declared separately to indicate their
 310  * slightly different functions.
 311  *
 312  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 313  * WALWriteLock.  To update it, you need to hold both locks.  The point of
 314  * this arrangement is that the value can be examined by code that already
 315  * holds WALWriteLock without needing to grab info_lck as well.  In addition
 316  * to the shared variable, each backend has a private copy of LogwrtResult,
 317  * which is updated when convenient.
 318  *
 319  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 320  * (protected by info_lck), but we don't need to cache any copies of it.
 321  *
 322  * info_lck is only held long enough to read/update the protected variables,
 323  * so it's a plain spinlock.  The other locks are held longer (potentially
 324  * over I/O operations), so we use LWLocks for them.  These locks are:
 325  *
 326  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
 327  * It is only held while initializing and changing the mapping.  If the
 328  * contents of the buffer being replaced haven't been written yet, the mapping
 329  * lock is released while the write is done, and reacquired afterwards.
 330  *
 331  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 332  * XLogFlush).
 333  *
 334  * ControlFileLock: must be held to read/update control file or create
 335  * new log file.
 336  *
 337  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 338  * only one checkpointer at a time; currently, with all checkpoints done by
 339  * the checkpointer, this is just pro forma).
 340  *
 341  *----------
 342  */
 343
  344 typedef struct XLogwrtRqst
  345 {
              /* Requested targets; both are exclusive end positions (last byte + 1). */
  346         XLogRecPtr      Write;                  /* last byte + 1 to write out */
  347         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
  348 } XLogwrtRqst;
 349
  350 typedef struct XLogwrtResult
  351 {
              /* Completed positions; same layout as XLogwrtRqst, kept as a distinct type. */
  352         XLogRecPtr      Write;                  /* last byte + 1 written out */
  353         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
  354 } XLogwrtResult;
 355
 356
 357 /*
 358  * A slot for inserting to the WAL. This is similar to an LWLock, the main
 359  * difference is that there is an extra xlogInsertingAt field that is protected
 360  * by the same mutex. Unlike an LWLock, a slot can only be acquired in
 361  * exclusive mode.
 362  *
 363  * The xlogInsertingAt field is used to advertise to other processes how far
 364  * the slot owner has progressed in inserting the record. When a backend
 365  * acquires a slot, it initializes xlogInsertingAt to 1, because it doesn't
 366  * yet know where it's going to insert the record. That's conservative
 367  * but correct; the new insertion is certainly going to go to a byte position
 368  * greater than 1. If another backend needs to flush the WAL, it will have to
 369  * wait for the new insertion. xlogInsertingAt is updated after finishing the
 370  * insert or when crossing a page boundary, which will wake up anyone waiting
 371  * for it, whether the wait was necessary in the first place or not.
 372  *
 373  * A process can wait on a slot in two modes: LW_EXCLUSIVE or
 374  * LW_WAIT_UNTIL_FREE. LW_EXCLUSIVE works like in an lwlock; when the slot is
 375  * released, the first LW_EXCLUSIVE waiter in the queue is woken up. Processes
 376  * waiting in LW_WAIT_UNTIL_FREE mode are woken up whenever the slot is
 377  * released, or xlogInsertingAt is updated. In other words, a process in
 378  * LW_WAIT_UNTIL_FREE mode is woken up whenever the inserter makes any progress
 379  * copying the record in place. LW_WAIT_UNTIL_FREE waiters are always added to
 380  * the front of the queue, while LW_EXCLUSIVE waiters are appended to the end.
 381  *
 382  * To join the wait queue, a process must set MyProc->lwWaitMode to the mode
 383  * it wants to wait in, MyProc->lwWaiting to true, and link MyProc to the head
 384  * or tail of the wait queue. The same mechanism is used to wait on an LWLock,
 385  * see lwlock.c for details.
 386  */
  387 typedef struct
  388 {
  389         slock_t         mutex;                  /* protects the below fields */
  390         XLogRecPtr      xlogInsertingAt; /* insert has completed up to this point; 1 right after the slot is acquired */
  391
  392         PGPROC     *owner;                      /* for debugging purposes */
  393
  394         bool            releaseOK;              /* T if ok to release waiters */
  395         char            exclusive;              /* # of exclusive holders (0 or 1); slots have no shared mode */
  396         PGPROC     *head;                       /* head of list of waiting PGPROCs (LW_WAIT_UNTIL_FREE waiters go here) */
  397         PGPROC     *tail;                       /* tail of list of waiting PGPROCs (LW_EXCLUSIVE waiters go here) */
  398         /* tail is undefined when head is NULL */
  399 } XLogInsertSlot;
 400
 401 /*
 402  * All the slots are allocated as an array in shared memory. We force the
 403  * array stride to be a power of 2, which saves a few cycles in indexing, but
 404  * more importantly also ensures that individual slots don't cross cache line
 405  * boundaries.  (Of course, we have to also ensure that the array start
 406  * address is suitably aligned.)
 407  */
  408 typedef union XLogInsertSlotPadded
  409 {
  410         XLogInsertSlot slot;                    /* the slot itself */
  411         char            pad[CACHE_LINE_SIZE];   /* makes the union at least CACHE_LINE_SIZE bytes */
  412 } XLogInsertSlotPadded;
 413
 414 /*
 415  * Shared state data for XLogInsert.
 416  */
  417 typedef struct XLogCtlInsert
  418 {
  419         slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
  420
  421         /*
  422          * CurrBytePos is the end of reserved WAL. The next record will be inserted
  423          * at that position. PrevBytePos is the start position of the previously
  424          * inserted (or rather, reserved) record - it is copied to the prev-
  425          * link of the next record. These are stored as "usable byte positions"
  426          * rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
  427          */
  428         uint64          CurrBytePos;
  429         uint64          PrevBytePos;
  430
  431         /*
  432          * Make sure the above heavily-contended spinlock and byte positions are
  433          * on their own cache line. In particular, the RedoRecPtr and full page
  434          * write variables below should be on a different cache line. They are
  435          * read on every WAL insertion, but updated rarely, and we don't want
  436          * those reads to steal the cache line containing Curr/PrevBytePos.
  437          */
  438         char            pad[CACHE_LINE_SIZE];
  439
  440         /*
  441          * fullPageWrites is the master copy used by all backends to determine
  442          * whether to write full-page to WAL, instead of using process-local one.
  443          * This is required because, when full_page_writes is changed by SIGHUP,
  444          * we must WAL-log it before it actually affects WAL-logging by backends.
  445          * Checkpointer sets at startup or after SIGHUP.
  446          *
  447          * To read these fields, you must hold an insertion slot. To modify them,
  448          * you must hold ALL the slots.
  449          */
  450         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
  451         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
  452         bool            fullPageWrites;
  453
  454         /*
  455          * exclusiveBackup is true if a backup started with pg_start_backup() is
  456          * in progress, and nonExclusiveBackups is a counter indicating the number
  457          * of streaming base backups currently in progress. forcePageWrites is set
  458          * to true when either of these is non-zero. lastBackupStart is the latest
  459          * checkpoint redo location used as a starting point for an online backup.
  460          */
  461         bool            exclusiveBackup;
  462         int                     nonExclusiveBackups;
  463         XLogRecPtr      lastBackupStart;
  464
  465         /* insertion slots, see XLogInsertSlot struct above for details */
  466         XLogInsertSlotPadded *insertSlots;
  467 } XLogCtlInsert;
 468
 469 /*
 470  * Total shared-memory state for XLOG.
 471  */
  472 typedef struct XLogCtlData
  473 {
  474         XLogCtlInsert Insert;
  475
  476         /* Protected by info_lck: */
  477         XLogwrtRqst LogwrtRqst;
  478         XLogRecPtr      RedoRecPtr;             /* a recent copy of Insert->RedoRecPtr */
  479         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
  480         TransactionId ckptXid;
  481         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
  482         XLogSegNo       lastRemovedSegNo;               /* latest removed/recycled XLOG
  483                                                                                  * segment */
  484
  485         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
  486         XLogRecPtr      unloggedLSN;
  487         slock_t         ulsn_lck;
  488
  489         /* Time of last xlog segment switch. Protected by WALWriteLock. */
  490         pg_time_t       lastSegSwitchTime;
  491
  492         /*
  493          * Protected by info_lck and WALWriteLock (you must hold either lock to
  494          * read it, but both to update)
  495          */
  496         XLogwrtResult LogwrtResult;
  497
  498         /*
  499          * Latest initialized page in the cache (last byte position + 1).
  500          *
  501          * To change the identity of a buffer (and InitializedUpTo), you need to
  502          * hold WALBufMappingLock.  To change the identity of a buffer that's still
  503          * dirty, the old page needs to be written out first, and for that you
  504          * need WALWriteLock, and you need to ensure that there are no in-progress
  505          * insertions to the page by calling WaitXLogInsertionsToFinish().
  506          */
  507         XLogRecPtr      InitializedUpTo;
  508
  509         /*
  510          * These values do not change after startup, although the pointed-to pages
  511          * and xlblocks values certainly do.  xlblock values are protected by
  512          * WALBufMappingLock.
  513          */
  514         char       *pages;                      /* buffers for unwritten XLOG pages */
  515         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
  516         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
  517
  518         /*
  519          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
  520          * If we created a new timeline when the system was started up,
  521          * PrevTimeLineID is the old timeline's ID that we forked off from.
  522          * Otherwise it's equal to ThisTimeLineID.
  523          */
  524         TimeLineID      ThisTimeLineID;
  525         TimeLineID      PrevTimeLineID;
  526
  527         /*
  528          * archiveCleanupCommand is read from recovery.conf but needs to be in
  529          * shared memory so that the checkpointer process can access it.
  530          */
  531         char            archiveCleanupCommand[MAXPGPATH];
  532
  533         /*
  534          * SharedRecoveryInProgress indicates if we're still in crash or archive
  535          * recovery.  Protected by info_lck.
  536          */
  537         bool            SharedRecoveryInProgress;
  538
  539         /*
  540          * SharedHotStandbyActive indicates if hot standby is currently active
  541          * (not merely whether recovery is in progress).  Protected by info_lck.
  542          */
  543         bool            SharedHotStandbyActive;
  544
  545         /*
  546          * WalWriterSleeping indicates whether the WAL writer is currently in
  547          * low-power mode (and hence should be nudged if an async commit occurs).
  548          * Protected by info_lck.
  549          */
  550         bool            WalWriterSleeping;
  551
  552         /*
  553          * recoveryWakeupLatch is used to wake up the startup process to continue
  554          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
  555          * to appear.
  556          */
  557         Latch           recoveryWakeupLatch;
  558
  559         /*
  560          * During recovery, we keep a copy of the latest checkpoint record here.
  561          * Used by the checkpointer when it wants to create a restartpoint.
  562          *
  563          * Protected by info_lck.
  564          */
  565         XLogRecPtr      lastCheckPointRecPtr;
  566         CheckPoint      lastCheckPoint;
  567
  568         /*
  569          * lastReplayedEndRecPtr points to end+1 of the last record successfully
  570          * replayed. When we're currently replaying a record, ie. in a redo
  571          * function, replayEndRecPtr points to the end+1 of the record being
  572          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
  573          */
  574         XLogRecPtr      lastReplayedEndRecPtr;
  575         TimeLineID      lastReplayedTLI;
  576         XLogRecPtr      replayEndRecPtr;
  577         TimeLineID      replayEndTLI;
  578         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
  579         TimestampTz recoveryLastXTime;
  580         /* current effective recovery target timeline */
  581         TimeLineID      RecoveryTargetTLI;
  582
  583         /*
  584          * timestamp of when we started replaying the current chunk of WAL data,
  585          * only relevant for replication or archive recovery
  586          */
  587         TimestampTz currentChunkStartTime;
  588         /* Are we requested to pause recovery? */
  589         bool            recoveryPause;
  590
  591         /*
  592          * lastFpwDisableRecPtr points to the start of the last replayed
  593          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
  594          */
  595         XLogRecPtr      lastFpwDisableRecPtr;
  596
  597         slock_t         info_lck;               /* locks shared variables shown above */
  598 } XLogCtlData;
 599
  600 static XLogCtlData *XLogCtl = NULL;     /* -> shared-memory XLOG state (XLogCtlData); starts out NULL */
  601
  602 /*
  603  * We maintain an image of pg_control in shared memory.
       * ControlFile points at that image; it starts out NULL.
  604  */
  605 static ControlFileData *ControlFile = NULL;
 606
  607 /*
  608  * Calculate the amount of space left on the page after 'endptr'. Beware
  609  * multiple evaluation!
       *
       * A pointer that sits exactly on a page boundary yields 0, not XLOG_BLCKSZ.
  610  */
  611 #define INSERT_FREESPACE(endptr)        \
  612         (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
 613
  614 /* Advance to the next buffer index, wrapping to 0 after XLogCacheBlck; evaluates 'idx' twice. */
  615 #define NextBufIdx(idx)         \
  616                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 617
  618 /*
  619  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
  620  * would hold if it was in cache, the page containing 'recptr'.
       *
       * The index is the page number of 'recptr' (recptr / XLOG_BLCKSZ) modulo
       * the number of buffers (XLogCacheBlck + 1).
  621  */
  622 #define XLogRecPtrToBufIdx(recptr)      \
  623         (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
 624
  625 /*
  626  * These are the number of bytes in a WAL page and segment usable for WAL data.
       *
       * A page loses one short page header.  A segment is charged one short
       * header per page plus, once per segment, the size difference between the
       * long and the short header (presumably for the segment's first page --
       * NOTE(review): confirm which page carries the long header).
  627  */
  628 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
  629 #define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
 630
  631 /*
  632  * Private, possibly out-of-date copy of shared LogwrtResult.
  633  * See the LogwrtRqst/LogwrtResult locking discussion in the header comment
       * of the shared-memory data structures above.  Both positions start at 0.
  634  */
  635 static XLogwrtResult LogwrtResult = {0, 0};
 636
  637 /*
  638  * Codes indicating where we got a WAL file from during recovery, or where
  639  * to attempt to get one.
       *
       * The values are used as indexes into xlogSourceNames[] below, so the two
       * lists must be kept in the same order.
  640  */
  641 typedef enum
  642 {
  643         XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
  644         XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
  645         XLOG_FROM_PG_XLOG,                      /* existing file in pg_xlog */
  646         XLOG_FROM_STREAM,                       /* streamed from master */
  647 } XLogSource;
  648
  649 /* human-readable names for XLogSources, for debugging output; one per enum value, in enum order */
  650 static const char *xlogSourceNames[] = {"any", "archive", "pg_xlog", "stream"};
 651
  652 /*
  653  * openLogFile is -1 or a kernel FD for an open log file segment.
  654  * When it's open, openLogOff is the current seek offset in the file.
  655  * openLogSegNo identifies the segment.  These variables are only
  656  * used to write the XLOG, and so will normally refer to the active segment.
  657  */
  658 static int      openLogFile = -1;
  659 static XLogSegNo openLogSegNo = 0;
  660 static uint32 openLogOff = 0;
  661
  662 /*
  663  * These variables are used similarly to the ones above, but for reading
  664  * the XLOG.  Note, however, that readOff generally represents the offset
  665  * of the page just read, not the seek position of the FD itself, which
  666  * will be just past that page. readLen indicates how much of the current
  667  * page has been read into readBuf, and readSource indicates where we got
  668  * the currently open file from.
  669  */
  670 static int      readFile = -1;
  671 static XLogSegNo readSegNo = 0;
  672 static uint32 readOff = 0;
  673 static uint32 readLen = 0;
  674 static XLogSource readSource = 0;               /* XLOG_FROM_* code; 0 is XLOG_FROM_ANY */
  675
  676 /*
  677  * Keeps track of which source we're currently reading from. This is
  678  * different from readSource in that this is always set, even when we don't
  679  * currently have a WAL file open. If lastSourceFailed is set, our last
  680  * attempt to read from currentSource failed, and we should try another source
  681  * next.
  682  */
  683 static XLogSource currentSource = 0;    /* XLOG_FROM_* code; 0 is XLOG_FROM_ANY */
  684 static bool lastSourceFailed = false;
 685
  686 typedef struct XLogPageReadPrivate
  687 {
  688         int                     emode;          /* NOTE(review): presumably an elog level for read failures -- confirm */
  689         bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
  690         bool            randAccess;     /* NOTE(review): undocumented; confirm meaning at use sites */
  691 } XLogPageReadPrivate;
 692
  693 /*
  694  * These variables track when we last obtained some WAL data to process,
  695  * and where we got it from.  (XLogReceiptSource is initially the same as
  696  * readSource, but readSource gets reset to zero (XLOG_FROM_ANY) when we
  697  * don't have data to process right now.  It is also different from
  698  * currentSource, which also changes when we try to read from a source and
  699  * fail, while XLogReceiptSource tracks where we last successfully read WAL.)
  700  */
  701 static TimestampTz XLogReceiptTime = 0;
  702 static XLogSource XLogReceiptSource = 0;                /* XLOG_FROM_* code */
  703
  704 /* State information for XLOG reading */
  705 static XLogRecPtr ReadRecPtr;   /* start of last record read */
  706 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
  707
  708 static XLogRecPtr minRecoveryPoint;             /* local copy of
  709                                                                                  * ControlFile->minRecoveryPoint */
  710 static TimeLineID minRecoveryPointTLI;
  711 static bool updateMinRecoveryPoint = true;
  712
  713 /*
  714  * Have we reached a consistent database state? In crash recovery, we have
  715  * to replay all the WAL, so reachedConsistency is never set. During archive
  716  * recovery, the database is consistent once minRecoveryPoint is reached.
  717  */
  718 bool            reachedConsistency = false;
  719
  720 static bool InRedo = false;
  721
  722 /* Have we launched bgwriter during recovery? */
  723 static bool bgwriterLaunched = false;
  724
  725 /* For WALInsertSlotAcquire/Release functions */
  726 static int      MySlotNo = 0;
  727 static bool holdingAllSlots = false;
 728
     /*
      * Forward declarations of the file-local (static) functions, so that they
      * can be called before their definitions appear later in the file.
      */
 729 static void readRecoveryCommandFile(void);
 730 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
 731 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 732 static void recoveryPausesHere(void);
 733 static void SetLatestXTime(TimestampTz xtime);
 734 static void SetCurrentChunkStartTime(TimestampTz xtime);
 735 static void CheckRequiredParameterValues(void);
 736 static void XLogReportParameters(void);
 737 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
 738                                         TimeLineID prevTLI);
 739 static void LocalSetXLogInsertAllowed(void);
 740 static void CreateEndOfRecoveryRecord(void);
 741 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 742 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 743
     /* XLogCheckBuffer() is called from XLogInsert() to decide on page backups */
 744 static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
 745                                 XLogRecPtr *lsn, BkpBlock *bkpb);
 746 static Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb,
 747                                                  char *blk, bool get_cleanup_lock, bool keep_buffer);
 748 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
 749 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
 750 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
 751 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 752                                            bool find_free, int *max_advance,
 753                                            bool use_lock);
 754 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 755                          int source, bool notexistOk);
 756 static int      XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
 757 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 758                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
 759                          TimeLineID *readTLI);
 760 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 761                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr);
 762 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 763 static void XLogFileClose(void);
 764 static void PreallocXlogFiles(XLogRecPtr endptr);
 765 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
 766 static void UpdateLastRemovedPtr(char *filename);
 767 static void ValidateXLOGDirectoryStructure(void);
 768 static void CleanupBackupHistory(void);
 769 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 770 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 771                    int emode, bool fetching_ckpt);
 772 static void CheckRecoveryConsistency(void);
     /*
      * NOTE(review): the parameter name "whichChkpti" below looks like a typo
      * for "whichChkpt".  It is harmless in a prototype, but worth checking
      * against the function definition.
      */
 773 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
 774                                          XLogRecPtr RecPtr, int whichChkpti, bool report);
 775 static bool rescanLatestTimeLine(void);
 776 static void WriteControlFile(void);
 777 static void ReadControlFile(void);
 778 static char *str_time(pg_time_t tnow);
 779 static bool CheckForStandbyTrigger(void);
 780
     /* xlog_outrec() exists only in WAL_DEBUG builds */
 781 #ifdef WAL_DEBUG
 782 static void xlog_outrec(StringInfo buf, XLogRecord *record);
 783 #endif
 784 static void pg_start_backup_callback(int code, Datum arg);
 785 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 786                                   bool *backupEndRequired, bool *backupFromStandby);
 787 static void rm_redo_error_callback(void *arg);
 788 static int      get_sync_bit(int method);
 789
     /*
      * Helpers on the WAL insertion path.  XLogInsert() calls
      * ReserveXLogInsertLocation() or ReserveXLogSwitch() to reserve space and
      * CopyXLogRecordToWAL() to copy the record in; the XLogBytePos* /
      * XLogRecPtrToBytePos() functions convert between "usable byte" positions
      * and XLogRecPtrs for the Reserve* functions.
      */
 790 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
 791                                   XLogRecData *rdata,
 792                                   XLogRecPtr StartPos, XLogRecPtr EndPos);
 793 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
 794                                                   XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
 795 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
 796                                   XLogRecPtr *PrevPtr);
 797 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
 798 static void WakeupWaiters(XLogRecPtr EndPos);
 799 static char *GetXLogBuffer(XLogRecPtr ptr);
 800 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
 801 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
 802 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
 803
     /*
      * Insertion-slot primitives.  XLogInsert() uses WALInsertSlotAcquire() and
      * WALInsertSlotRelease(); the "One" variants take an explicit slot number.
      */
 804 static void WALInsertSlotAcquire(bool exclusive);
 805 static void WALInsertSlotAcquireOne(int slotno);
 806 static void WALInsertSlotRelease(void);
 807 static void WALInsertSlotReleaseOne(int slotno);
 808
 809 /*
 810  * Insert an XLOG record having the specified RMID and info bytes,
 811  * with the body of the record being the data chunk(s) described by
 812  * the rdata chain (see xlog.h for notes about rdata).
 813  *
 814  * Returns XLOG pointer to end of record (beginning of next record).
 815  * This can be used as LSN for data pages affected by the logged action.
 816  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 817  * before the data page can be written out.  This implements the basic
 818  * WAL rule "write the log before the data".)
 819  *
 820  * NB: this routine feels free to scribble on the XLogRecData structs,
 821  * though not on the data they reference.  This is OK since the XLogRecData
 822  * structs are always just temporaries in the calling code.
      *
      * Parameters:
      *   rmid  - resource manager ID; stored in the record header's xl_rmid.
      *   info  - info byte; must not have any XLR_INFO_MASK bits set (PANIC
      *           otherwise).  An XLR_BKP_BLOCK(i) bit is OR'ed in for each
      *           buffer that gets backed up.
      *   rdata - head of the chain of data chunks.  Entries whose 'buffer' is
      *           not InvalidBuffer are candidates for a full-page backup; at
      *           most XLR_MAX_BKP_BLOCKS distinct buffers may appear (PANIC
      *           otherwise).
      *
      * Special cases visible in the code below:
      *   - In bootstrap mode, for any rmid other than RM_XLOG_ID, nothing is
      *     logged and SizeOfXLogLongPHD is returned as a phony pointer.
      *   - For an XLOG_SWITCH record that was actually inserted, the return
      *     value points just past the switch record itself, not to the end of
      *     the segment that was reserved for it.
      *
      * Errors: elog(ERROR) if the header buffer cannot be allocated or if
      * XLogInsertAllowed() is false; elog(PANIC) for reserved info bits, too
      * many buffers, or a zero-length record that is not an xlog switch.
      *
      * Side effects: sets ProcLastRecPtr and XactLastRecEnd, and advances the
      * shared LogwrtRqst.Write if the record crossed a page boundary.
 823  */
 824 XLogRecPtr
 825 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 826 {
 827         XLogCtlInsert *Insert = &XLogCtl->Insert;
 828         XLogRecData *rdt;
 829         XLogRecData *rdt_lastnormal;
             /* Per-buffer bookkeeping, indexed by order of first appearance */
 830         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
 831         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 832         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 833         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
             /* Extra chain entries for backup blocks: header, page, page-after-hole */
 834         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
 835         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
 836         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
 837         XLogRecData hdr_rdt;
 838         pg_crc32        rdata_crc;
 839         uint32          len,
 840                                 write_len;
 841         unsigned        i;
 842         bool            doPageWrites;
 843         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 844         bool            inserted;
             /* caller's info, restored whenever we loop back to "begin" */
 845         uint8           info_orig = info;
             /* record header buffer: malloc'd on first call, reused afterwards */
 846         static XLogRecord *rechdr;
 847         XLogRecPtr      StartPos;
 848         XLogRecPtr      EndPos;
 849
 850         if (rechdr == NULL)
 851         {
 852                 rechdr = malloc(SizeOfXLogRecord);
 853                 if (rechdr == NULL)
 854                         elog(ERROR, "out of memory");
 855                 MemSet(rechdr, 0, SizeOfXLogRecord);
 856         }
 857
 858         /* cross-check on whether we should be here or not */
 859         if (!XLogInsertAllowed())
 860                 elog(ERROR, "cannot make new WAL entries during recovery");
 861
 862         /* info's high bits are reserved for use by me */
 863         if (info & XLR_INFO_MASK)
 864                 elog(PANIC, "invalid xlog info mask %02X", info);
 865
 866         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
 867
 868         /*
 869          * In bootstrap mode, we don't actually log anything but XLOG resources;
 870          * return a phony record pointer.
 871          */
 872         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 873         {
 874                 EndPos = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 875                 return EndPos;
 876         }
 877
 878         /*
 879          * Here we scan the rdata chain, to determine which buffers must be backed
 880          * up.
 881          *
 882          * We may have to loop back to here if a race condition is detected below.
 883          * We could prevent the race by doing all this work while holding an
 884          * insertion slot, but it seems better to avoid doing CRC calculations
 885          * while holding one.
 886          *
 887          * We add entries for backup blocks to the chain, so that they don't need
 888          * any special treatment in the critical section where the chunks are
 889          * copied into the WAL buffers. Those entries have to be unlinked from the
 890          * chain if we have to loop back here.
 891          */
 892 begin:;
             /* (Re)initialize the per-buffer tracking state for this attempt */
 893         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 894         {
 895                 dtbuf[i] = InvalidBuffer;
 896                 dtbuf_bkp[i] = false;
 897         }
 898
 899         /*
 900          * Decide if we need to do full-page writes in this XLOG record: true if
 901          * full_page_writes is on or we have a PITR request for it.  Since we
 902          * don't yet have an insertion slot, fullPageWrites and forcePageWrites
 903          * could change under us, but we'll recheck them once we have a slot.
 904          */
 905         doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
 906
             /*
              * Walk the caller's chain, summing the length of the data that will
              * be logged as-is.  The loop exits with rdt pointing at the last
              * entry of the chain, which is what rdt_lastnormal captures below.
              */
 907         len = 0;
 908         for (rdt = rdata;;)
 909         {
 910                 if (rdt->buffer == InvalidBuffer)
 911                 {
 912                         /* Simple data, just include it */
 913                         len += rdt->len;
 914                 }
 915                 else
 916                 {
 917                         /* Find info for buffer */
 918                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 919                         {
 920                                 if (rdt->buffer == dtbuf[i])
 921                                 {
 922                                         /* Buffer already referenced by earlier chain item */
 923                                         if (dtbuf_bkp[i])
 924                                         {
                                                     /* page is backed up in full; drop this chunk */
 925                                                 rdt->data = NULL;
 926                                                 rdt->len = 0;
 927                                         }
 928                                         else if (rdt->data)
 929                                                 len += rdt->len;
 930                                         break;
 931                                 }
 932                                 if (dtbuf[i] == InvalidBuffer)
 933                                 {
 934                                         /* OK, put it in this slot */
 935                                         dtbuf[i] = rdt->buffer;
                                             /* "true" is XLogCheckBuffer's holdsExclusiveLock argument */
 936                                         if (doPageWrites && XLogCheckBuffer(rdt, true,
 937                                                                                    &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 938                                         {
 939                                                 dtbuf_bkp[i] = true;
 940                                                 rdt->data = NULL;
 941                                                 rdt->len = 0;
 942                                         }
 943                                         else if (rdt->data)
 944                                                 len += rdt->len;
 945                                         break;
 946                                 }
 947                         }
                             /* fell off the end: every tracking slot holds some other buffer */
 948                         if (i >= XLR_MAX_BKP_BLOCKS)
 949                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 950                                          XLR_MAX_BKP_BLOCKS);
 951                 }
 952                 /* Break out of loop when rdt points to last chain item */
 953                 if (rdt->next == NULL)
 954                         break;
 955                 rdt = rdt->next;
 956         }
 957
 958         /*
 959          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 960          * error checking in ReadRecord.  This means that all callers of
 961          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 962          * make an exception for XLOG SWITCH records because we don't want them to
 963          * ever cross a segment boundary.
 964          */
 965         if (len == 0 && !isLogSwitch)
 966                 elog(PANIC, "invalid xlog record length %u", len);
 967
 968         /*
 969          * Make additional rdata chain entries for the backup blocks, so that we
 970          * don't need to special-case them in the write loop.  This modifies the
 971          * original rdata chain, but we keep a pointer to the last regular entry,
 972          * rdt_lastnormal, so that we can undo this if we have to loop back to the
 973          * beginning.
 974          *
 975          * At the exit of this loop, write_len includes the backup block data.
 976          *
 977          * Also set the appropriate info bits to show which buffers were backed
 978          * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer
 979          * value (ignoring InvalidBuffer) appearing in the rdata chain.
 980          */
 981         rdt_lastnormal = rdt;
 982         write_len = len;
 983         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 984         {
 985                 BkpBlock   *bkpb;
 986                 char       *page;
 987
 988                 if (!dtbuf_bkp[i])
 989                         continue;
 990
 991                 info |= XLR_BKP_BLOCK(i);
 992
 993                 bkpb = &(dtbuf_xlg[i]);
 994                 page = (char *) BufferGetBlock(dtbuf[i]);
 995
                     /* first extra entry: the BkpBlock header itself */
 996                 rdt->next = &(dtbuf_rdt1[i]);
 997                 rdt = rdt->next;
 998
 999                 rdt->data = (char *) bkpb;
1000                 rdt->len = sizeof(BkpBlock);
1001                 write_len += sizeof(BkpBlock);
1002
                     /* second extra entry: the page, or the part of it before the hole */
1003                 rdt->next = &(dtbuf_rdt2[i]);
1004                 rdt = rdt->next;
1005
1006                 if (bkpb->hole_length == 0)
1007                 {
1008                         rdt->data = page;
1009                         rdt->len = BLCKSZ;
1010                         write_len += BLCKSZ;
1011                         rdt->next = NULL;
1012                 }
1013                 else
1014                 {
1015                         /* must skip the hole */
1016                         rdt->data = page;
1017                         rdt->len = bkpb->hole_offset;
1018                         write_len += bkpb->hole_offset;
1019
                             /* third extra entry: the part of the page after the hole */
1020                         rdt->next = &(dtbuf_rdt3[i]);
1021                         rdt = rdt->next;
1022
1023                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
1024                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
1025                         write_len += rdt->len;
1026                         rdt->next = NULL;
1027                 }
1028         }
1029
1030         /*
1031          * Calculate CRC of the data, including all the backup blocks
1032          *
1033          * Note that the record header isn't added into the CRC initially since we
1034          * don't know the prev-link yet.  Thus, the CRC will represent the CRC of
1035          * the whole record in the order: rdata, then backup blocks, then record
1036          * header.
1037          */
1038         INIT_CRC32(rdata_crc);
1039         for (rdt = rdata; rdt != NULL; rdt = rdt->next)
1040                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
1041
1042         /*
1043          * Construct record header (prev-link is filled in later, after reserving
1044          * the space for the record), and make that the first chunk in the chain.
1045          *
1046          * The CRC calculated for the header here doesn't include prev-link,
1047          * because we don't know it yet. It will be added later.
1048          */
1049         rechdr->xl_xid = GetCurrentTransactionIdIfAny();
1050         rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
1051         rechdr->xl_len = len;           /* doesn't include backup blocks */
1052         rechdr->xl_info = info;
1053         rechdr->xl_rmid = rmid;
1054         rechdr->xl_prev = InvalidXLogRecPtr;
             /* CRC the header fields that precede xl_prev */
1055         COMP_CRC32(rdata_crc, ((char *) rechdr), offsetof(XLogRecord, xl_prev));
1056
             /*
              * hdr_rdt becomes the head of the chain handed to CopyXLogRecordToWAL;
              * from here on write_len is the full record length, header included.
              */
1057         hdr_rdt.next = rdata;
1058         hdr_rdt.data = (char *) rechdr;
1059         hdr_rdt.len = SizeOfXLogRecord;
1060         write_len += SizeOfXLogRecord;
1061
1062         /*----------
1063          *
1064          * We have now done all the preparatory work we can without holding a
1065          * lock or modifying shared state. From here on, inserting the new WAL
1066          * record to the shared WAL buffer cache is a two-step process:
1067          *
1068          * 1. Reserve the right amount of space from the WAL. The current head of
1069          *    reserved space is kept in Insert->CurrBytePos, and is protected by
1070          *    insertpos_lck.
1071          *
1072          * 2. Copy the record to the reserved WAL space. This involves finding the
1073          *    correct WAL buffer containing the reserved space, and copying the
1074          *    record in place. This can be done concurrently in multiple processes.
1075          *
1076          * To keep track of which insertions are still in-progress, each concurrent
1077          * inserter allocates an "insertion slot", which tells others how far the
1078          * inserter has progressed. There is a small fixed number of insertion
1079          * slots, determined by the num_xloginsert_slots GUC. When an inserter
1080          * finishes, it updates the xlogInsertingAt of its slot to the end of the
1081          * record it inserted, to let others know that it's done. xlogInsertingAt
1082          * is also updated when crossing over to a new WAL buffer, to allow the
1083          * previous buffer to be flushed.
1084          *
1085          * Holding onto a slot also protects RedoRecPtr and fullPageWrites from
1086          * changing until the insertion is finished.
1087          *
1088          * Step 2 can usually be done completely in parallel. If the required WAL
1089          * page is not initialized yet, you have to grab WALBufMappingLock to
1090          * initialize it, but the WAL writer tries to do that ahead of insertions
1091          * to avoid that from happening in the critical path.
1092          *
1093          *----------
1094          */
1095         START_CRIT_SECTION();
             /* the acquire is "exclusive" exactly when this is an xlog-switch record */
1096         WALInsertSlotAcquire(isLogSwitch);
1097
1098         /*
1099          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
1100          * back and recompute everything.  This can only happen just after a
1101          * checkpoint, so it's better to be slow in this case and fast otherwise.
1102          *
1103          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
1104          * affect the contents of the XLOG record, so we'll update our local copy
1105          * but not force a recomputation.
1106          */
1107         if (RedoRecPtr != Insert->RedoRecPtr)
1108         {
1109                 Assert(RedoRecPtr < Insert->RedoRecPtr);
1110                 RedoRecPtr = Insert->RedoRecPtr;
1111
1112                 if (doPageWrites)
1113                 {
1114                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1115                         {
1116                                 if (dtbuf[i] == InvalidBuffer)
1117                                         continue;
1118                                 if (dtbuf_bkp[i] == false &&
1119                                         dtbuf_lsn[i] <= RedoRecPtr)
1120                                 {
1121                                         /*
1122                                          * Oops, this buffer now needs to be backed up, but we
1123                                          * didn't think so above.  Start over.
                                              *
                                              * Before retrying: give up the slot, leave the
                                              * critical section, unlink the backup-block entries
                                              * appended after rdt_lastnormal, and clear the
                                              * XLR_BKP_BLOCK bits we added to info.
1124                                          */
1125                                         WALInsertSlotRelease();
1126                                         END_CRIT_SECTION();
1127                                         rdt_lastnormal->next = NULL;
1128                                         info = info_orig;
1129                                         goto begin;
1130                                 }
1131                         }
1132                 }
1133         }
1134
1135         /*
1136          * Also check to see if fullPageWrites or forcePageWrites was just turned
1137          * on; if we weren't already doing full-page writes then go back and
1138          * recompute. (If it was just turned off, we could recompute the record
1139          * without full pages, but we choose not to bother.)
1140          */
1141         if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
1142         {
1143                 /* Oops, must redo it with full-page data. */
                     /* same undo sequence as in the RedoRecPtr check above */
1144                 WALInsertSlotRelease();
1145                 END_CRIT_SECTION();
1146                 rdt_lastnormal->next = NULL;
1147                 info = info_orig;
1148                 goto begin;
1149         }
1150
1151         /*
1152          * Reserve space for the record in the WAL. This also sets the xl_prev
1153          * pointer.
              *
              * ReserveXLogSwitch() returns false, without reserving anything or
              * setting xl_prev, when the insert position is already at the start
              * of a segment; StartPos and EndPos are then both that position.
1154          */
1155         if (isLogSwitch)
1156                 inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
1157         else
1158         {
1159                 ReserveXLogInsertLocation(write_len, &StartPos, &EndPos,
1160                                                                   &rechdr->xl_prev);
1161                 inserted = true;
1162         }
1163
1164         if (inserted)
1165         {
1166                 /*
1167                  * Now that xl_prev has been filled in, finish CRC calculation of the
1168                  * record header.
1169                  */
1170                 COMP_CRC32(rdata_crc, ((char *) &rechdr->xl_prev), sizeof(XLogRecPtr));
1171                 FIN_CRC32(rdata_crc);
1172                 rechdr->xl_crc = rdata_crc;
1173
1174                 /*
1175                  * All the record data, including the header, is now ready to be
1176                  * inserted. Copy the record in the space reserved.
1177                  */
1178                 CopyXLogRecordToWAL(write_len, isLogSwitch, &hdr_rdt, StartPos, EndPos);
1179         }
1180         else
1181         {
1182                 /*
1183                  * This was an xlog-switch record, but the current insert location was
1184                  * already exactly at the beginning of a segment, so there was no need
1185                  * to do anything.
1186                  */
1187         }
1188
1189         /*
1190          * Done! Let others know that we're finished.
1191          */
1192         WALInsertSlotRelease();
1193
1194         END_CRIT_SECTION();
1195
1196         /*
1197          * Update shared LogwrtRqst.Write, if we crossed page boundary.
              *
              * (When nothing was inserted, StartPos == EndPos and this is skipped.)
1198          */
1199         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1200         {
1201                 /* use volatile pointer to prevent code rearrangement */
1202                 volatile XLogCtlData *xlogctl = XLogCtl;
1203
1204                 SpinLockAcquire(&xlogctl->info_lck);
1205                 /* advance global request to include new block(s) */
1206                 if (xlogctl->LogwrtRqst.Write < EndPos)
1207                         xlogctl->LogwrtRqst.Write = EndPos;
1208                 /* update local result copy while I have the chance */
1209                 LogwrtResult = xlogctl->LogwrtResult;
1210                 SpinLockRelease(&xlogctl->info_lck);
1211         }
1212
1213         /*
1214          * If this was an XLOG_SWITCH record, flush the record and the empty
1215          * padding space that fills the rest of the segment, and perform
1216          * end-of-segment actions (eg, notifying archiver).
              *
              * Note that XLogFlush(EndPos) is called even when no record was
              * inserted; only the EndPos adjustment is conditional on "inserted".
1217          */
1218         if (isLogSwitch)
1219         {
1220                 TRACE_POSTGRESQL_XLOG_SWITCH();
1221                 XLogFlush(EndPos);
1222                 /*
1223                  * Even though we reserved the rest of the segment for us, which is
1224                  * reflected in EndPos, we return a pointer to just the end of the
1225                  * xlog-switch record.
                      *
                      * If the header-sized record spills onto the next page, the
                      * page header that precedes the continuation is added too: the
                      * long form when that page is the first of a segment, the short
                      * form otherwise.
1226                  */
1227                 if (inserted)
1228                 {
1229                         EndPos = StartPos + SizeOfXLogRecord;
1230                         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1231                         {
1232                                 if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
1233                                         EndPos += SizeOfXLogLongPHD;
1234                                 else
1235                                         EndPos += SizeOfXLogShortPHD;
1236                         }
1237                 }
1238         }
1239
         /*
          * NOTE(review): this debug output runs even when "inserted" is false, in
          * which case rechdr describes a record that was never written -- confirm
          * that this is intended.  Only the first chain entry's data is passed to
          * rm_desc.
          */
1240 #ifdef WAL_DEBUG
1241         if (XLOG_DEBUG)
1242         {
1243                 StringInfoData buf;
1244
1245                 initStringInfo(&buf);
1246                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1247                                                  (uint32) (EndPos >> 32), (uint32) EndPos);
1248                 xlog_outrec(&buf, rechdr);
1249                 if (rdata->data != NULL)
1250                 {
1251                         appendStringInfoString(&buf, " - ");
1252                         RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
1253                 }
1254                 elog(LOG, "%s", buf.data);
1255                 pfree(buf.data);
1256         }
1257 #endif
1258
1259         /*
1260          * Update our global variables
1261          */
1262         ProcLastRecPtr = StartPos;
1263         XactLastRecEnd = EndPos;
1264
1265         return EndPos;
1266 }
1267
1268 /*
1269  * Reserves the right amount of space for a record of given size from the WAL.
1270  * *StartPos is set to the beginning of the reserved section, *EndPos to
1271  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1272  * used to set the xl_prev of this record.
1273  *
1274  * This is the performance critical part of XLogInsert that must be serialized
1275  * across backends. The rest can happen mostly in parallel. Try to keep this
1276  * section as short as possible, insertpos_lck can be heavily contended on a
1277  * busy system.
1278  *
1279  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1280  * where we actually copy the record to the reserved space.
      *
      * Parameters:
      *   size     - record length in bytes (XLogInsert passes its write_len,
      *              which includes the record header); rounded up with
      *              MAXALIGN before use.
      *   StartPos - output: start of the reserved space.
      *   EndPos   - output: end+1 of the reserved space.
      *   PrevPtr  - output: start of the previously reserved record.
1281  */
1282 static void
1283 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1284                                                   XLogRecPtr *PrevPtr)
1285 {
1286         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1287         uint64          startbytepos;
1288         uint64          endbytepos;
1289         uint64          prevbytepos;
1290
1291         size = MAXALIGN(size);
1292
1293         /* All (non xlog-switch) records should contain data. */
1294         Assert(size > SizeOfXLogRecord);
1295
1296         /*
1297          * The duration the spinlock needs to be held is minimized by minimizing
1298          * the calculations that have to be done while holding the lock. The
1299          * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1300          * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1301          * page headers. The mapping between "usable" byte positions and physical
1302          * positions (XLogRecPtrs) can be done outside the locked region, and
1303          * because the usable byte position doesn't include any headers, reserving
1304          * X bytes from WAL is almost as simple as "CurrBytePos += X".
1305          */
1306         SpinLockAcquire(&Insert->insertpos_lck);
1307
             /* Advance the tip by size and remember where this record starts */
1308         startbytepos = Insert->CurrBytePos;
1309         endbytepos = startbytepos + size;
1310         prevbytepos = Insert->PrevBytePos;
1311         Insert->CurrBytePos = endbytepos;
1312         Insert->PrevBytePos = startbytepos;
1313
1314         SpinLockRelease(&Insert->insertpos_lck);
1315
             /*
              * Convert to XLogRecPtrs outside the lock.  The end position goes
              * through XLogBytePosToEndRecPtr() while the start and prev positions
              * use XLogBytePosToRecPtr().  NOTE(review): the difference between
              * the two converters is not visible here (presumably how a position
              * exactly on a page boundary is mapped) -- confirm in their
              * definitions.
              */
1316         *StartPos = XLogBytePosToRecPtr(startbytepos);
1317         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1318         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1319
1320         /*
1321          * Check that the conversions between "usable byte positions" and
1322          * XLogRecPtrs work consistently in both directions.
1323          */
1324         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1325         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1326         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1327 }
1328
1329 /*
1330  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1331  *
1332  * A log-switch record is handled slightly differently. The rest of the
1333  * segment will be reserved for this insertion, as indicated by the returned
1334  * *EndPos value. However, if we are already at the beginning of the current
1335  * segment, *StartPos and *EndPos are set to the current location without
1336  * reserving any space, and the function returns false.
      *
      * Returns true if space was reserved (the caller must then insert the
      * record), false otherwise.  *PrevPtr is only set when true is returned.
1337  */
1338 static bool
1339 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1340 {
1341         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1342         uint64          startbytepos;
1343         uint64          endbytepos;
1344         uint64          prevbytepos;
             /* an xlog-switch record consists of the record header only */
1345         uint32          size = SizeOfXLogRecord;
1346         XLogRecPtr      ptr;
1347         uint32          segleft;
1348
1349         /*
1350          * These calculations are a bit heavy-weight to be done while holding a
1351          * spinlock, but since we're holding all the WAL insertion slots, there
1352          * are no other inserters competing for it. GetXLogInsertRecPtr() does
1353          * compete for it, but that's not called very frequently.
1354          */
1355         SpinLockAcquire(&Insert->insertpos_lck);
1356
1357         startbytepos = Insert->CurrBytePos;
1358
             /* Already at a segment boundary?  Then there is nothing to switch. */
1359         ptr = XLogBytePosToEndRecPtr(startbytepos);
1360         if (ptr % XLOG_SEG_SIZE == 0)
1361         {
                     /* release the lock on this early-exit path too */
1362                 SpinLockRelease(&Insert->insertpos_lck);
1363                 *EndPos = *StartPos = ptr;
1364                 return false;
1365         }
1366
1367         endbytepos = startbytepos + size;
1368         prevbytepos = Insert->PrevBytePos;
1369
1370         *StartPos = XLogBytePosToRecPtr(startbytepos);
1371         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1372
             /*
              * Bytes from the end of the record to the end of its segment.  If the
              * record already ends exactly on a segment boundary the expression
              * yields XLOG_SEG_SIZE, and nothing more needs to be consumed.
              */
1373         segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
1374         if (segleft != XLOG_SEG_SIZE)
1375         {
1376                 /* consume the rest of the segment */
1377                 *EndPos += segleft;
1378                 endbytepos = XLogRecPtrToBytePos(*EndPos);
1379         }
1380         Insert->CurrBytePos = endbytepos;
1381         Insert->PrevBytePos = startbytepos;
1382
1383         SpinLockRelease(&Insert->insertpos_lck);
1384
             /* prev-link conversion does not need the lock */
1385         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1386
1387         Assert((*EndPos) % XLOG_SEG_SIZE == 0);
1388         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1389         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1390         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1391
1392         return true;
1393 }
1394
1395 /*
1396  * Subroutine of XLogInsert.  Copies a WAL record to an already-reserved
1397  * area in the WAL.
1398  */
1399 static void
1400 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1401                                         XLogRecPtr StartPos, XLogRecPtr EndPos)
1402 {
1403         char       *currpos;
1404         int                     freespace;
1405         int                     written;
1406         XLogRecPtr      CurrPos;
1407         XLogPageHeader pagehdr;
1408
1409         /* The first chunk is the record header */
1410         Assert(rdata->len == SizeOfXLogRecord);
1411
1412         /*
1413          * Get a pointer to the right place in the right WAL buffer to start
1414          * inserting to.
1415          */
1416         CurrPos = StartPos;
1417         currpos = GetXLogBuffer(CurrPos);
1418         freespace = INSERT_FREESPACE(CurrPos);
1419
1420         /*
1421          * there should be enough space for at least the first field (xl_tot_len)
1422          * on this page.
1423          */
1424         Assert(freespace >= sizeof(uint32));
1425
1426         /* Copy record data */
1427         written = 0;
1428         while (rdata != NULL)
1429         {
1430                 char       *rdata_data = rdata->data;
1431                 int                     rdata_len = rdata->len;
1432
1433                 while (rdata_len > freespace)
1434                 {
1435                         /*
1436                          * Write what fits on this page, and continue on the next page.
1437                          */
1438                         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1439                         memcpy(currpos, rdata_data, freespace);
1440                         rdata_data += freespace;
1441                         rdata_len -= freespace;
1442                         written += freespace;
1443                         CurrPos += freespace;
1444
1445                         /*
1446                          * Get pointer to beginning of next page, and set the xlp_rem_len
1447                          * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1448                          *
1449                          * It's safe to set the contrecord flag and xlp_rem_len without a
1450                          * lock on the page. All the other flags were already set when the
1451                          * page was initialized, in AdvanceXLInsertBuffer, and we're the
1452                          * only backend that needs to set the contrecord flag.
1453                          */
1454                         currpos = GetXLogBuffer(CurrPos);
1455                         pagehdr = (XLogPageHeader) currpos;
1456                         pagehdr->xlp_rem_len = write_len - written;
1457                         pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1458
1459                         /* skip over the page header */
1460                         if (CurrPos % XLogSegSize == 0)
1461                         {
1462                                 CurrPos += SizeOfXLogLongPHD;
1463                                 currpos += SizeOfXLogLongPHD;
1464                         }
1465                         else
1466                         {
1467                                 CurrPos += SizeOfXLogShortPHD;
1468                                 currpos += SizeOfXLogShortPHD;
1469                         }
1470                         freespace = INSERT_FREESPACE(CurrPos);
1471                 }
1472
1473                 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1474                 memcpy(currpos, rdata_data, rdata_len);
1475                 currpos += rdata_len;
1476                 CurrPos += rdata_len;
1477                 freespace -= rdata_len;
1478                 written += rdata_len;
1479
1480                 rdata = rdata->next;
1481         }
1482         Assert(written == write_len);
1483
1484         /* Align the end position, so that the next record starts aligned */
1485         CurrPos = MAXALIGN64(CurrPos);
1486
1487         /*
1488          * If this was an xlog-switch, it's not enough to write the switch record,
1489          * we also have to consume all the remaining space in the WAL segment.
1490          * We have already reserved it for us, but we still need to make sure it's
1491          * allocated and zeroed in the WAL buffers so that when the caller (or
1492          * someone else) does XLogWrite(), it can really write out all the zeros.
1493          */
1494         if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
1495         {
1496                 /* An xlog-switch record doesn't contain any data besides the header */
1497                 Assert(write_len == SizeOfXLogRecord);
1498
1499                 /*
1500                  * We do this one page at a time, to make sure we don't deadlock
1501                  * against ourselves if wal_buffers < XLOG_SEG_SIZE.
1502                  */
1503                 Assert(EndPos % XLogSegSize == 0);
1504
1505                 /* Use up all the remaining space on the first page */
1506                 CurrPos += freespace;
1507
1508                 while (CurrPos < EndPos)
1509                 {
1510                         /* initialize the next page (if not initialized already) */
1511                         WakeupWaiters(CurrPos);
1512                         AdvanceXLInsertBuffer(CurrPos, false);
1513                         CurrPos += XLOG_BLCKSZ;
1514                 }
1515         }
1516
1517         if (CurrPos != EndPos)
1518                 elog(PANIC, "space reserved for WAL record does not match what was written");
1519 }
1520
1521 /*
1522  * Allocate a slot for insertion.
1523  *
1524  * In exclusive mode, all slots are reserved for the current process. That
1525  * blocks all concurrent insertions.
1526  */
1527 static void
1528 WALInsertSlotAcquire(bool exclusive)
1529 {
1530         int                     i;
1531
1532         if (exclusive)
1533         {
1534                 for (i = 0; i < num_xloginsert_slots; i++)
1535                         WALInsertSlotAcquireOne(i);
1536                 holdingAllSlots = true;
1537         }
1538         else
1539                 WALInsertSlotAcquireOne(-1);
1540 }
1541
1542 /*
1543  * Workhorse of WALInsertSlotAcquire. Acquires the given slot, or an arbitrary
1544  * one if slotno == -1. The index of the slot that was acquired is stored in
1545  * MySlotNo.
1546  *
1547  * This is more or less equivalent to LWLockAcquire().
1548  */
1549 static void
1550 WALInsertSlotAcquireOne(int slotno)
1551 {
1552         volatile XLogInsertSlot *slot;
1553         PGPROC     *proc = MyProc;
1554         bool            retry = false;
1555         int                     extraWaits = 0;
1556         static int      slotToTry = -1;
1557
1558         /*
1559          * Try to use the slot we used last time. If the system isn't particularly
1560          * busy, it's a good bet that it's available, and it's good to have some
1561          * affinity to a particular slot so that you don't unnecessarily bounce
1562          * cache lines between processes when there is no contention.
1563          *
1564          * If this is the first time through in this backend, pick a slot
1565          * (semi-)randomly. This allows the slots to be used evenly if you have a
1566          * lot of very short connections.
1567          */
1568         if (slotno != -1)
1569                 MySlotNo = slotno;
1570         else
1571         {
1572                 if (slotToTry == -1)
1573                         slotToTry = MyProc->pgprocno % num_xloginsert_slots;
1574                 MySlotNo = slotToTry;
1575         }
1576
1577         /*
1578          * We can't wait if we haven't got a PGPROC.  This should only occur
1579          * during bootstrap or shared memory initialization.  Put an Assert here
1580          * to catch unsafe coding practices.
1581          */
1582         Assert(MyProc != NULL);
1583
1584         /*
1585          * Lock out cancel/die interrupts until we exit the code section protected
1586          * by the slot.  This ensures that interrupts will not interfere with
1587          * manipulations of data structures in shared memory. There is no cleanup
1588          * mechanism to release the slot if the backend dies while holding one,
1589          * so make this a critical section.
1590          */
1591         START_CRIT_SECTION();
1592
1593         /*
1594          * Loop here to try to acquire slot after each time we are signaled by
1595          * WALInsertSlotRelease.
1596          */
1597         for (;;)
1598         {
1599                 bool            mustwait;
1600
1601                 slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
1602
1603                 /* Acquire mutex.  Time spent holding mutex should be short! */
1604                 SpinLockAcquire(&slot->mutex);
1605
1606                 /* If retrying, allow WALInsertSlotRelease to release waiters again */
1607                 if (retry)
1608                         slot->releaseOK = true;
1609
1610                 /* If I can get the slot, do so quickly. */
1611                 if (slot->exclusive == 0)
1612                 {
1613                         slot->exclusive++;
1614                         mustwait = false;
1615                 }
1616                 else
1617                         mustwait = true;
1618
1619                 if (!mustwait)
1620                         break;                          /* got the lock */
1621
1622                 Assert(slot->owner != MyProc);
1623
1624                 /*
1625                  * Add myself to wait queue.
1626                  */
1627                 proc->lwWaiting = true;
1628                 proc->lwWaitMode = LW_EXCLUSIVE;
1629                 proc->lwWaitLink = NULL;
1630                 if (slot->head == NULL)
1631                         slot->head = proc;
1632                 else
1633                         slot->tail->lwWaitLink = proc;
1634                 slot->tail = proc;
1635
1636                 /* Can release the mutex now */
1637                 SpinLockRelease(&slot->mutex);
1638
1639                 /*
1640                  * Wait until awakened.
1641                  *
1642                  * Since we share the process wait semaphore with the regular lock
1643                  * manager and ProcWaitForSignal, and we may need to acquire a slot
1644                  * while one of those is pending, it is possible that we get awakened
1645                  * for a reason other than being signaled by WALInsertSlotRelease. If
1646                  * so, loop back and wait again.  Once we've gotten the slot,
1647                  * re-increment the sema by the number of additional signals received,
1648                  * so that the lock manager or signal manager will see the received
1649                  * signal when it next waits.
1650                  */
1651                 for (;;)
1652                 {
1653                         /* "false" means cannot accept cancel/die interrupt here. */
1654                         PGSemaphoreLock(&proc->sem, false);
1655                         if (!proc->lwWaiting)
1656                                 break;
1657                         extraWaits++;
1658                 }
1659
1660                 /* Now loop back and try to acquire lock again. */
1661                 retry = true;
1662         }
1663
1664         slot->owner = proc;
1665
1666         /*
1667          * Normally, we initialize the xlogInsertingAt value of the slot to 1,
1668          * because we don't yet know where in the WAL we're going to insert. It's
1669          * not critical what it points to right now - leaving it to a too small
1670          * value just means that WaitXlogInsertionsToFinish() might wait on us
1671          * unnecessarily, until we update the value (when we finish the insert or
1672          * move to next page).
1673          *
1674          * If we're grabbing all the slots, however, stamp all but the last one
1675          * with InvalidXLogRecPtr, meaning there is no insert in progress. The last
1676          * slot is the one that we will update as we proceed with the insert, the
1677          * rest are held just to keep off other inserters.
1678          */
1679         if (slotno != -1 && slotno != num_xloginsert_slots - 1)
1680                 slot->xlogInsertingAt = InvalidXLogRecPtr;
1681         else
1682                 slot->xlogInsertingAt = 1;
1683
1684         /* We are done updating shared state of the slot itself. */
1685         SpinLockRelease(&slot->mutex);
1686
1687         /*
1688          * Fix the process wait semaphore's count for any absorbed wakeups.
1689          */
1690         while (extraWaits-- > 0)
1691                 PGSemaphoreUnlock(&proc->sem);
1692
1693         /*
1694          * If we couldn't get the slot immediately, try another slot next time.
1695          * On a system with more insertion slots than concurrent inserters, this
1696          * causes all the inserters to eventually migrate to a slot that no-one
1697          * else is using. On a system with more inserters than slots, it still
1698          * causes the inserters to be distributed quite evenly across the slots.
1699          */
1700         if (slotno != -1 && retry)
1701                 slotToTry = (slotToTry + 1) % num_xloginsert_slots;
1702 }
1703
1704 /*
1705  * Wait for the given slot to become free, or for its xlogInsertingAt location
1706  * to change to something else than 'waitptr'. In other words, wait for the
1707  * inserter using the given slot to finish its insertion, or to at least make
1708  * some progress.
1709  */
1710 static void
1711 WaitOnSlot(volatile XLogInsertSlot *slot, XLogRecPtr waitptr)
1712 {
1713         PGPROC     *proc = MyProc;
1714         int                     extraWaits = 0;
1715
1716         /*
1717          * Lock out cancel/die interrupts while we sleep on the slot. There is
1718          * no cleanup mechanism to remove us from the wait queue if we got
1719          * interrupted.
1720          */
1721         HOLD_INTERRUPTS();
1722
1723         /*
1724          * Loop here to try to acquire lock after each time we are signaled.
1725          */
1726         for (;;)
1727         {
1728                 bool            mustwait;
1729
1730                 /* Acquire mutex.  Time spent holding mutex should be short! */
1731                 SpinLockAcquire(&slot->mutex);
1732
1733                 /* If I can get the lock, do so quickly. */
1734                 if (slot->exclusive == 0 || slot->xlogInsertingAt != waitptr)
1735                         mustwait = false;
1736                 else
1737                         mustwait = true;
1738
1739                 if (!mustwait)
1740                         break;                          /* the lock was free */
1741
1742                 Assert(slot->owner != MyProc);
1743
1744                 /*
1745                  * Add myself to wait queue.
1746                  */
1747                 proc->lwWaiting = true;
1748                 proc->lwWaitMode = LW_WAIT_UNTIL_FREE;
1749                 proc->lwWaitLink = NULL;
1750
1751                 /* waiters are added to the front of the queue */
1752                 proc->lwWaitLink = slot->head;
1753                 if (slot->head == NULL)
1754                         slot->tail = proc;
1755                 slot->head = proc;
1756
1757                 /* Can release the mutex now */
1758                 SpinLockRelease(&slot->mutex);
1759
1760                 /*
1761                  * Wait until awakened.
1762                  *
1763                  * Since we share the process wait semaphore with other things, like
1764                  * the regular lock manager and ProcWaitForSignal, and we may need to
1765                  * acquire an LWLock while one of those is pending, it is possible that
1766                  * we get awakened for a reason other than being signaled by
1767                  * LWLockRelease. If so, loop back and wait again.  Once we've gotten
1768                  * the LWLock, re-increment the sema by the number of additional
1769                  * signals received, so that the lock manager or signal manager will
1770                  * see the received signal when it next waits.
1771                  */
1772                 for (;;)
1773                 {
1774                         /* "false" means cannot accept cancel/die interrupt here. */
1775                         PGSemaphoreLock(&proc->sem, false);
1776                         if (!proc->lwWaiting)
1777                                 break;
1778                         extraWaits++;
1779                 }
1780
1781                 /* Now loop back and try to acquire lock again. */
1782         }
1783
1784         /* We are done updating shared state of the lock itself. */
1785         SpinLockRelease(&slot->mutex);
1786
1787         /*
1788          * Fix the process wait semaphore's count for any absorbed wakeups.
1789          */
1790         while (extraWaits-- > 0)
1791                 PGSemaphoreUnlock(&proc->sem);
1792
1793         /*
1794          * Now okay to allow cancel/die interrupts.
1795          */
1796         RESUME_INTERRUPTS();
1797 }
1798
1799 /*
1800  * Wake up all processes waiting for us with WaitOnSlot(). Sets our
1801  * xlogInsertingAt value to EndPos, without releasing the slot.
1802  */
1803 static void
1804 WakeupWaiters(XLogRecPtr EndPos)
1805 {
1806         volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
1807         PGPROC     *head;
1808         PGPROC     *proc;
1809         PGPROC     *next;
1810
1811         /*
1812          * If we have already reported progress up to the same point, do nothing.
1813          * No other process can modify xlogInsertingAt, so we can check this before
1814          * grabbing the spinlock.
1815          */
1816         if (slot->xlogInsertingAt == EndPos)
1817                 return;
1818         /* xlogInsertingAt should not go backwards */
1819         Assert(slot->xlogInsertingAt < EndPos);
1820
1821         /* Acquire mutex.  Time spent holding mutex should be short! */
1822         SpinLockAcquire(&slot->mutex);
1823
1824         /* we should own the slot */
1825         Assert(slot->exclusive == 1 && slot->owner == MyProc);
1826
1827         slot->xlogInsertingAt = EndPos;
1828
1829         /*
1830          * See if there are any waiters that need to be woken up.
1831          */
1832         head = slot->head;
1833
1834         if (head != NULL)
1835         {
1836                 proc = head;
1837
1838                 /* LW_WAIT_UNTIL_FREE waiters are always in the front of the queue */
1839                 next = proc->lwWaitLink;
1840                 while (next && next->lwWaitMode == LW_WAIT_UNTIL_FREE)
1841                 {
1842                         proc = next;
1843                         next = next->lwWaitLink;
1844                 }
1845
1846                 /* proc is now the last PGPROC to be released */
1847                 slot->head = next;
1848                 proc->lwWaitLink = NULL;
1849         }
1850
1851         /* We are done updating shared state of the lock itself. */
1852         SpinLockRelease(&slot->mutex);
1853
1854         /*
1855          * Awaken any waiters I removed from the queue.
1856          */
1857         while (head != NULL)
1858         {
1859                 proc = head;
1860                 head = proc->lwWaitLink;
1861                 proc->lwWaitLink = NULL;
1862                 proc->lwWaiting = false;
1863                 PGSemaphoreUnlock(&proc->sem);
1864         }
1865 }
1866
1867 /*
1868  * Release our insertion slot (or slots, if we're holding them all).
1869  */
1870 static void
1871 WALInsertSlotRelease(void)
1872 {
1873         int                     i;
1874
1875         if (holdingAllSlots)
1876         {
1877                 for (i = 0; i < num_xloginsert_slots; i++)
1878                         WALInsertSlotReleaseOne(i);
1879                 holdingAllSlots = false;
1880         }
1881         else
1882                 WALInsertSlotReleaseOne(MySlotNo);
1883 }
1884
1885 static void
1886 WALInsertSlotReleaseOne(int slotno)
1887 {
1888         volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[slotno].slot;
1889         PGPROC     *head;
1890         PGPROC     *proc;
1891
1892         /* Acquire mutex.  Time spent holding mutex should be short! */
1893         SpinLockAcquire(&slot->mutex);
1894
1895         /* we must be holding it */
1896         Assert(slot->exclusive == 1 && slot->owner == MyProc);
1897
1898         slot->xlogInsertingAt = InvalidXLogRecPtr;
1899
1900         /* Release my hold on the slot */
1901         slot->exclusive = 0;
1902         slot->owner = NULL;
1903
1904         /*
1905          * See if I need to awaken any waiters..
1906          */
1907         head = slot->head;
1908         if (head != NULL)
1909         {
1910                 if (slot->releaseOK)
1911                 {
1912                         /*
1913                          * Remove the to-be-awakened PGPROCs from the queue.
1914                          */
1915                         bool            releaseOK = true;
1916
1917                         proc = head;
1918
1919                         /*
1920                          * First wake up any backends that want to be woken up without
1921                          * acquiring the lock. These are always in the front of the queue.
1922                          */
1923                         while (proc->lwWaitMode == LW_WAIT_UNTIL_FREE && proc->lwWaitLink)
1924                                 proc = proc->lwWaitLink;
1925
1926                         /*
1927                          * Awaken the first exclusive-waiter, if any.
1928                          */
1929                         if (proc->lwWaitLink)
1930                         {
1931                                 Assert(proc->lwWaitLink->lwWaitMode == LW_EXCLUSIVE);
1932                                 proc = proc->lwWaitLink;
1933                                 releaseOK = false;
1934                         }
1935                         /* proc is now the last PGPROC to be released */
1936                         slot->head = proc->lwWaitLink;
1937                         proc->lwWaitLink = NULL;
1938
1939                         slot->releaseOK = releaseOK;
1940                 }
1941                 else
1942                         head = NULL;
1943         }
1944
1945         /* We are done updating shared state of the slot itself. */
1946         SpinLockRelease(&slot->mutex);
1947
1948         /*
1949          * Awaken any waiters I removed from the queue.
1950          */
1951         while (head != NULL)
1952         {
1953                 proc = head;
1954                 head = proc->lwWaitLink;
1955                 proc->lwWaitLink = NULL;
1956                 proc->lwWaiting = false;
1957                 PGSemaphoreUnlock(&proc->sem);
1958         }
1959
1960         /*
1961          * Now okay to allow cancel/die interrupts.
1962          */
1963         END_CRIT_SECTION();
1964 }
1965
1966
1967 /*
1968  * Wait for any WAL insertions < upto to finish.
1969  *
1970  * Returns the location of the oldest insertion that is still in-progress.
1971  * Any WAL prior to that point has been fully copied into WAL buffers, and
1972  * can be flushed out to disk. Because this waits for any insertions older
1973  * than 'upto' to finish, the return value is always >= 'upto'.
1974  *
1975  * Note: When you are about to write out WAL, you must call this function
1976  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1977  * need to wait for an insertion to finish (or at least advance to next
1978  * uninitialized page), and the inserter might need to evict an old WAL buffer
1979  * to make room for a new one, which in turn requires WALWriteLock.
1980  */
1981 static XLogRecPtr
1982 WaitXLogInsertionsToFinish(XLogRecPtr upto)
1983 {
1984         uint64          bytepos;
1985         XLogRecPtr      reservedUpto;
1986         XLogRecPtr      finishedUpto;
1987         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1988         int                     i;
1989
1990         if (MyProc == NULL)
1991                 elog(PANIC, "cannot wait without a PGPROC structure");
1992
1993         /* Read the current insert position */
1994         SpinLockAcquire(&Insert->insertpos_lck);
1995         bytepos = Insert->CurrBytePos;
1996         SpinLockRelease(&Insert->insertpos_lck);
1997         reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1998
1999         /*
2000          * No-one should request to flush a piece of WAL that hasn't even been
2001          * reserved yet. However, it can happen if there is a block with a bogus
2002          * LSN on disk, for example. XLogFlush checks for that situation and
2003          * complains, but only after the flush. Here we just assume that to mean
2004          * that all WAL that has been reserved needs to be finished. In this
2005          * corner-case, the return value can be smaller than 'upto' argument.
2006          */
2007         if (upto > reservedUpto)
2008         {
2009                 elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
2010                          (uint32) (upto >> 32), (uint32) upto,
2011                          (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
2012                 upto = reservedUpto;
2013         }
2014
2015         /*
2016          * finishedUpto is our return value, indicating the point upto which
2017          * all the WAL insertions have been finished. Initialize it to the head
2018          * of reserved WAL, and as we iterate through the insertion slots, back it
2019          * out for any insertion that's still in progress.
2020          */
2021         finishedUpto = reservedUpto;
2022
2023         /*
2024          * Loop through all the slots, sleeping on any in-progress insert older
2025          * than 'upto'.
2026          */
2027         for (i = 0; i < num_xloginsert_slots; i++)
2028         {
2029                 volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
2030                 XLogRecPtr insertingat;
2031
2032         retry:
2033                 /*
2034                  * We can check if the slot is in use without grabbing the spinlock.
2035                  * The spinlock acquisition of insertpos_lck before this loop acts
2036                  * as a memory barrier. If someone acquires the slot after that, it
2037                  * can't possibly be inserting to anything < reservedUpto. If it was
2038                  * acquired before that, an unlocked test will return true.
2039                  */
2040                 if (!slot->exclusive)
2041                         continue;
2042
2043                 SpinLockAcquire(&slot->mutex);
2044                 /* re-check now that we have the lock */
2045                 if (!slot->exclusive)
2046                 {
2047                         SpinLockRelease(&slot->mutex);
2048                         continue;
2049                 }
2050                 insertingat = slot->xlogInsertingAt;
2051                 SpinLockRelease(&slot->mutex);
2052
2053                 if (insertingat == InvalidXLogRecPtr)
2054                 {
2055                         /*
2056                          * slot is reserved just to hold off other inserters, there is no
2057                          * actual insert in progress.
2058                          */
2059                         continue;
2060                 }
2061
2062                 /*
2063                  * This insertion is still in progress. Do we need to wait for it?
2064                  *
2065                  * When an inserter acquires a slot, it doesn't reset 'insertingat', so
2066                  * it will initially point to the old value of some already-finished
2067                  * insertion. The inserter will update the value as soon as it finishes
2068                  * the insertion, moves to the next page, or has to do I/O to flush an
2069                  * old dirty buffer. That means that when we see a slot with
2070                  * insertingat value < upto, we don't know if that insertion is still
2071                  * truly in progress, or if the slot is reused by a new inserter that
2072                  * hasn't updated the insertingat value yet. We have to assume it's the
2073                  * latter, and wait.
2074                  */
2075                 if (insertingat < upto)
2076                 {
2077                         WaitOnSlot(slot, insertingat);
2078                         goto retry;
2079                 }
2080                 else
2081                 {
2082                         /*
2083                          * We don't need to wait for this insertion, but update the
2084                          * return value.
2085                          */
2086                         if (insertingat < finishedUpto)
2087                                 finishedUpto = insertingat;
2088                 }
2089         }
2090         return finishedUpto;
2091 }
2092
2093 /*
2094  * Get a pointer to the right location in the WAL buffer containing the
2095  * given XLogRecPtr.
2096  *
2097  * If the page is not initialized yet, it is initialized. That might require
2098  * evicting an old dirty buffer from the buffer cache, which means I/O.
2099  *
2100  * The caller must ensure that the page containing the requested location
2101  * isn't evicted yet, and won't be evicted. The way to ensure that is to
2102  * hold onto an XLogInsertSlot with the xlogInsertingAt position set to
2103  * something <= ptr. GetXLogBuffer() will update xlogInsertingAt if it needs
2104  * to evict an old page from the buffer. (This means that once you call
2105  * GetXLogBuffer() with a given 'ptr', you must not access anything before
2106  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
2107  * later, because older buffers might be recycled already)
2108  */
2109 static char *
2110 GetXLogBuffer(XLogRecPtr ptr)
2111 {
2112         int                     idx;
2113         XLogRecPtr      endptr;
2114         static uint64 cachedPage = 0;
2115         static char *cachedPos = NULL;
2116         XLogRecPtr      expectedEndPtr;
2117
2118         /*
2119          * Fast path for the common case that we need to access again the same
2120          * page as last time.
2121          */
2122         if (ptr / XLOG_BLCKSZ == cachedPage)
2123         {
2124                 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
2125                 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
2126                 return cachedPos + ptr % XLOG_BLCKSZ;
2127         }
2128
2129         /*
2130          * The XLog buffer cache is organized so that a page is always loaded
2131          * to a particular buffer.  That way we can easily calculate the buffer
2132          * a given page must be loaded into, from the XLogRecPtr alone.
2133          */
2134         idx = XLogRecPtrToBufIdx(ptr);
2135
2136         /*
2137          * See what page is loaded in the buffer at the moment. It could be the
2138          * page we're looking for, or something older. It can't be anything newer
2139          * - that would imply the page we're looking for has already been written
2140          * out to disk and evicted, and the caller is responsible for making sure
2141          * that doesn't happen.
2142          *
2143          * However, we don't hold a lock while we read the value. If someone has
2144          * just initialized the page, it's possible that we get a "torn read" of
2145          * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
2146          * that case we will see a bogus value. That's ok, we'll grab the mapping
2147          * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
2148          * the page we're looking for. But it means that when we do this unlocked
2149          * read, we might see a value that appears to be ahead of the page we're
2150          * looking for. Don't PANIC on that, until we've verified the value while
2151          * holding the lock.
2152          */
2153         expectedEndPtr = ptr;
2154         expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
2155
2156         endptr = XLogCtl->xlblocks[idx];
2157         if (expectedEndPtr != endptr)
2158         {
2159                 /*
2160                  * Let others know that we're finished inserting the record up
2161                  * to the page boundary.
2162                  */
2163                 WakeupWaiters(expectedEndPtr - XLOG_BLCKSZ);
2164
2165                 AdvanceXLInsertBuffer(ptr, false);
2166                 endptr = XLogCtl->xlblocks[idx];
2167
2168                 if (expectedEndPtr != endptr)
2169                         elog(PANIC, "could not find WAL buffer for %X/%X",
2170                                  (uint32) (ptr >> 32) , (uint32) ptr);
2171         }
2172         else
2173         {
2174                 /*
2175                  * Make sure the initialization of the page is visible to us, and
2176                  * won't arrive later to overwrite the WAL data we write on the page.
2177                  */
2178                 pg_memory_barrier();
2179         }
2180
2181         /*
2182          * Found the buffer holding this page. Return a pointer to the right
2183          * offset within the page.
2184          */
2185         cachedPage = ptr / XLOG_BLCKSZ;
2186         cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
2187
2188         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
2189         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
2190
2191         return cachedPos + ptr % XLOG_BLCKSZ;
2192 }
2193
2194 /*
2195  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
2196  * is the position starting from the beginning of WAL, excluding all WAL
2197  * page headers.
2198  */
2199 static XLogRecPtr
2200 XLogBytePosToRecPtr(uint64 bytepos)
2201 {
2202         uint64          fullsegs;
2203         uint64          fullpages;
2204         uint64          bytesleft;
2205         uint32          seg_offset;
2206         XLogRecPtr      result;
2207
2208         fullsegs = bytepos / UsableBytesInSegment;
2209         bytesleft = bytepos % UsableBytesInSegment;
2210
2211         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2212         {
2213                 /* fits on first page of segment */
2214                 seg_offset = bytesleft + SizeOfXLogLongPHD;
2215         }
2216         else
2217         {
2218                 /* account for the first page on segment with long header */
2219                 seg_offset = XLOG_BLCKSZ;
2220                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2221
2222                 fullpages = bytesleft / UsableBytesInPage;
2223                 bytesleft = bytesleft % UsableBytesInPage;
2224
2225                 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2226         }
2227
2228         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
2229
2230         return result;
2231 }
2232
2233 /*
2234  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
2235  * returns a pointer to the beginning of the page (ie. before page header),
2236  * not to where the first xlog record on that page would go to. This is used
2237  * when converting a pointer to the end of a record.
2238  */
2239 static XLogRecPtr
2240 XLogBytePosToEndRecPtr(uint64 bytepos)
2241 {
2242         uint64          fullsegs;
2243         uint64          fullpages;
2244         uint64          bytesleft;
2245         uint32          seg_offset;
2246         XLogRecPtr      result;
2247
2248         fullsegs = bytepos / UsableBytesInSegment;
2249         bytesleft = bytepos % UsableBytesInSegment;
2250
2251         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2252         {
2253                 /* fits on first page of segment */
2254                 if (bytesleft == 0)
2255                         seg_offset = 0;
2256                 else
2257                         seg_offset = bytesleft + SizeOfXLogLongPHD;
2258         }
2259         else
2260         {
2261                 /* account for the first page on segment with long header */
2262                 seg_offset = XLOG_BLCKSZ;
2263                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2264
2265                 fullpages = bytesleft / UsableBytesInPage;
2266                 bytesleft = bytesleft % UsableBytesInPage;
2267
2268                 if (bytesleft == 0)
2269                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2270                 else
2271                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2272         }
2273
2274         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
2275
2276         return result;
2277 }
2278
2279 /*
2280  * Convert an XLogRecPtr to a "usable byte position".
2281  */
2282 static uint64
2283 XLogRecPtrToBytePos(XLogRecPtr ptr)
2284 {
2285         uint64          fullsegs;
2286         uint32          fullpages;
2287         uint32          offset;
2288         uint64          result;
2289
2290         XLByteToSeg(ptr, fullsegs);
2291
2292         fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
2293         offset = ptr % XLOG_BLCKSZ;
2294
2295         if (fullpages == 0)
2296         {
2297                 result = fullsegs * UsableBytesInSegment;
2298                 if (offset > 0)
2299                 {
2300                         Assert(offset >= SizeOfXLogLongPHD);
2301                         result += offset - SizeOfXLogLongPHD;
2302                 }
2303         }
2304         else
2305         {
2306                 result = fullsegs * UsableBytesInSegment +
2307                         (XLOG_BLCKSZ - SizeOfXLogLongPHD) +  /* account for first page */
2308                         (fullpages - 1) * UsableBytesInPage; /* full pages */
2309                 if (offset > 0)
2310                 {
2311                         Assert(offset >= SizeOfXLogShortPHD);
2312                         result += offset - SizeOfXLogShortPHD;
2313                 }
2314         }
2315
2316         return result;
2317 }
2318
2319 /*
2320  * Determine whether the buffer referenced by an XLogRecData item has to
2321  * be backed up, and if so fill a BkpBlock struct for it.  In any case
2322  * save the buffer's LSN at *lsn.
2323  */
2324 static bool
2325 XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
2326                                 XLogRecPtr *lsn, BkpBlock *bkpb)
2327 {
2328         Page            page;
2329
2330         page = BufferGetPage(rdata->buffer);
2331
2332         /*
2333          * We assume page LSN is first data on *every* page that can be passed to
2334          * XLogInsert, whether it has the standard page layout or not. We don't
2335          * need to take the buffer header lock for PageGetLSN if we hold an
2336          * exclusive lock on the page and/or the relation.
2337          */
2338         if (holdsExclusiveLock)
2339                 *lsn = PageGetLSN(page);
2340         else
2341                 *lsn = BufferGetLSNAtomic(rdata->buffer);
2342
2343         if (*lsn <= RedoRecPtr)
2344         {
2345                 /*
2346                  * The page needs to be backed up, so set up *bkpb
2347                  */
2348                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
2349
2350                 if (rdata->buffer_std)
2351                 {
2352                         /* Assume we can omit data between pd_lower and pd_upper */
2353                         uint16          lower = ((PageHeader) page)->pd_lower;
2354                         uint16          upper = ((PageHeader) page)->pd_upper;
2355
2356                         if (lower >= SizeOfPageHeaderData &&
2357                                 upper > lower &&
2358                                 upper <= BLCKSZ)
2359                         {
2360                                 bkpb->hole_offset = lower;
2361                                 bkpb->hole_length = upper - lower;
2362                         }
2363                         else
2364                         {
2365                                 /* No "hole" to compress out */
2366                                 bkpb->hole_offset = 0;
2367                                 bkpb->hole_length = 0;
2368                         }
2369                 }
2370                 else
2371                 {
2372                         /* Not a standard page header, don't try to eliminate "hole" */
2373                         bkpb->hole_offset = 0;
2374                         bkpb->hole_length = 0;
2375                 }
2376
2377                 return true;                    /* buffer requires backup */
2378         }
2379
2380         return false;                           /* buffer does not need to be backed up */
2381 }
2382
2383 /*
2384  * Initialize XLOG buffers, writing out old buffers if they still contain
2385  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2386  * true, initialize as many pages as we can without having to write out
2387  * unwritten data. Any new pages are initialized to zeros, with pages headers
2388  * initialized properly.
2389  */
2390 static void
2391 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2392 {
2393         XLogCtlInsert *Insert = &XLogCtl->Insert;
2394         int                     nextidx;
2395         XLogRecPtr      OldPageRqstPtr;
2396         XLogwrtRqst WriteRqst;
2397         XLogRecPtr      NewPageEndPtr = InvalidXLogRecPtr;
2398         XLogRecPtr      NewPageBeginPtr;
2399         XLogPageHeader NewPage;
2400         int                     npages = 0;
2401
2402         LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2403
2404         /*
2405          * Now that we have the lock, check if someone initialized the page
2406          * already.
2407          */
2408         while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2409         {
2410                 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2411
2412                 /*
2413                  * Get ending-offset of the buffer page we need to replace (this may
2414                  * be zero if the buffer hasn't been used yet).  Fall through if it's
2415                  * already written out.
2416                  */
2417                 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2418                 if (LogwrtResult.Write < OldPageRqstPtr)
2419                 {
2420                         /*
2421                          * Nope, got work to do. If we just want to pre-initialize as much
2422                          * as we can without flushing, give up now.
2423                          */
2424                         if (opportunistic)
2425                                 break;
2426
2427                         /* Before waiting, get info_lck and update LogwrtResult */
2428                         {
2429                                 /* use volatile pointer to prevent code rearrangement */
2430                                 volatile XLogCtlData *xlogctl = XLogCtl;
2431
2432                                 SpinLockAcquire(&xlogctl->info_lck);
2433                                 if (xlogctl->LogwrtRqst.Write < OldPageRqstPtr)
2434                                         xlogctl->LogwrtRqst.Write = OldPageRqstPtr;
2435                                 LogwrtResult = xlogctl->LogwrtResult;
2436                                 SpinLockRelease(&xlogctl->info_lck);
2437                         }
2438
2439                         /*
2440                          * Now that we have an up-to-date LogwrtResult value, see if we
2441                          * still need to write it or if someone else already did.
2442                          */
2443                         if (LogwrtResult.Write < OldPageRqstPtr)
2444                         {
2445                                 /*
2446                                  * Must acquire write lock. Release WALBufMappingLock first,
2447                                  * to make sure that all insertions that we need to wait for
2448                                  * can finish (up to this same position). Otherwise we risk
2449                                  * deadlock.
2450                                  */
2451                                 LWLockRelease(WALBufMappingLock);
2452
2453                                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2454
2455                                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2456
2457                                 LogwrtResult = XLogCtl->LogwrtResult;
2458                                 if (LogwrtResult.Write >= OldPageRqstPtr)
2459                                 {
2460                                         /* OK, someone wrote it already */
2461                                         LWLockRelease(WALWriteLock);
2462                                 }
2463                                 else
2464                                 {
2465                                         /* Have to write it ourselves */
2466                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2467                                         WriteRqst.Write = OldPageRqstPtr;
2468                                         WriteRqst.Flush = 0;
2469                                         XLogWrite(WriteRqst, false);
2470                                         LWLockRelease(WALWriteLock);
2471                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2472                                 }
2473                                 /* Re-acquire WALBufMappingLock and retry */
2474                                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2475                                 continue;
2476                         }
2477                 }
2478
2479                 /*
2480                  * Now the next buffer slot is free and we can set it up to be the next
2481                  * output page.
2482                  */
2483                 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2484                 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2485
2486                 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2487
2488                 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2489
2490                 /*
2491                  * Be sure to re-zero the buffer so that bytes beyond what we've
2492                  * written will look like zeroes and not valid XLOG records...
2493                  */
2494                 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2495
2496                 /*
2497                  * Fill the new page's header
2498                  */
2499                 NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
2500
2501                 /* NewPage->xlp_info = 0; */    /* done by memset */
2502                 NewPage   ->xlp_tli = ThisTimeLineID;
2503                 NewPage   ->xlp_pageaddr = NewPageBeginPtr;
2504                 /* NewPage->xlp_rem_len = 0; */         /* done by memset */
2505
2506                 /*
2507                  * If online backup is not in progress, mark the header to indicate
2508                  * that* WAL records beginning in this page have removable backup
2509                  * blocks.  This allows the WAL archiver to know whether it is safe to
2510                  * compress archived WAL data by transforming full-block records into
2511                  * the non-full-block format.  It is sufficient to record this at the
2512                  * page level because we force a page switch (in fact a segment switch)
2513                  * when starting a backup, so the flag will be off before any records
2514                  * can be written during the backup.  At the end of a backup, the last
2515                  * page will be marked as all unsafe when perhaps only part is unsafe,
2516                  * but at worst the archiver would miss the opportunity to compress a
2517                  * few records.
2518                  */
2519                 if (!Insert->forcePageWrites)
2520                         NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;
2521
2522                 /*
2523                  * If first page of an XLOG segment file, make it a long header.
2524                  */
2525                 if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
2526                 {
2527                         XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2528
2529                         NewLongPage->xlp_sysid = ControlFile->system_identifier;
2530                         NewLongPage->xlp_seg_size = XLogSegSize;
2531                         NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2532                         NewPage   ->xlp_info |= XLP_LONG_HEADER;
2533                 }
2534
2535                 /*
2536                  * Make sure the initialization of the page becomes visible to others
2537                  * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2538                  * holding a lock.
2539                  */
2540                 pg_write_barrier();
2541
2542                 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2543
2544                 XLogCtl->InitializedUpTo = NewPageEndPtr;
2545
2546                 npages++;
2547         }
2548         LWLockRelease(WALBufMappingLock);
2549
2550 #ifdef WAL_DEBUG
2551         if (npages > 0)
2552         {
2553                 elog(DEBUG1, "initialized %d pages, upto %X/%X",
2554                          npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
2555         }
2556 #endif
2557 }
2558
2559 /*
2560  * Check whether we've consumed enough xlog space that a checkpoint is needed.
2561  *
2562  * new_segno indicates a log file that has just been filled up (or read
2563  * during recovery). We measure the distance from RedoRecPtr to new_segno
2564  * and see if that exceeds CheckPointSegments.
2565  *
2566  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2567  */
2568 static bool
2569 XLogCheckpointNeeded(XLogSegNo new_segno)
2570 {
2571         XLogSegNo       old_segno;
2572
2573         XLByteToSeg(RedoRecPtr, old_segno);
2574
2575         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2576                 return true;
2577         return false;
2578 }
2579
2580 /*
2581  * Write and/or fsync the log at least as far as WriteRqst indicates.
2582  *
2583  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
2584  * may stop at any convenient boundary (such as a cache or logfile boundary).
2585  * This option allows us to avoid uselessly issuing multiple writes when a
2586  * single one would do.
2587  *
2588  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2589  * must be called before grabbing the lock, to make sure the data is ready to
2590  * write.
2591  */
2592 static void
2593 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2594 {
2595         bool            ispartialpage;
2596         bool            last_iteration;
2597         bool            finishing_seg;
2598         bool            use_existent;
2599         int                     curridx;
2600         int                     npages;
2601         int                     startidx;
2602         uint32          startoffset;
2603
2604         /* We should always be inside a critical section here */
2605         Assert(CritSectionCount > 0);
2606
2607         /*
2608          * Update local LogwrtResult (caller probably did this already, but...)
2609          */
2610         LogwrtResult = XLogCtl->LogwrtResult;
2611
2612         /*
2613          * Since successive pages in the xlog cache are consecutively allocated,
2614          * we can usually gather multiple pages together and issue just one
2615          * write() call.  npages is the number of pages we have determined can be
2616          * written together; startidx is the cache block index of the first one,
2617          * and startoffset is the file offset at which it should go. The latter
2618          * two variables are only valid when npages > 0, but we must initialize
2619          * all of them to keep the compiler quiet.
2620          */
2621         npages = 0;
2622         startidx = 0;
2623         startoffset = 0;
2624
2625         /*
2626          * Within the loop, curridx is the cache block index of the page to
2627          * consider writing.  Begin at the buffer containing the next unwritten
2628          * page, or last partially written page.
2629          */
2630         curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2631
2632         while (LogwrtResult.Write < WriteRqst.Write)
2633         {
2634                 /*
2635                  * Make sure we're not ahead of the insert process.  This could happen
2636                  * if we're passed a bogus WriteRqst.Write that is past the end of the
2637                  * last page that's been initialized by AdvanceXLInsertBuffer.
2638                  */
2639                 XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
2640                 if (LogwrtResult.Write >= EndPtr)
2641                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2642                                  (uint32) (LogwrtResult.Write >> 32),
2643                                  (uint32) LogwrtResult.Write,
2644                                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
2645
2646                 /* Advance LogwrtResult.Write to end of current buffer page */
2647                 LogwrtResult.Write = EndPtr;
2648                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2649
2650                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2651                 {
2652                         /*
2653                          * Switch to new logfile segment.  We cannot have any pending
2654                          * pages here (since we dump what we have at segment end).
2655                          */
2656                         Assert(npages == 0);
2657                         if (openLogFile >= 0)
2658                                 XLogFileClose();
2659                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2660
2661                         /* create/use new log file */
2662                         use_existent = true;
2663                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2664                         openLogOff = 0;
2665                 }
2666
2667                 /* Make sure we have the current logfile open */
2668                 if (openLogFile < 0)
2669                 {
2670                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2671                         openLogFile = XLogFileOpen(openLogSegNo);
2672                         openLogOff = 0;
2673                 }
2674
2675                 /* Add current page to the set of pending pages-to-dump */
2676                 if (npages == 0)
2677                 {
2678                         /* first of group */
2679                         startidx = curridx;
2680                         startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
2681                 }
2682                 npages++;
2683
2684                 /*
2685                  * Dump the set if this will be the last loop iteration, or if we are
2686                  * at the last page of the cache area (since the next page won't be
2687                  * contiguous in memory), or if we are at the end of the logfile
2688                  * segment.
2689                  */
2690                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2691
2692                 finishing_seg = !ispartialpage &&
2693                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
2694
2695                 if (last_iteration ||
2696                         curridx == XLogCtl->XLogCacheBlck ||
2697                         finishing_seg)
2698                 {
2699                         char       *from;
2700                         Size            nbytes;
2701                         Size            nleft;
2702                         int                     written;
2703
2704                         /* Need to seek in the file? */
2705                         if (openLogOff != startoffset)
2706                         {
2707                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
2708                                         ereport(PANIC,
2709                                                         (errcode_for_file_access(),
2710                                          errmsg("could not seek in log file %s to offset %u: %m",
2711                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2712                                                         startoffset)));
2713                                 openLogOff = startoffset;
2714                         }
2715
2716                         /* OK to write the page(s) */
2717                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2718                         nbytes = npages * (Size) XLOG_BLCKSZ;
2719                         nleft = nbytes;
2720                         do
2721                         {
2722                                 errno = 0;
2723                                 written  = write(openLogFile, from, nleft);
2724                                 if (written <= 0)
2725                                 {
2726                                         if (errno == EINTR)
2727                                                 continue;
2728                                         ereport(PANIC,
2729                                                         (errcode_for_file_access(),
2730                                                          errmsg("could not write to log file %s "
2731                                                                         "at offset %u, length %lu: %m",
2732                                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2733                                                                         openLogOff, (unsigned long) nbytes)));
2734                                 }
2735                                 nleft -= written;
2736                                 from += written;
2737                         } while (nleft > 0);
2738
2739                         /* Update state for write */
2740                         openLogOff += nbytes;
2741                         npages = 0;
2742
2743                         /*
2744                          * If we just wrote the whole last page of a logfile segment,
2745                          * fsync the segment immediately.  This avoids having to go back
2746                          * and re-open prior segments when an fsync request comes along
2747                          * later. Doing it here ensures that one and only one backend will
2748                          * perform this fsync.
2749                          *
2750                          * This is also the right place to notify the Archiver that the
2751                          * segment is ready to copy to archival storage, and to update the
2752                          * timer for archive_timeout, and to signal for a checkpoint if
2753                          * too many logfile segments have been used since the last
2754                          * checkpoint.
2755                          */
2756                         if (finishing_seg)
2757                         {
2758                                 issue_xlog_fsync(openLogFile, openLogSegNo);
2759
2760                                 /* signal that we need to wakeup walsenders later */
2761                                 WalSndWakeupRequest();
2762
2763                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
2764
2765                                 if (XLogArchivingActive())
2766                                         XLogArchiveNotifySeg(openLogSegNo);
2767
2768                                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2769
2770                                 /*
2771                                  * Request a checkpoint if we've consumed too much xlog since
2772                                  * the last one.  For speed, we first check using the local
2773                                  * copy of RedoRecPtr, which might be out of date; if it looks
2774                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
2775                                  * recheck.
2776                                  */
2777                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2778                                 {
2779                                         (void) GetRedoRecPtr();
2780                                         if (XLogCheckpointNeeded(openLogSegNo))
2781                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2782                                 }
2783                         }
2784                 }
2785
2786                 if (ispartialpage)
2787                 {
2788                         /* Only asked to write a partial page */
2789                         LogwrtResult.Write = WriteRqst.Write;
2790                         break;
2791                 }
2792                 curridx = NextBufIdx(curridx);
2793
2794                 /* If flexible, break out of loop as soon as we wrote something */
2795                 if (flexible && npages == 0)
2796                         break;
2797         }
2798
2799         Assert(npages == 0);
2800
2801         /*
2802          * If asked to flush, do so
2803          */
2804         if (LogwrtResult.Flush < WriteRqst.Flush &&
2805                 LogwrtResult.Flush < LogwrtResult.Write)
2806
2807         {
2808                 /*
2809                  * Could get here without iterating above loop, in which case we might
2810                  * have no open file or the wrong one.  However, we do not need to
2811                  * fsync more than one file.
2812                  */
2813                 if (sync_method != SYNC_METHOD_OPEN &&
2814                         sync_method != SYNC_METHOD_OPEN_DSYNC)
2815                 {
2816                         if (openLogFile >= 0 &&
2817                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2818                                 XLogFileClose();
2819                         if (openLogFile < 0)
2820                         {
2821                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2822                                 openLogFile = XLogFileOpen(openLogSegNo);
2823                                 openLogOff = 0;
2824                         }
2825
2826                         issue_xlog_fsync(openLogFile, openLogSegNo);
2827                 }
2828
2829                 /* signal that we need to wakeup walsenders later */
2830                 WalSndWakeupRequest();
2831
2832                 LogwrtResult.Flush = LogwrtResult.Write;
2833         }
2834
2835         /*
2836          * Update shared-memory status
2837          *
2838          * We make sure that the shared 'request' values do not fall behind the
2839          * 'result' values.  This is not absolutely essential, but it saves some
2840          * code in a couple of places.
2841          */
2842         {
2843                 /* use volatile pointer to prevent code rearrangement */
2844                 volatile XLogCtlData *xlogctl = XLogCtl;
2845
2846                 SpinLockAcquire(&xlogctl->info_lck);
2847                 xlogctl->LogwrtResult = LogwrtResult;
2848                 if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
2849                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
2850                 if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
2851                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
2852                 SpinLockRelease(&xlogctl->info_lck);
2853         }
2854 }
2855
2856 /*
2857  * Record the LSN for an asynchronous transaction commit/abort
2858  * and nudge the WALWriter if there is work for it to do.
2859  * (This should not be called for synchronous commits.)
2860  */
2861 void
2862 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2863 {
2864         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
2865         bool            sleeping;
2866
2867         /* use volatile pointer to prevent code rearrangement */
2868         volatile XLogCtlData *xlogctl = XLogCtl;
2869
2870         SpinLockAcquire(&xlogctl->info_lck);
2871         LogwrtResult = xlogctl->LogwrtResult;
2872         sleeping = xlogctl->WalWriterSleeping;
2873         if (xlogctl->asyncXactLSN < asyncXactLSN)
2874                 xlogctl->asyncXactLSN = asyncXactLSN;
2875         SpinLockRelease(&xlogctl->info_lck);
2876
2877         /*
2878          * If the WALWriter is sleeping, we should kick it to make it come out of
2879          * low-power mode.      Otherwise, determine whether there's a full page of
2880          * WAL available to write.
2881          */
2882         if (!sleeping)
2883         {
2884                 /* back off to last completed page boundary */
2885                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2886
2887                 /* if we have already flushed that far, we're done */
2888                 if (WriteRqstPtr <= LogwrtResult.Flush)
2889                         return;
2890         }
2891
2892         /*
2893          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2894          * to come out of low-power mode so that this async commit will reach disk
2895          * within the expected amount of time.
2896          */
2897         if (ProcGlobal->walwriterLatch)
2898                 SetLatch(ProcGlobal->walwriterLatch);
2899 }
2900
2901 /*
2902  * Advance minRecoveryPoint in control file.
2903  *
2904  * If we crash during recovery, we must reach this point again before the
2905  * database is consistent.
2906  *
2907  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2908  * is only updated if it's not already greater than or equal to 'lsn'.
2909  */
2910 static void
2911 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2912 {
2913         /* Quick check using our local copy of the variable */
2914         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2915                 return;
2916
2917         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2918
2919         /* update local copy */
2920         minRecoveryPoint = ControlFile->minRecoveryPoint;
2921         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2922
2923         /*
2924          * An invalid minRecoveryPoint means that we need to recover all the WAL,
2925          * i.e., we're doing crash recovery.  We never modify the control file's
2926          * value in that case, so we can short-circuit future checks here too.
2927          */
2928         if (minRecoveryPoint == 0)
2929                 updateMinRecoveryPoint = false;
2930         else if (force || minRecoveryPoint < lsn)
2931         {
2932                 /* use volatile pointer to prevent code rearrangement */
2933                 volatile XLogCtlData *xlogctl = XLogCtl;
2934                 XLogRecPtr      newMinRecoveryPoint;
2935                 TimeLineID      newMinRecoveryPointTLI;
2936
2937                 /*
2938                  * To avoid having to update the control file too often, we update it
2939                  * all the way to the last record being replayed, even though 'lsn'
2940                  * would suffice for correctness.  This also allows the 'force' case
2941                  * to not need a valid 'lsn' value.
2942                  *
2943                  * Another important reason for doing it this way is that the passed
2944                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2945                  * the caller got it from a corrupted heap page.  Accepting such a
2946                  * value as the min recovery point would prevent us from coming up at
2947                  * all.  Instead, we just log a warning and continue with recovery.
2948                  * (See also the comments about corrupt LSNs in XLogFlush.)
2949                  */
2950                 SpinLockAcquire(&xlogctl->info_lck);
2951                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
2952                 newMinRecoveryPointTLI = xlogctl->replayEndTLI;
2953                 SpinLockRelease(&xlogctl->info_lck);
2954
2955                 if (!force && newMinRecoveryPoint < lsn)
2956                         elog(WARNING,
2957                            "xlog min recovery request %X/%X is past current point %X/%X",
2958                                  (uint32) (lsn >> 32), (uint32) lsn,
2959                                  (uint32) (newMinRecoveryPoint >> 32),
2960                                  (uint32) newMinRecoveryPoint);
2961
2962                 /* update control file */
2963                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2964                 {
2965                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2966                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2967                         UpdateControlFile();
2968                         minRecoveryPoint = newMinRecoveryPoint;
2969                         minRecoveryPointTLI = newMinRecoveryPointTLI;
2970
2971                         ereport(DEBUG2,
2972                                 (errmsg("updated min recovery point to %X/%X on timeline %u",
2973                                                 (uint32) (minRecoveryPoint >> 32),
2974                                                 (uint32) minRecoveryPoint,
2975                                                 newMinRecoveryPointTLI)));
2976                 }
2977         }
2978         LWLockRelease(ControlFileLock);
2979 }
2980
2981 /*
2982  * Ensure that all XLOG data through the given position is flushed to disk.
2983  *
2984  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2985  * already held, and we try to avoid acquiring it if possible.
2986  */
2987 void
2988 XLogFlush(XLogRecPtr record)
2989 {
2990         XLogRecPtr      WriteRqstPtr;
2991         XLogwrtRqst WriteRqst;
2992
2993         /*
2994          * During REDO, we are reading not writing WAL.  Therefore, instead of
2995          * trying to flush the WAL, we should update minRecoveryPoint instead. We
2996          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2997          * to act this way too, and because when it tries to write the
2998          * end-of-recovery checkpoint, it should indeed flush.
2999          */
3000         if (!XLogInsertAllowed())
3001         {
3002                 UpdateMinRecoveryPoint(record, false);
3003                 return;
3004         }
3005
3006         /* Quick exit if already known flushed */
3007         if (record <= LogwrtResult.Flush)
3008                 return;
3009
3010 #ifdef WAL_DEBUG
3011         if (XLOG_DEBUG)
3012                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
3013                          (uint32) (record >> 32), (uint32) record,
3014                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3015                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3016 #endif
3017
3018         START_CRIT_SECTION();
3019
3020         /*
3021          * Since fsync is usually a horribly expensive operation, we try to
3022          * piggyback as much data as we can on each fsync: if we see any more data
3023          * entered into the xlog buffer, we'll write and fsync that too, so that
3024          * the final value of LogwrtResult.Flush is as large as possible. This
3025          * gives us some chance of avoiding another fsync immediately after.
3026          */
3027
3028         /* initialize to given target; may increase below */
3029         WriteRqstPtr = record;
3030
3031         /*
3032          * Now wait until we get the write lock, or someone else does the flush
3033          * for us.
3034          */
3035         for (;;)
3036         {
3037                 /* use volatile pointer to prevent code rearrangement */
3038                 volatile XLogCtlData *xlogctl = XLogCtl;
3039                 XLogRecPtr      insertpos;
3040
3041                 /* read LogwrtResult and update local state */
3042                 SpinLockAcquire(&xlogctl->info_lck);
3043                 if (WriteRqstPtr < xlogctl->LogwrtRqst.Write)
3044                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
3045                 LogwrtResult = xlogctl->LogwrtResult;
3046                 SpinLockRelease(&xlogctl->info_lck);
3047
3048                 /* done already? */
3049                 if (record <= LogwrtResult.Flush)
3050                         break;
3051
3052                 /*
3053                  * Before actually performing the write, wait for all in-flight
3054                  * insertions to the pages we're about to write to finish.
3055                  */
3056                 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
3057
3058                 /*
3059                  * Try to get the write lock. If we can't get it immediately, wait
3060                  * until it's released, and recheck if we still need to do the flush
3061                  * or if the backend that held the lock did it for us already. This
3062                  * helps to maintain a good rate of group committing when the system
3063                  * is bottlenecked by the speed of fsyncing.
3064                  */
3065                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
3066                 {
3067                         /*
3068                          * The lock is now free, but we didn't acquire it yet. Before we
3069                          * do, loop back to check if someone else flushed the record for
3070                          * us already.
3071                          */
3072                         continue;
3073                 }
3074
3075                 /* Got the lock; recheck whether request is satisfied */
3076                 LogwrtResult = XLogCtl->LogwrtResult;
3077                 if (record <= LogwrtResult.Flush)
3078                 {
3079                         LWLockRelease(WALWriteLock);
3080                         break;
3081                 }
3082
3083                 /*
3084                  * Sleep before flush! By adding a delay here, we may give further
3085                  * backends the opportunity to join the backlog of group commit
3086                  * followers; this can significantly improve transaction throughput,
3087                  * at the risk of increasing transaction latency.
3088                  *
3089                  * We do not sleep if enableFsync is not turned on, nor if there are
3090                  * fewer than CommitSiblings other backends with active transactions.
3091                  */
3092                 if (CommitDelay > 0 && enableFsync &&
3093                         MinimumActiveBackends(CommitSiblings))
3094                 {
3095                         pg_usleep(CommitDelay);
3096
3097                         /*
3098                          * Re-check how far we can now flush the WAL. It's generally not
3099                          * safe to call WaitXLogInsetionsToFinish while holding
3100                          * WALWriteLock, because an in-progress insertion might need to
3101                          * also grab WALWriteLock to make progress. But we know that all
3102                          * the insertions up to insertpos have already finished, because
3103                          * that's what the earlier WaitXLogInsertionsToFinish() returned.
3104                          * We're only calling it again to allow insertpos to be moved
3105                          * further forward, not to actually wait for anyone.
3106                          */
3107                         insertpos = WaitXLogInsertionsToFinish(insertpos);
3108                 }
3109
3110                 /* try to write/flush later additions to XLOG as well */
3111                 WriteRqst.Write = insertpos;
3112                 WriteRqst.Flush = insertpos;
3113
3114                 XLogWrite(WriteRqst, false);
3115
3116                 LWLockRelease(WALWriteLock);
3117                 /* done */
3118                 break;
3119         }
3120
3121         END_CRIT_SECTION();
3122
3123         /* wake up walsenders now that we've released heavily contended locks */
3124         WalSndWakeupProcessRequests();
3125
3126         /*
3127          * If we still haven't flushed to the request point then we have a
3128          * problem; most likely, the requested flush point is past end of XLOG.
3129          * This has been seen to occur when a disk page has a corrupted LSN.
3130          *
3131          * Formerly we treated this as a PANIC condition, but that hurts the
3132          * system's robustness rather than helping it: we do not want to take down
3133          * the whole system due to corruption on one data page.  In particular, if
3134          * the bad page is encountered again during recovery then we would be
3135          * unable to restart the database at all!  (This scenario actually
3136          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
3137          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
3138          * the only time we can reach here during recovery is while flushing the
3139          * end-of-recovery checkpoint record, and we don't expect that to have a
3140          * bad LSN.
3141          *
3142          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
3143          * since xact.c calls this routine inside a critical section.  However,
3144          * calls from bufmgr.c are not within critical sections and so we will not
3145          * force a restart for a bad LSN on a data page.
3146          */
3147         if (LogwrtResult.Flush < record)
3148                 elog(ERROR,
3149                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
3150                          (uint32) (record >> 32), (uint32) record,
3151                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3152 }
3153
3154 /*
3155  * Flush xlog, but without specifying exactly where to flush to.
3156  *
3157  * We normally flush only completed blocks; but if there is nothing to do on
3158  * that basis, we check for unflushed async commits in the current incomplete
3159  * block, and flush through the latest one of those.  Thus, if async commits
3160  * are not being used, we will flush complete blocks only.      We can guarantee
3161  * that async commits reach disk after at most three cycles; normally only
3162  * one or two.  (When flushing complete blocks, we allow XLogWrite to write
3163  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
3164  * difference only with very high load or long wal_writer_delay, but imposes
3165  * one extra cycle for the worst case for async commits.)
3166  *
3167  * This routine is invoked periodically by the background walwriter process.
3168  *
3169  * Returns TRUE if we flushed anything.
3170  */
3171 bool
3172 XLogBackgroundFlush(void)
3173 {
3174         XLogRecPtr      WriteRqstPtr;
3175         bool            flexible = true;
3176         bool            wrote_something = false;
3177
3178         /* XLOG doesn't need flushing during recovery */
3179         if (RecoveryInProgress())
3180                 return false;
3181
3182         /* read LogwrtResult and update local state */
3183         {
3184                 /* use volatile pointer to prevent code rearrangement */
3185                 volatile XLogCtlData *xlogctl = XLogCtl;
3186
3187                 SpinLockAcquire(&xlogctl->info_lck);
3188                 LogwrtResult = xlogctl->LogwrtResult;
3189                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
3190                 SpinLockRelease(&xlogctl->info_lck);
3191         }
3192
3193         /* back off to last completed page boundary */
3194         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
3195
3196         /* if we have already flushed that far, consider async commit records */
3197         if (WriteRqstPtr <= LogwrtResult.Flush)
3198         {
3199                 /* use volatile pointer to prevent code rearrangement */
3200                 volatile XLogCtlData *xlogctl = XLogCtl;
3201
3202                 SpinLockAcquire(&xlogctl->info_lck);
3203                 WriteRqstPtr = xlogctl->asyncXactLSN;
3204                 SpinLockRelease(&xlogctl->info_lck);
3205                 flexible = false;               /* ensure it all gets written */
3206         }
3207
3208         /*
3209          * If already known flushed, we're done. Just need to check if we are
3210          * holding an open file handle to a logfile that's no longer in use,
3211          * preventing the file from being deleted.
3212          */
3213         if (WriteRqstPtr <= LogwrtResult.Flush)
3214         {
3215                 if (openLogFile >= 0)
3216                 {
3217                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
3218                         {
3219                                 XLogFileClose();
3220                         }
3221                 }
3222                 return false;
3223         }
3224
3225 #ifdef WAL_DEBUG
3226         if (XLOG_DEBUG)
3227                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
3228                          (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
3229                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3230                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3231 #endif
3232
3233         START_CRIT_SECTION();
3234
3235         /* now wait for any in-progress insertions to finish and get write lock */
3236         WaitXLogInsertionsToFinish(WriteRqstPtr);
3237         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3238         LogwrtResult = XLogCtl->LogwrtResult;
3239         if (WriteRqstPtr > LogwrtResult.Flush)
3240         {
3241                 XLogwrtRqst WriteRqst;
3242
3243                 WriteRqst.Write = WriteRqstPtr;
3244                 WriteRqst.Flush = WriteRqstPtr;
3245                 XLogWrite(WriteRqst, flexible);
3246                 wrote_something = true;
3247         }
3248         LWLockRelease(WALWriteLock);
3249
3250         END_CRIT_SECTION();
3251
3252         /* wake up walsenders now that we've released heavily contended locks */
3253         WalSndWakeupProcessRequests();
3254
3255         /*
3256          * Great, done. To take some work off the critical path, try to initialize
3257          * as many of the no-longer-needed WAL buffers for future use as we can.
3258          */
3259         AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3260
3261         return wrote_something;
3262 }
3263
3264 /*
3265  * Test whether XLOG data has been flushed up to (at least) the given position.
3266  *
3267  * Returns true if a flush is still needed.  (It may be that someone else
3268  * is already in process of flushing that far, however.)
3269  */
3270 bool
3271 XLogNeedsFlush(XLogRecPtr record)
3272 {
3273         /*
3274          * During recovery, we don't flush WAL but update minRecoveryPoint
3275          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3276          * would need to be updated.
3277          */
3278         if (RecoveryInProgress())
3279         {
3280                 /* Quick exit if already known updated */
3281                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3282                         return false;
3283
3284                 /*
3285                  * Update local copy of minRecoveryPoint. But if the lock is busy,
3286                  * just return a conservative guess.
3287                  */
3288                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3289                         return true;
3290                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3291                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3292                 LWLockRelease(ControlFileLock);
3293
3294                 /*
3295                  * An invalid minRecoveryPoint means that we need to recover all the
3296                  * WAL, i.e., we're doing crash recovery.  We never modify the control
3297                  * file's value in that case, so we can short-circuit future checks
3298                  * here too.
3299                  */
3300                 if (minRecoveryPoint == 0)
3301                         updateMinRecoveryPoint = false;
3302
3303                 /* check again */
3304                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3305                         return false;
3306                 else
3307                         return true;
3308         }
3309
3310         /* Quick exit if already known flushed */
3311         if (record <= LogwrtResult.Flush)
3312                 return false;
3313
3314         /* read LogwrtResult and update local state */
3315         {
3316                 /* use volatile pointer to prevent code rearrangement */
3317                 volatile XLogCtlData *xlogctl = XLogCtl;
3318
3319                 SpinLockAcquire(&xlogctl->info_lck);
3320                 LogwrtResult = xlogctl->LogwrtResult;
3321                 SpinLockRelease(&xlogctl->info_lck);
3322         }
3323
3324         /* check again */
3325         if (record <= LogwrtResult.Flush)
3326                 return false;
3327
3328         return true;
3329 }
3330
3331 /*
3332  * Create a new XLOG file segment, or open a pre-existing one.
3333  *
3334  * log, seg: identify segment to be created/opened.
3335  *
3336  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
3337  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
3338  * file was used.
3339  *
3340  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3341  * place.  This should be TRUE except during bootstrap log creation.  The
3342  * caller must *not* hold the lock at call.
3343  *
3344  * Returns FD of opened file.
3345  *
3346  * Note: errors here are ERROR not PANIC because we might or might not be
3347  * inside a critical section (eg, during checkpoint there is no reason to
3348  * take down the system on failure).  They will promote to PANIC if we are
3349  * in a critical section.
3350  */
3351 int
3352 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3353 {
3354         char            path[MAXPGPATH];
3355         char            tmppath[MAXPGPATH];
3356         char       *zbuffer;
3357         XLogSegNo       installed_segno;
3358         int                     max_advance;
3359         int                     fd;
3360         int                     nbytes;
3361
3362         XLogFilePath(path, ThisTimeLineID, logsegno);
3363
3364         /*
3365          * Try to use existent file (checkpoint maker may have created it already)
3366          */
3367         if (*use_existent)
3368         {
3369                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3370                                                    S_IRUSR | S_IWUSR);
3371                 if (fd < 0)
3372                 {
3373                         if (errno != ENOENT)
3374                                 ereport(ERROR,
3375                                                 (errcode_for_file_access(),
3376                                                  errmsg("could not open file \"%s\": %m", path)));
3377                 }
3378                 else
3379                         return fd;
3380         }
3381
3382         /*
3383          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
3384          * another process is doing the same thing.  If so, we will end up
3385          * pre-creating an extra log segment.  That seems OK, and better than
3386          * holding the lock throughout this lengthy process.
3387          */
3388         elog(DEBUG2, "creating and filling new WAL file");
3389
3390         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3391
3392         unlink(tmppath);
3393
3394         /*
3395          * Allocate a buffer full of zeros. This is done before opening the file
3396          * so that we don't leak the file descriptor if palloc fails.
3397          *
3398          * Note: palloc zbuffer, instead of just using a local char array, to
3399          * ensure it is reasonably well-aligned; this may save a few cycles
3400          * transferring data to the kernel.
3401          */
3402         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
3403
3404         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3405         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3406                                            S_IRUSR | S_IWUSR);
3407         if (fd < 0)
3408                 ereport(ERROR,
3409                                 (errcode_for_file_access(),
3410                                  errmsg("could not create file \"%s\": %m", tmppath)));
3411
3412         /*
3413          * Zero-fill the file.  We have to do this the hard way to ensure that all
3414          * the file space has really been allocated --- on platforms that allow
3415          * "holes" in files, just seeking to the end doesn't allocate intermediate
3416          * space.  This way, we know that we have all the space and (after the
3417          * fsync below) that all the indirect blocks are down on disk.  Therefore,
3418          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
3419          * log file.
3420          */
3421         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
3422         {
3423                 errno = 0;
3424                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
3425                 {
3426                         int                     save_errno = errno;
3427
3428                         /*
3429                          * If we fail to make the file, delete it to release disk space
3430                          */
3431                         unlink(tmppath);
3432
3433                         close(fd);
3434
3435                         /* if write didn't set errno, assume problem is no disk space */
3436                         errno = save_errno ? save_errno : ENOSPC;
3437
3438                         ereport(ERROR,
3439                                         (errcode_for_file_access(),
3440                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3441                 }
3442         }
3443         pfree(zbuffer);
3444
3445         if (pg_fsync(fd) != 0)
3446         {
3447                 close(fd);
3448                 ereport(ERROR,
3449                                 (errcode_for_file_access(),
3450                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3451         }
3452
3453         if (close(fd))
3454                 ereport(ERROR,
3455                                 (errcode_for_file_access(),
3456                                  errmsg("could not close file \"%s\": %m", tmppath)));
3457
3458         /*
3459          * Now move the segment into place with its final name.
3460          *
3461          * If caller didn't want to use a pre-existing file, get rid of any
3462          * pre-existing file.  Otherwise, cope with possibility that someone else
3463          * has created the file while we were filling ours: if so, use ours to
3464          * pre-create a future log segment.
3465          */
3466         installed_segno = logsegno;
3467         max_advance = XLOGfileslop;
3468         if (!InstallXLogFileSegment(&installed_segno, tmppath,
3469                                                                 *use_existent, &max_advance,
3470                                                                 use_lock))
3471         {
3472                 /*
3473                  * No need for any more future segments, or InstallXLogFileSegment()
3474                  * failed to rename the file into place. If the rename failed, opening
3475                  * the file below will fail.
3476                  */
3477                 unlink(tmppath);
3478         }
3479
3480         /* Set flag to tell caller there was no existent file */
3481         *use_existent = false;
3482
3483         /* Now open original target segment (might not be file I just made) */
3484         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3485                                            S_IRUSR | S_IWUSR);
3486         if (fd < 0)
3487                 ereport(ERROR,
3488                                 (errcode_for_file_access(),
3489                                  errmsg("could not open file \"%s\": %m", path)));
3490
3491         elog(DEBUG2, "done creating and filling new WAL file");
3492
3493         return fd;
3494 }
3495
3496 /*
3497  * Create a new XLOG file segment by copying a pre-existing one.
3498  *
3499  * destsegno: identify segment to be created.
3500  *
3501  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
3502  *              a different timeline)
3503  *
3504  * Currently this is only used during recovery, and so there are no locking
3505  * considerations.      But we should be just as tense as XLogFileInit to avoid
3506  * emplacing a bogus file.
3507  */
3508 static void
3509 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
3510 {
3511         char            path[MAXPGPATH];
3512         char            tmppath[MAXPGPATH];
3513         char            buffer[XLOG_BLCKSZ];
3514         int                     srcfd;
3515         int                     fd;
3516         int                     nbytes;
3517
3518         /*
3519          * Open the source file
3520          */
3521         XLogFilePath(path, srcTLI, srcsegno);
3522         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
3523         if (srcfd < 0)
3524                 ereport(ERROR,
3525                                 (errcode_for_file_access(),
3526                                  errmsg("could not open file \"%s\": %m", path)));
3527
3528         /*
3529          * Copy into a temp file name.
3530          */
3531         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3532
3533         unlink(tmppath);
3534
3535         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3536         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3537                                                    S_IRUSR | S_IWUSR);
3538         if (fd < 0)
3539                 ereport(ERROR,
3540                                 (errcode_for_file_access(),
3541                                  errmsg("could not create file \"%s\": %m", tmppath)));
3542
3543         /*
3544          * Do the data copying.
3545          */
3546         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
3547         {
3548                 errno = 0;
3549                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3550                 {
3551                         if (errno != 0)
3552                                 ereport(ERROR,
3553                                                 (errcode_for_file_access(),
3554                                                  errmsg("could not read file \"%s\": %m", path)));
3555                         else
3556                                 ereport(ERROR,
3557                                                 (errmsg("not enough data in file \"%s\"", path)));
3558                 }
3559                 errno = 0;
3560                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3561                 {
3562                         int                     save_errno = errno;
3563
3564                         /*
3565                          * If we fail to make the file, delete it to release disk space
3566                          */
3567                         unlink(tmppath);
3568                         /* if write didn't set errno, assume problem is no disk space */
3569                         errno = save_errno ? save_errno : ENOSPC;
3570
3571                         ereport(ERROR,
3572                                         (errcode_for_file_access(),
3573                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3574                 }
3575         }
3576
3577         if (pg_fsync(fd) != 0)
3578                 ereport(ERROR,
3579                                 (errcode_for_file_access(),
3580                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3581
3582         if (CloseTransientFile(fd))
3583                 ereport(ERROR,
3584                                 (errcode_for_file_access(),
3585                                  errmsg("could not close file \"%s\": %m", tmppath)));
3586
3587         CloseTransientFile(srcfd);
3588
3589         /*
3590          * Now move the segment into place with its final name.
3591          */
3592         if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
3593                 elog(ERROR, "InstallXLogFileSegment should not have failed");
3594 }
3595
3596 /*
3597  * Install a new XLOG segment file as a current or future log segment.
3598  *
3599  * This is used both to install a newly-created segment (which has a temp
3600  * filename while it's being created) and to recycle an old segment.
3601  *
3602  * *segno: identify segment to install as (or first possible target).
3603  * When find_free is TRUE, this is modified on return to indicate the
3604  * actual installation location or last segment searched.
3605  *
3606  * tmppath: initial name of file to install.  It will be renamed into place.
3607  *
3608  * find_free: if TRUE, install the new segment at the first empty segno
3609  * number at or after the passed numbers.  If FALSE, install the new segment
3610  * exactly where specified, deleting any existing segment file there.
3611  *
3612  * *max_advance: maximum number of segno slots to advance past the starting
3613  * point.  Fail if no free slot is found in this range.  On return, reduced
3614  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
3615  * when find_free is FALSE.)
3616  *
3617  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3618  * place.  This should be TRUE except during bootstrap log creation.  The
3619  * caller must *not* hold the lock at call.
3620  *
3621  * Returns TRUE if the file was installed successfully.  FALSE indicates that
3622  * max_advance limit was exceeded, or an error occurred while renaming the
3623  * file into place.
3624  */
3625 static bool
3626 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3627                                            bool find_free, int *max_advance,
3628                                            bool use_lock)
3629 {
3630         char            path[MAXPGPATH];
3631         struct stat stat_buf;
3632
3633         XLogFilePath(path, ThisTimeLineID, *segno);
3634
3635         /*
3636          * We want to be sure that only one process does this at a time.
3637          */
3638         if (use_lock)
3639                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3640
3641         if (!find_free)
3642         {
3643                 /* Force installation: get rid of any pre-existing segment file */
3644                 unlink(path);
3645         }
3646         else
3647         {
3648                 /* Find a free slot to put it in */
3649                 while (stat(path, &stat_buf) == 0)
3650                 {
3651                         if (*max_advance <= 0)
3652                         {
3653                                 /* Failed to find a free slot within specified range */
3654                                 if (use_lock)
3655                                         LWLockRelease(ControlFileLock);
3656                                 return false;
3657                         }
3658                         (*segno)++;
3659                         (*max_advance)--;
3660                         XLogFilePath(path, ThisTimeLineID, *segno);
3661                 }
3662         }
3663
3664         /*
3665          * Prefer link() to rename() here just to be really sure that we don't
3666          * overwrite an existing logfile.  However, there shouldn't be one, so
3667          * rename() is an acceptable substitute except for the truly paranoid.
3668          */
3669 #if HAVE_WORKING_LINK
3670         if (link(tmppath, path) < 0)
3671         {
3672                 if (use_lock)
3673                         LWLockRelease(ControlFileLock);
3674                 ereport(LOG,
3675                                 (errcode_for_file_access(),
3676                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
3677                                                 tmppath, path)));
3678                 return false;
3679         }
3680         unlink(tmppath);
3681 #else
3682         if (rename(tmppath, path) < 0)
3683         {
3684                 if (use_lock)
3685                         LWLockRelease(ControlFileLock);
3686                 ereport(LOG,
3687                                 (errcode_for_file_access(),
3688                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
3689                                                 tmppath, path)));
3690                 return false;
3691         }
3692 #endif
3693
3694         if (use_lock)
3695                 LWLockRelease(ControlFileLock);
3696
3697         return true;
3698 }
3699
3700 /*
3701  * Open a pre-existing logfile segment for writing.
3702  */
3703 int
3704 XLogFileOpen(XLogSegNo segno)
3705 {
3706         char            path[MAXPGPATH];
3707         int                     fd;
3708
3709         XLogFilePath(path, ThisTimeLineID, segno);
3710
3711         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3712                                            S_IRUSR | S_IWUSR);
3713         if (fd < 0)
3714                 ereport(PANIC,
3715                                 (errcode_for_file_access(),
3716                                  errmsg("could not open transaction log file \"%s\": %m", path)));
3717
3718         return fd;
3719 }
3720
3721 /*
3722  * Open a logfile segment for reading (during recovery).
3723  *
3724  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3725  * Otherwise, it's assumed to be already available in pg_xlog.
3726  */
3727 static int
3728 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3729                          int source, bool notfoundOk)
3730 {
3731         char            xlogfname[MAXFNAMELEN];
3732         char            activitymsg[MAXFNAMELEN + 16];
3733         char            path[MAXPGPATH];
3734         int                     fd;
3735
3736         XLogFileName(xlogfname, tli, segno);
3737
3738         switch (source)
3739         {
3740                 case XLOG_FROM_ARCHIVE:
3741                         /* Report recovery progress in PS display */
3742                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3743                                          xlogfname);
3744                         set_ps_display(activitymsg, false);
3745
3746                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3747                                                                                                           "RECOVERYXLOG",
3748                                                                                                           XLogSegSize,
3749                                                                                                           InRedo);
3750                         if (!restoredFromArchive)
3751                                 return -1;
3752                         break;
3753
3754                 case XLOG_FROM_PG_XLOG:
3755                 case XLOG_FROM_STREAM:
3756                         XLogFilePath(path, tli, segno);
3757                         restoredFromArchive = false;
3758                         break;
3759
3760                 default:
3761                         elog(ERROR, "invalid XLogFileRead source %d", source);
3762         }
3763
3764         /*
3765          * If the segment was fetched from archival storage, replace the existing
3766          * xlog segment (if any) with the archival version.
3767          */
3768         if (source == XLOG_FROM_ARCHIVE)
3769         {
3770                 KeepFileRestoredFromArchive(path, xlogfname);
3771
3772                 /*
3773                  * Set path to point at the new file in pg_xlog.
3774                  */
3775                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3776         }
3777
3778         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
3779         if (fd >= 0)
3780         {
3781                 /* Success! */
3782                 curFileTLI = tli;
3783
3784                 /* Report recovery progress in PS display */
3785                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3786                                  xlogfname);
3787                 set_ps_display(activitymsg, false);
3788
3789                 /* Track source of data in assorted state variables */
3790                 readSource = source;
3791                 XLogReceiptSource = source;
3792                 /* In FROM_STREAM case, caller tracks receipt time, not me */
3793                 if (source != XLOG_FROM_STREAM)
3794                         XLogReceiptTime = GetCurrentTimestamp();
3795
3796                 return fd;
3797         }
3798         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3799                 ereport(PANIC,
3800                                 (errcode_for_file_access(),
3801                                  errmsg("could not open file \"%s\": %m", path)));
3802         return -1;
3803 }
3804
3805 /*
3806  * Open a logfile segment for reading (during recovery).
3807  *
3808  * This version searches for the segment with any TLI listed in expectedTLEs.
3809  */
3810 static int
3811 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3812 {
3813         char            path[MAXPGPATH];
3814         ListCell   *cell;
3815         int                     fd;
3816         List       *tles;
3817
3818         /*
3819          * Loop looking for a suitable timeline ID: we might need to read any of
3820          * the timelines listed in expectedTLEs.
3821          *
3822          * We expect curFileTLI on entry to be the TLI of the preceding file in
3823          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
3824          * to go backwards; this prevents us from picking up the wrong file when a
3825          * parent timeline extends to higher segment numbers than the child we
3826          * want to read.
3827          *
3828          * If we haven't read the timeline history file yet, read it now, so that
3829          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
3830          * however, unless we actually find a valid segment.  That way if there is
3831          * neither a timeline history file nor a WAL segment in the archive, and
3832          * streaming replication is set up, we'll read the timeline history file
3833          * streamed from the master when we start streaming, instead of recovering
3834          * with a dummy history generated here.
3835          */
3836         if (expectedTLEs)
3837                 tles = expectedTLEs;
3838         else
3839                 tles = readTimeLineHistory(recoveryTargetTLI);
3840
3841         foreach(cell, tles)
3842         {
3843                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3844
3845                 if (tli < curFileTLI)
3846                         break;                          /* don't bother looking at too-old TLIs */
3847
3848                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3849                 {
3850                         fd = XLogFileRead(segno, emode, tli,
3851                                                           XLOG_FROM_ARCHIVE, true);
3852                         if (fd != -1)
3853                         {
3854                                 elog(DEBUG1, "got WAL segment from archive");
3855                                 if (!expectedTLEs)
3856                                         expectedTLEs = tles;
3857                                 return fd;
3858                         }
3859                 }
3860
3861                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
3862                 {
3863                         fd = XLogFileRead(segno, emode, tli,
3864                                                           XLOG_FROM_PG_XLOG, true);
3865                         if (fd != -1)
3866                         {
3867                                 if (!expectedTLEs)
3868                                         expectedTLEs = tles;
3869                                 return fd;
3870                         }
3871                 }
3872         }
3873
3874         /* Couldn't find it.  For simplicity, complain about front timeline */
3875         XLogFilePath(path, recoveryTargetTLI, segno);
3876         errno = ENOENT;
3877         ereport(emode,
3878                         (errcode_for_file_access(),
3879                          errmsg("could not open file \"%s\": %m", path)));
3880         return -1;
3881 }
3882
3883 /*
3884  * Close the current logfile segment for writing.
3885  */
3886 static void
3887 XLogFileClose(void)
3888 {
3889         Assert(openLogFile >= 0);
3890
3891         /*
3892          * WAL segment files will not be re-read in normal operation, so we advise
3893          * the OS to release any cached pages.  But do not do so if WAL archiving
3894          * or streaming is active, because archiver and walsender process could
3895          * use the cache to read the WAL segment.
3896          */
3897 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3898         if (!XLogIsNeeded())
3899                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3900 #endif
3901
3902         if (close(openLogFile))
3903                 ereport(PANIC,
3904                                 (errcode_for_file_access(),
3905                                  errmsg("could not close log file %s: %m",
3906                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3907         openLogFile = -1;
3908 }
3909
3910 /*
3911  * Preallocate log files beyond the specified log endpoint.
3912  *
3913  * XXX this is currently extremely conservative, since it forces only one
3914  * future log segment to exist, and even that only if we are 75% done with
3915  * the current one.  This is only appropriate for very low-WAL-volume systems.
3916  * High-volume systems will be OK once they've built up a sufficient set of
3917  * recycled log segments, but the startup transient is likely to include
3918  * a lot of segment creations by foreground processes, which is not so good.
3919  */
3920 static void
3921 PreallocXlogFiles(XLogRecPtr endptr)
3922 {
3923         XLogSegNo       _logSegNo;
3924         int                     lf;
3925         bool            use_existent;
3926
3927         XLByteToPrevSeg(endptr, _logSegNo);
3928         if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
3929         {
3930                 _logSegNo++;
3931                 use_existent = true;
3932                 lf = XLogFileInit(_logSegNo, &use_existent, true);
3933                 close(lf);
3934                 if (!use_existent)
3935                         CheckpointStats.ckpt_segs_added++;
3936         }
3937 }
3938
3939 /*
3940  * Throws an error if the given log segment has already been removed or
3941  * recycled. The caller should only pass a segment that it knows to have
3942  * existed while the server has been running, as this function always
3943  * succeeds if no WAL segments have been removed since startup.
3944  * 'tli' is only used in the error message.
3945  */
3946 void
3947 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3948 {
3949         /* use volatile pointer to prevent code rearrangement */
3950         volatile XLogCtlData *xlogctl = XLogCtl;
3951         XLogSegNo       lastRemovedSegNo;
3952
3953         SpinLockAcquire(&xlogctl->info_lck);
3954         lastRemovedSegNo = xlogctl->lastRemovedSegNo;
3955         SpinLockRelease(&xlogctl->info_lck);
3956
3957         if (segno <= lastRemovedSegNo)
3958         {
3959                 char            filename[MAXFNAMELEN];
3960
3961                 XLogFileName(filename, tli, segno);
3962                 ereport(ERROR,
3963                                 (errcode_for_file_access(),
3964                                  errmsg("requested WAL segment %s has already been removed",
3965                                                 filename)));
3966         }
3967 }
3968
3969 /*
3970  * Update the last removed segno pointer in shared memory, to reflect
3971  * that the given XLOG file has been removed.
3972  */
3973 static void
3974 UpdateLastRemovedPtr(char *filename)
3975 {
3976         /* use volatile pointer to prevent code rearrangement */
3977         volatile XLogCtlData *xlogctl = XLogCtl;
3978         uint32          tli;
3979         XLogSegNo       segno;
3980
3981         XLogFromFileName(filename, &tli, &segno);
3982
3983         SpinLockAcquire(&xlogctl->info_lck);
3984         if (segno > xlogctl->lastRemovedSegNo)
3985                 xlogctl->lastRemovedSegNo = segno;
3986         SpinLockRelease(&xlogctl->info_lck);
3987 }
3988
3989 /*
3990  * Recycle or remove all log files older or equal to passed segno
3991  *
3992  * endptr is current (or recent) end of xlog; this is used to determine
3993  * whether we want to recycle rather than delete no-longer-wanted log files.
3994  */
3995 static void
3996 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
3997 {
3998         XLogSegNo       endlogSegNo;
3999         int                     max_advance;
4000         DIR                *xldir;
4001         struct dirent *xlde;
4002         char            lastoff[MAXFNAMELEN];
4003         char            path[MAXPGPATH];
4004
4005 #ifdef WIN32
4006         char            newpath[MAXPGPATH];
4007 #endif
4008         struct stat statbuf;
4009
4010         /*
4011          * Initialize info about where to try to recycle to.  We allow recycling
4012          * segments up to XLOGfileslop segments beyond the current XLOG location.
4013          */
4014         XLByteToPrevSeg(endptr, endlogSegNo);
4015         max_advance = XLOGfileslop;
4016
4017         xldir = AllocateDir(XLOGDIR);
4018         if (xldir == NULL)
4019                 ereport(ERROR,
4020                                 (errcode_for_file_access(),
4021                                  errmsg("could not open transaction log directory \"%s\": %m",
4022                                                 XLOGDIR)));
4023
4024         /*
4025          * Construct a filename of the last segment to be kept. The timeline ID
4026          * doesn't matter, we ignore that in the comparison. (During recovery,
4027          * ThisTimeLineID isn't set, so we can't use that.)
4028          */
4029         XLogFileName(lastoff, 0, segno);
4030
4031         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
4032                  lastoff);
4033
4034         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4035         {
4036                 /*
4037                  * We ignore the timeline part of the XLOG segment identifiers in
4038                  * deciding whether a segment is still needed.  This ensures that we
4039                  * won't prematurely remove a segment from a parent timeline. We could
4040                  * probably be a little more proactive about removing segments of
4041                  * non-parent timelines, but that would be a whole lot more
4042                  * complicated.
4043                  *
4044                  * We use the alphanumeric sorting property of the filenames to decide
4045                  * which ones are earlier than the lastoff segment.
4046                  */
4047                 if (strlen(xlde->d_name) == 24 &&
4048                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
4049                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
4050                 {
4051                         if (XLogArchiveCheckDone(xlde->d_name))
4052                         {
4053                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4054
4055                                 /* Update the last removed location in shared memory first */
4056                                 UpdateLastRemovedPtr(xlde->d_name);
4057
4058                                 /*
4059                                  * Before deleting the file, see if it can be recycled as a
4060                                  * future log segment. Only recycle normal files, pg_standby
4061                                  * for example can create symbolic links pointing to a
4062                                  * separate archive directory.
4063                                  */
4064                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
4065                                         InstallXLogFileSegment(&endlogSegNo, path,
4066                                                                                    true, &max_advance, true))
4067                                 {
4068                                         ereport(DEBUG2,
4069                                                         (errmsg("recycled transaction log file \"%s\"",
4070                                                                         xlde->d_name)));
4071                                         CheckpointStats.ckpt_segs_recycled++;
4072                                         /* Needn't recheck that slot on future iterations */
4073                                         if (max_advance > 0)
4074                                         {
4075                                                 endlogSegNo++;
4076                                                 max_advance--;
4077                                         }
4078                                 }
4079                                 else
4080                                 {
4081                                         /* No need for any more future segments... */
4082                                         int                     rc;
4083
4084                                         ereport(DEBUG2,
4085                                                         (errmsg("removing transaction log file \"%s\"",
4086                                                                         xlde->d_name)));
4087
4088 #ifdef WIN32
4089
4090                                         /*
4091                                          * On Windows, if another process (e.g another backend)
4092                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
4093                                          * will succeed, but the file will still show up in
4094                                          * directory listing until the last handle is closed. To
4095                                          * avoid confusing the lingering deleted file for a live
4096                                          * WAL file that needs to be archived, rename it before
4097                                          * deleting it.
4098                                          *
4099                                          * If another process holds the file open without
4100                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
4101                                          * again at the next checkpoint.
4102                                          */
4103                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4104                                         if (rename(path, newpath) != 0)
4105                                         {
4106                                                 ereport(LOG,
4107                                                                 (errcode_for_file_access(),
4108                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
4109                                                                                 path)));
4110                                                 continue;
4111                                         }
4112                                         rc = unlink(newpath);
4113 #else
4114                                         rc = unlink(path);
4115 #endif
4116                                         if (rc != 0)
4117                                         {
4118                                                 ereport(LOG,
4119                                                                 (errcode_for_file_access(),
4120                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
4121                                                                                 path)));
4122                                                 continue;
4123                                         }
4124                                         CheckpointStats.ckpt_segs_removed++;
4125                                 }
4126
4127                                 XLogArchiveCleanup(xlde->d_name);
4128                         }
4129                 }
4130         }
4131
4132         FreeDir(xldir);
4133 }
4134
4135 /*
4136  * Verify whether pg_xlog and pg_xlog/archive_status exist.
4137  * If the latter does not exist, recreate it.
4138  *
4139  * It is not the goal of this function to verify the contents of these
4140  * directories, but to help in cases where someone has performed a cluster
4141  * copy for PITR purposes but omitted pg_xlog from the copy.
4142  *
4143  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
4144  * policy decision was made not to.  It is fairly common for pg_xlog to be
4145  * a symlink, and if that was the DBA's intent then automatically making a
4146  * plain directory would result in degraded performance with no notice.
4147  */
4148 static void
4149 ValidateXLOGDirectoryStructure(void)
4150 {
4151         char            path[MAXPGPATH];
4152         struct stat stat_buf;
4153
4154         /* Check for pg_xlog; if it doesn't exist, error out */
4155         if (stat(XLOGDIR, &stat_buf) != 0 ||
4156                 !S_ISDIR(stat_buf.st_mode))
4157                 ereport(FATAL,
4158                                 (errmsg("required WAL directory \"%s\" does not exist",
4159                                                 XLOGDIR)));
4160
4161         /* Check for archive_status */
4162         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4163         if (stat(path, &stat_buf) == 0)
4164         {
4165                 /* Check for weird cases where it exists but isn't a directory */
4166                 if (!S_ISDIR(stat_buf.st_mode))
4167                         ereport(FATAL,
4168                                         (errmsg("required WAL directory \"%s\" does not exist",
4169                                                         path)));
4170         }
4171         else
4172         {
4173                 ereport(LOG,
4174                                 (errmsg("creating missing WAL directory \"%s\"", path)));
4175                 if (mkdir(path, S_IRWXU) < 0)
4176                         ereport(FATAL,
4177                                         (errmsg("could not create missing directory \"%s\": %m",
4178                                                         path)));
4179         }
4180 }
4181
4182 /*
4183  * Remove previous backup history files.  This also retries creation of
4184  * .ready files for any backup history files for which XLogArchiveNotify
4185  * failed earlier.
4186  */
4187 static void
4188 CleanupBackupHistory(void)
4189 {
4190         DIR                *xldir;
4191         struct dirent *xlde;
4192         char            path[MAXPGPATH];
4193
4194         xldir = AllocateDir(XLOGDIR);
4195         if (xldir == NULL)
4196                 ereport(ERROR,
4197                                 (errcode_for_file_access(),
4198                                  errmsg("could not open transaction log directory \"%s\": %m",
4199                                                 XLOGDIR)));
4200
4201         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4202         {
4203                 if (strlen(xlde->d_name) > 24 &&
4204                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
4205                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
4206                                    ".backup") == 0)
4207                 {
4208                         if (XLogArchiveCheckDone(xlde->d_name))
4209                         {
4210                                 ereport(DEBUG2,
4211                                 (errmsg("removing transaction log backup history file \"%s\"",
4212                                                 xlde->d_name)));
4213                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4214                                 unlink(path);
4215                                 XLogArchiveCleanup(xlde->d_name);
4216                         }
4217                 }
4218         }
4219
4220         FreeDir(xldir);
4221 }
4222
4223 /*
4224  * Restore a full-page image from a backup block attached to an XLOG record.
4225  *
4226  * lsn: LSN of the XLOG record being replayed
4227  * record: the complete XLOG record
4228  * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1)
4229  * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock
4230  * keep_buffer: TRUE to return the buffer still locked and pinned
4231  *
4232  * Returns the buffer number containing the page.  Note this is not terribly
4233  * useful unless keep_buffer is specified as TRUE.
4234  *
4235  * Note: when a backup block is available in XLOG, we restore it
4236  * unconditionally, even if the page in the database appears newer.
4237  * This is to protect ourselves against database pages that were partially
4238  * or incorrectly written during a crash.  We assume that the XLOG data
4239  * must be good because it has passed a CRC check, while the database
4240  * page might not be.  This will force us to replay all subsequent
4241  * modifications of the page that appear in XLOG, rather than possibly
4242  * ignoring them as already applied, but that's not a huge drawback.
4243  *
4244  * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer,
4245  * else a normal exclusive lock is used.  During crash recovery, that's just
4246  * pro forma because there can't be any regular backends in the system, but
4247  * in hot standby mode the distinction is important.
4248  *
4249  * If 'keep_buffer' is true, return without releasing the buffer lock and pin;
4250  * then caller is responsible for doing UnlockReleaseBuffer() later.  This
4251  * is needed in some cases when replaying XLOG records that touch multiple
4252  * pages, to prevent inconsistent states from being visible to other backends.
4253  * (Again, that's only important in hot standby mode.)
4254  */
4255 Buffer
4256 RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
4257                                    bool get_cleanup_lock, bool keep_buffer)
4258 {
4259         BkpBlock        bkpb;
4260         char       *blk;
4261         int                     i;
4262
4263         /* Locate requested BkpBlock in the record */
4264         blk = (char *) XLogRecGetData(record) + record->xl_len;
4265         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
4266         {
4267                 if (!(record->xl_info & XLR_BKP_BLOCK(i)))
4268                         continue;
4269
4270                 memcpy(&bkpb, blk, sizeof(BkpBlock));
4271                 blk += sizeof(BkpBlock);
4272
4273                 if (i == block_index)
4274                 {
4275                         /* Found it, apply the update */
4276                         return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock,
4277                                                                                           keep_buffer);
4278                 }
4279
4280                 blk += BLCKSZ - bkpb.hole_length;
4281         }
4282
4283         /* Caller specified a bogus block_index */
4284         elog(ERROR, "failed to restore block_index %d", block_index);
4285         return InvalidBuffer;           /* keep compiler quiet */
4286 }
4287
4288 /*
4289  * Workhorse for RestoreBackupBlock usable without an xlog record
4290  *
4291  * Restores a full-page image from BkpBlock and a data pointer.
4292  */
4293 static Buffer
4294 RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
4295                                                    bool get_cleanup_lock, bool keep_buffer)
4296 {
4297         Buffer          buffer;
4298         Page            page;
4299
4300         buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
4301                                                                         RBM_ZERO);
4302         Assert(BufferIsValid(buffer));
4303         if (get_cleanup_lock)
4304                 LockBufferForCleanup(buffer);
4305         else
4306                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4307
4308         page = (Page) BufferGetPage(buffer);
4309
4310         if (bkpb.hole_length == 0)
4311         {
4312                 memcpy((char *) page, blk, BLCKSZ);
4313         }
4314         else
4315         {
4316                 memcpy((char *) page, blk, bkpb.hole_offset);
4317                 /* must zero-fill the hole */
4318                 MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
4319                 memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
4320                            blk + bkpb.hole_offset,
4321                            BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
4322         }
4323
4324         /*
4325          * The checksum value on this page is currently invalid. We don't need to
4326          * reset it here since it will be set before being written.
4327          */
4328
4329         PageSetLSN(page, lsn);
4330         MarkBufferDirty(buffer);
4331
4332         if (!keep_buffer)
4333                 UnlockReleaseBuffer(buffer);
4334
4335         return buffer;
4336 }
4337
4338 /*
4339  * Attempt to read an XLOG record.
4340  *
4341  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
4342  * try to read a record just after the last one previously read.
4343  *
4344  * If no valid record is available, returns NULL, or fails if emode is PANIC.
4345  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4346  * record is available.
4347  *
4348  * The record is copied into readRecordBuf, so that on successful return,
4349  * the returned record pointer always points there.
4350  */
4351 static XLogRecord *
4352 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
4353                    bool fetching_ckpt)
4354 {
4355         XLogRecord *record;
4356         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4357
4358         /* Pass through parameters to XLogPageRead */
4359         private->fetching_ckpt = fetching_ckpt;
4360         private->emode = emode;
4361         private->randAccess = (RecPtr != InvalidXLogRecPtr);
4362
4363         /* This is the first attempt to read this page. */
4364         lastSourceFailed = false;
4365
4366         for (;;)
4367         {
4368                 char       *errormsg;
4369
4370                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
4371                 ReadRecPtr = xlogreader->ReadRecPtr;
4372                 EndRecPtr = xlogreader->EndRecPtr;
4373                 if (record == NULL)
4374                 {
4375                         if (readFile >= 0)
4376                         {
4377                                 close(readFile);
4378                                 readFile = -1;
4379                         }
4380
4381                         /*
4382                          * We only end up here without a message when XLogPageRead()
4383                          * failed - in that case we already logged something. In
4384                          * StandbyMode that only happens if we have been triggered, so we
4385                          * shouldn't loop anymore in that case.
4386                          */
4387                         if (errormsg)
4388                                 ereport(emode_for_corrupt_record(emode,
4389                                                                                                  RecPtr ? RecPtr : EndRecPtr),
4390                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
4391                 }
4392
4393                 /*
4394                  * Check page TLI is one of the expected values.
4395                  */
4396                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4397                 {
4398                         char            fname[MAXFNAMELEN];
4399                         XLogSegNo       segno;
4400                         int32           offset;
4401
4402                         XLByteToSeg(xlogreader->latestPagePtr, segno);
4403                         offset = xlogreader->latestPagePtr % XLogSegSize;
4404                         XLogFileName(fname, xlogreader->readPageTLI, segno);
4405                         ereport(emode_for_corrupt_record(emode,
4406                                                                                          RecPtr ? RecPtr : EndRecPtr),
4407                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4408                                         xlogreader->latestPageTLI,
4409                                         fname,
4410                                         offset)));
4411                         record = NULL;
4412                 }
4413
4414                 if (record)
4415                 {
4416                         /* Great, got a record */
4417                         return record;
4418                 }
4419                 else
4420                 {
4421                         /* No valid record available from this source */
4422                         lastSourceFailed = true;
4423
4424                         /*
4425                          * If archive recovery was requested, but we were still doing
4426                          * crash recovery, switch to archive recovery and retry using the
4427                          * offline archive. We have now replayed all the valid WAL in
4428                          * pg_xlog, so we are presumably now consistent.
4429                          *
4430                          * We require that there's at least some valid WAL present in
4431                          * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
4432                          * from the archive, even if pg_xlog is completely empty, but we'd
4433                          * have no idea how far we'd have to replay to reach consistency.
4434                          * So err on the safe side and give up.
4435                          */
4436                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4437                                 !fetching_ckpt)
4438                         {
4439                                 ereport(DEBUG1,
4440                                                 (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
4441                                 InArchiveRecovery = true;
4442                                 if (StandbyModeRequested)
4443                                         StandbyMode = true;
4444
4445                                 /* initialize minRecoveryPoint to this record */
4446                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4447                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4448                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
4449                                 {
4450                                         ControlFile->minRecoveryPoint = EndRecPtr;
4451                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4452                                 }
4453                                 /* update local copy */
4454                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
4455                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4456
4457                                 UpdateControlFile();
4458                                 LWLockRelease(ControlFileLock);
4459
4460                                 CheckRecoveryConsistency();
4461
4462                                 /*
4463                                  * Before we retry, reset lastSourceFailed and currentSource
4464                                  * so that we will check the archive next.
4465                                  */
4466                                 lastSourceFailed = false;
4467                                 currentSource = 0;
4468
4469                                 continue;
4470                         }
4471
4472                         /* In standby mode, loop back to retry. Otherwise, give up. */
4473                         if (StandbyMode && !CheckForStandbyTrigger())
4474                                 continue;
4475                         else
4476                                 return NULL;
4477                 }
4478         }
4479 }
4480
4481 /*
4482  * Scan for new timelines that might have appeared in the archive since we
4483  * started recovery.
4484  *
4485  * If there are any, the function changes recovery target TLI to the latest
4486  * one and returns 'true'.
4487  */
4488 static bool
4489 rescanLatestTimeLine(void)
4490 {
4491         List       *newExpectedTLEs;
4492         bool            found;
4493         ListCell   *cell;
4494         TimeLineID      newtarget;
4495         TimeLineID      oldtarget = recoveryTargetTLI;
4496         TimeLineHistoryEntry *currentTle = NULL;
4497
4498         newtarget = findNewestTimeLine(recoveryTargetTLI);
4499         if (newtarget == recoveryTargetTLI)
4500         {
4501                 /* No new timelines found */
4502                 return false;
4503         }
4504
4505         /*
4506          * Determine the list of expected TLIs for the new TLI
4507          */
4508
4509         newExpectedTLEs = readTimeLineHistory(newtarget);
4510
4511         /*
4512          * If the current timeline is not part of the history of the new timeline,
4513          * we cannot proceed to it.
4514          */
4515         found = false;
4516         foreach(cell, newExpectedTLEs)
4517         {
4518                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4519
4520                 if (currentTle->tli == recoveryTargetTLI)
4521                 {
4522                         found = true;
4523                         break;
4524                 }
4525         }
4526         if (!found)
4527         {
4528                 ereport(LOG,
4529                                 (errmsg("new timeline %u is not a child of database system timeline %u",
4530                                                 newtarget,
4531                                                 ThisTimeLineID)));
4532                 return false;
4533         }
4534
4535         /*
4536          * The current timeline was found in the history file, but check that the
4537          * next timeline was forked off from it *after* the current recovery
4538          * location.
4539          */
4540         if (currentTle->end < EndRecPtr)
4541         {
4542                 ereport(LOG,
4543                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4544                                                 newtarget,
4545                                                 ThisTimeLineID,
4546                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4547                 return false;
4548         }
4549
4550         /* The new timeline history seems valid. Switch target */
4551         recoveryTargetTLI = newtarget;
4552         list_free_deep(expectedTLEs);
4553         expectedTLEs = newExpectedTLEs;
4554
4555         /*
4556          * As in StartupXLOG(), try to ensure we have all the history files
4557          * between the old target and new target in pg_xlog.
4558          */
4559         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4560
4561         ereport(LOG,
4562                         (errmsg("new target timeline is %u",
4563                                         recoveryTargetTLI)));
4564
4565         return true;
4566 }
4567
4568 /*
4569  * I/O routines for pg_control
4570  *
4571  * *ControlFile is a buffer in shared memory that holds an image of the
4572  * contents of pg_control.      WriteControlFile() initializes pg_control
4573  * given a preloaded buffer, ReadControlFile() loads the buffer from
4574  * the pg_control file (during postmaster or standalone-backend startup),
4575  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4576  *
4577  * For simplicity, WriteControlFile() initializes the fields of pg_control
4578  * that are related to checking backend/database compatibility, and
4579  * ReadControlFile() verifies they are correct.  We could split out the
4580  * I/O and compatibility-check functions, but there seems no need currently.
4581  */
4582 static void
4583 WriteControlFile(void)
4584 {
4585         int                     fd;
4586         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4587
4588         /*
4589          * Initialize version and compatibility-check fields
4590          */
4591         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4592         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4593
4594         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4595         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4596
4597         ControlFile->blcksz = BLCKSZ;
4598         ControlFile->relseg_size = RELSEG_SIZE;
4599         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4600         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4601
4602         ControlFile->nameDataLen = NAMEDATALEN;
4603         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4604
4605         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4606
4607 #ifdef HAVE_INT64_TIMESTAMP
4608         ControlFile->enableIntTimes = true;
4609 #else
4610         ControlFile->enableIntTimes = false;
4611 #endif
4612         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4613         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4614
4615         /* Contents are protected with a CRC */
4616         INIT_CRC32(ControlFile->crc);
4617         COMP_CRC32(ControlFile->crc,
4618                            (char *) ControlFile,
4619                            offsetof(ControlFileData, crc));
4620         FIN_CRC32(ControlFile->crc);
4621
4622         /*
4623          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4624          * excess over sizeof(ControlFileData).  This reduces the odds of
4625          * premature-EOF errors when reading pg_control.  We'll still fail when we
4626          * check the contents of the file, but hopefully with a more specific
4627          * error than "couldn't read pg_control".
4628          */
4629         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4630                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4631
4632         memset(buffer, 0, PG_CONTROL_SIZE);
4633         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4634
4635         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4636                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4637                                            S_IRUSR | S_IWUSR);
4638         if (fd < 0)
4639                 ereport(PANIC,
4640                                 (errcode_for_file_access(),
4641                                  errmsg("could not create control file \"%s\": %m",
4642                                                 XLOG_CONTROL_FILE)));
4643
4644         errno = 0;
4645         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4646         {
4647                 /* if write didn't set errno, assume problem is no disk space */
4648                 if (errno == 0)
4649                         errno = ENOSPC;
4650                 ereport(PANIC,
4651                                 (errcode_for_file_access(),
4652                                  errmsg("could not write to control file: %m")));
4653         }
4654
4655         if (pg_fsync(fd) != 0)
4656                 ereport(PANIC,
4657                                 (errcode_for_file_access(),
4658                                  errmsg("could not fsync control file: %m")));
4659
4660         if (close(fd))
4661                 ereport(PANIC,
4662                                 (errcode_for_file_access(),
4663                                  errmsg("could not close control file: %m")));
4664 }
4665
4666 static void
4667 ReadControlFile(void)
4668 {
4669         pg_crc32        crc;
4670         int                     fd;
4671
4672         /*
4673          * Read data...
4674          */
4675         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4676                                            O_RDWR | PG_BINARY,
4677                                            S_IRUSR | S_IWUSR);
4678         if (fd < 0)
4679                 ereport(PANIC,
4680                                 (errcode_for_file_access(),
4681                                  errmsg("could not open control file \"%s\": %m",
4682                                                 XLOG_CONTROL_FILE)));
4683
4684         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4685                 ereport(PANIC,
4686                                 (errcode_for_file_access(),
4687                                  errmsg("could not read from control file: %m")));
4688
4689         close(fd);
4690
4691         /*
4692          * Check for expected pg_control format version.  If this is wrong, the
4693          * CRC check will likely fail because we'll be checking the wrong number
4694          * of bytes.  Complaining about wrong version will probably be more
4695          * enlightening than complaining about wrong CRC.
4696          */
4697
4698         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4699                 ereport(FATAL,
4700                                 (errmsg("database files are incompatible with server"),
4701                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4702                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4703                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4704                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4705                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4706
4707         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4708                 ereport(FATAL,
4709                                 (errmsg("database files are incompatible with server"),
4710                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4711                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4712                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4713                                  errhint("It looks like you need to initdb.")));
4714
4715         /* Now check the CRC. */
4716         INIT_CRC32(crc);
4717         COMP_CRC32(crc,
4718                            (char *) ControlFile,
4719                            offsetof(ControlFileData, crc));
4720         FIN_CRC32(crc);
4721
4722         if (!EQ_CRC32(crc, ControlFile->crc))
4723                 ereport(FATAL,
4724                                 (errmsg("incorrect checksum in control file")));
4725
4726         /*
4727          * Do compatibility checking immediately.  If the database isn't
4728          * compatible with the backend executable, we want to abort before we can
4729          * possibly do any damage.
4730          */
4731         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4732                 ereport(FATAL,
4733                                 (errmsg("database files are incompatible with server"),
4734                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4735                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4736                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4737                                  errhint("It looks like you need to initdb.")));
4738         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4739                 ereport(FATAL,
4740                                 (errmsg("database files are incompatible with server"),
4741                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4742                                          " but the server was compiled with MAXALIGN %d.",
4743                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4744                                  errhint("It looks like you need to initdb.")));
4745         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4746                 ereport(FATAL,
4747                                 (errmsg("database files are incompatible with server"),
4748                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4749                                  errhint("It looks like you need to initdb.")));
4750         if (ControlFile->blcksz != BLCKSZ)
4751                 ereport(FATAL,
4752                                 (errmsg("database files are incompatible with server"),
4753                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4754                                            " but the server was compiled with BLCKSZ %d.",
4755                                            ControlFile->blcksz, BLCKSZ),
4756                                  errhint("It looks like you need to recompile or initdb.")));
4757         if (ControlFile->relseg_size != RELSEG_SIZE)
4758                 ereport(FATAL,
4759                                 (errmsg("database files are incompatible with server"),
4760                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4761                                   " but the server was compiled with RELSEG_SIZE %d.",
4762                                   ControlFile->relseg_size, RELSEG_SIZE),
4763                                  errhint("It looks like you need to recompile or initdb.")));
4764         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4765                 ereport(FATAL,
4766                                 (errmsg("database files are incompatible with server"),
4767                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4768                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4769                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4770                                  errhint("It looks like you need to recompile or initdb.")));
4771         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4772                 ereport(FATAL,
4773                                 (errmsg("database files are incompatible with server"),
4774                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4775                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4776                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4777                                  errhint("It looks like you need to recompile or initdb.")));
4778         if (ControlFile->nameDataLen != NAMEDATALEN)
4779                 ereport(FATAL,
4780                                 (errmsg("database files are incompatible with server"),
4781                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4782                                   " but the server was compiled with NAMEDATALEN %d.",
4783                                   ControlFile->nameDataLen, NAMEDATALEN),
4784                                  errhint("It looks like you need to recompile or initdb.")));
4785         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4786                 ereport(FATAL,
4787                                 (errmsg("database files are incompatible with server"),
4788                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4789                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4790                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4791                                  errhint("It looks like you need to recompile or initdb.")));
4792         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4793                 ereport(FATAL,
4794                                 (errmsg("database files are incompatible with server"),
4795                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4796                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4797                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4798                                  errhint("It looks like you need to recompile or initdb.")));
4799
4800 #ifdef HAVE_INT64_TIMESTAMP
4801         if (ControlFile->enableIntTimes != true)
4802                 ereport(FATAL,
4803                                 (errmsg("database files are incompatible with server"),
4804                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4805                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4806                                  errhint("It looks like you need to recompile or initdb.")));
4807 #else
4808         if (ControlFile->enableIntTimes != false)
4809                 ereport(FATAL,
4810                                 (errmsg("database files are incompatible with server"),
4811                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4812                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4813                                  errhint("It looks like you need to recompile or initdb.")));
4814 #endif
4815
4816 #ifdef USE_FLOAT4_BYVAL
4817         if (ControlFile->float4ByVal != true)
4818                 ereport(FATAL,
4819                                 (errmsg("database files are incompatible with server"),
4820                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4821                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4822                                  errhint("It looks like you need to recompile or initdb.")));
4823 #else
4824         if (ControlFile->float4ByVal != false)
4825                 ereport(FATAL,
4826                                 (errmsg("database files are incompatible with server"),
4827                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4828                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4829                                  errhint("It looks like you need to recompile or initdb.")));
4830 #endif
4831
4832 #ifdef USE_FLOAT8_BYVAL
4833         if (ControlFile->float8ByVal != true)
4834                 ereport(FATAL,
4835                                 (errmsg("database files are incompatible with server"),
4836                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4837                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4838                                  errhint("It looks like you need to recompile or initdb.")));
4839 #else
4840         if (ControlFile->float8ByVal != false)
4841                 ereport(FATAL,
4842                                 (errmsg("database files are incompatible with server"),
4843                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4844                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4845                                  errhint("It looks like you need to recompile or initdb.")));
4846 #endif
4847
4848         /* Make the fixed  settings visible as GUC variables, too */
4849         SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4850                                         PGC_INTERNAL, PGC_S_OVERRIDE);
4851 }
4852
4853 void
4854 UpdateControlFile(void)
4855 {
4856         int                     fd;
4857
4858         INIT_CRC32(ControlFile->crc);
4859         COMP_CRC32(ControlFile->crc,
4860                            (char *) ControlFile,
4861                            offsetof(ControlFileData, crc));
4862         FIN_CRC32(ControlFile->crc);
4863
4864         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4865                                            O_RDWR | PG_BINARY,
4866                                            S_IRUSR | S_IWUSR);
4867         if (fd < 0)
4868                 ereport(PANIC,
4869                                 (errcode_for_file_access(),
4870                                  errmsg("could not open control file \"%s\": %m",
4871                                                 XLOG_CONTROL_FILE)));
4872
4873         errno = 0;
4874         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4875         {
4876                 /* if write didn't set errno, assume problem is no disk space */
4877                 if (errno == 0)
4878                         errno = ENOSPC;
4879                 ereport(PANIC,
4880                                 (errcode_for_file_access(),
4881                                  errmsg("could not write to control file: %m")));
4882         }
4883
4884         if (pg_fsync(fd) != 0)
4885                 ereport(PANIC,
4886                                 (errcode_for_file_access(),
4887                                  errmsg("could not fsync control file: %m")));
4888
4889         if (close(fd))
4890                 ereport(PANIC,
4891                                 (errcode_for_file_access(),
4892                                  errmsg("could not close control file: %m")));
4893 }
4894
4895 /*
4896  * Returns the unique system identifier from control file.
4897  */
4898 uint64
4899 GetSystemIdentifier(void)
4900 {
4901         Assert(ControlFile != NULL);
4902         return ControlFile->system_identifier;
4903 }
4904
4905 /*
4906  * Are checksums enabled for data pages?
4907  */
4908 bool
4909 DataChecksumsEnabled(void)
4910 {
4911         Assert(ControlFile != NULL);
4912         return (ControlFile->data_checksum_version > 0);
4913 }
4914
4915 /*
4916  * Returns a fake LSN for unlogged relations.
4917  *
4918  * Each call generates an LSN that is greater than any previous value
4919  * returned. The current counter value is saved and restored across clean
4920  * shutdowns, but like unlogged relations, does not survive a crash. This can
4921  * be used in lieu of real LSN values returned by XLogInsert, if you need an
4922  * LSN-like increasing sequence of numbers without writing any WAL.
4923  */
4924 XLogRecPtr
4925 GetFakeLSNForUnloggedRel(void)
4926 {
4927         XLogRecPtr      nextUnloggedLSN;
4928
4929         /* use volatile pointer to prevent code rearrangement */
4930         volatile XLogCtlData *xlogctl = XLogCtl;
4931
4932         /* increment the unloggedLSN counter, need SpinLock */
4933         SpinLockAcquire(&xlogctl->ulsn_lck);
4934         nextUnloggedLSN = xlogctl->unloggedLSN++;
4935         SpinLockRelease(&xlogctl->ulsn_lck);
4936
4937         return nextUnloggedLSN;
4938 }
4939
4940 /*
4941  * Auto-tune the number of XLOG buffers.
4942  *
4943  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4944  * a maximum of one XLOG segment (there is little reason to think that more
4945  * is helpful, at least so long as we force an fsync when switching log files)
4946  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4947  * 9.1, when auto-tuning was added).
4948  *
4949  * This should not be called until NBuffers has received its final value.
4950  */
4951 static int
4952 XLOGChooseNumBuffers(void)
4953 {
4954         int                     xbuffers;
4955
4956         xbuffers = NBuffers / 32;
4957         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
4958                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
4959         if (xbuffers < 8)
4960                 xbuffers = 8;
4961         return xbuffers;
4962 }
4963
4964 /*
4965  * GUC check_hook for wal_buffers
4966  */
4967 bool
4968 check_wal_buffers(int *newval, void **extra, GucSource source)
4969 {
4970         /*
4971          * -1 indicates a request for auto-tune.
4972          */
4973         if (*newval == -1)
4974         {
4975                 /*
4976                  * If we haven't yet changed the boot_val default of -1, just let it
4977                  * be.  We'll fix it when XLOGShmemSize is called.
4978                  */
4979                 if (XLOGbuffers == -1)
4980                         return true;
4981
4982                 /* Otherwise, substitute the auto-tune value */
4983                 *newval = XLOGChooseNumBuffers();
4984         }
4985
4986         /*
4987          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4988          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4989          * the case, we just silently treat such values as a request for the
4990          * minimum.  (We could throw an error instead, but that doesn't seem very
4991          * helpful.)
4992          */
4993         if (*newval < 4)
4994                 *newval = 4;
4995
4996         return true;
4997 }
4998
4999 /*
5000  * Initialization of shared memory for XLOG
5001  */
5002 Size
5003 XLOGShmemSize(void)
5004 {
5005         Size            size;
5006
5007         /*
5008          * If the value of wal_buffers is -1, use the preferred auto-tune value.
5009          * This isn't an amazingly clean place to do this, but we must wait till
5010          * NBuffers has received its final value, and must do it before using the
5011          * value of XLOGbuffers to do anything important.
5012          */
5013         if (XLOGbuffers == -1)
5014         {
5015                 char            buf[32];
5016
5017                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
5018                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
5019         }
5020         Assert(XLOGbuffers > 0);
5021
5022         /* XLogCtl */
5023         size = sizeof(XLogCtlData);
5024
5025         /* xlog insertion slots, plus alignment */
5026         size = add_size(size, mul_size(sizeof(XLogInsertSlotPadded), num_xloginsert_slots + 1));
5027         /* xlblocks array */
5028         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
5029         /* extra alignment padding for XLOG I/O buffers */
5030         size = add_size(size, XLOG_BLCKSZ);
5031         /* and the buffers themselves */
5032         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
5033
5034         /*
5035          * Note: we don't count ControlFileData, it comes out of the "slop factor"
5036          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
5037          * routine again below to compute the actual allocation size.
5038          */
5039
5040         return size;
5041 }
5042
5043 void
5044 XLOGShmemInit(void)
5045 {
5046         bool            foundCFile,
5047                                 foundXLog;
5048         char       *allocptr;
5049         int                     i;
5050
5051         ControlFile = (ControlFileData *)
5052                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5053         XLogCtl = (XLogCtlData *)
5054                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
5055
5056         if (foundCFile || foundXLog)
5057         {
5058                 /* both should be present or neither */
5059                 Assert(foundCFile && foundXLog);
5060                 return;
5061         }
5062         memset(XLogCtl, 0, sizeof(XLogCtlData));
5063
5064         /*
5065          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5066          * multiple of the alignment for same, so no extra alignment padding is
5067          * needed here.
5068          */
5069         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5070         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
5071         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
5072         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
5073
5074         /* Xlog insertion slots. Ensure they're aligned to the full padded size */
5075         allocptr += sizeof(XLogInsertSlotPadded) -
5076                 ((uintptr_t) allocptr) % sizeof(XLogInsertSlotPadded);
5077         XLogCtl->Insert.insertSlots = (XLogInsertSlotPadded *) allocptr;
5078         allocptr += sizeof(XLogInsertSlotPadded) * num_xloginsert_slots;
5079
5080         /*
5081          * Align the start of the page buffers to a full xlog block size boundary.
5082          * This simplifies some calculations in XLOG insertion. It is also required
5083          * for O_DIRECT.
5084          */
5085         allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
5086         XLogCtl->pages = allocptr;
5087         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5088
5089         /*
5090          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5091          * in additional info.)
5092          */
5093         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5094         XLogCtl->SharedRecoveryInProgress = true;
5095         XLogCtl->SharedHotStandbyActive = false;
5096         XLogCtl->WalWriterSleeping = false;
5097
5098         for (i = 0; i < num_xloginsert_slots; i++)
5099         {
5100                 XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
5101                 SpinLockInit(&slot->mutex);
5102                 slot->xlogInsertingAt = InvalidXLogRecPtr;
5103                 slot->owner = NULL;
5104
5105                 slot->releaseOK = true;
5106                 slot->exclusive = 0;
5107                 slot->head = NULL;
5108                 slot->tail = NULL;
5109         }
5110
5111         SpinLockInit(&XLogCtl->Insert.insertpos_lck);
5112         SpinLockInit(&XLogCtl->info_lck);
5113         SpinLockInit(&XLogCtl->ulsn_lck);
5114         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
5115
5116         /*
5117          * If we are not in bootstrap mode, pg_control should already exist. Read
5118          * and validate it immediately (see comments in ReadControlFile() for the
5119          * reasons why).
5120          */
5121         if (!IsBootstrapProcessingMode())
5122                 ReadControlFile();
5123 }
5124
5125 /*
5126  * This func must be called ONCE on system install.  It creates pg_control
5127  * and the initial XLOG segment.
5128  */
5129 void
5130 BootStrapXLOG(void)
5131 {
5132         CheckPoint      checkPoint;
5133         char       *buffer;
5134         XLogPageHeader page;
5135         XLogLongPageHeader longpage;
5136         XLogRecord *record;
5137         bool            use_existent;
5138         uint64          sysidentifier;
5139         struct timeval tv;
5140         pg_crc32        crc;
5141
5142         /*
5143          * Select a hopefully-unique system identifier code for this installation.
5144          * We use the result of gettimeofday(), including the fractional seconds
5145          * field, as being about as unique as we can easily get.  (Think not to
5146          * use random(), since it hasn't been seeded and there's no portable way
5147          * to seed it other than the system clock value...)  The upper half of the
5148          * uint64 value is just the tv_sec part, while the lower half is the XOR
5149          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
5150          * unnecessarily if "uint64" is really only 32 bits wide.  A person
5151          * knowing this encoding can determine the initialization time of the
5152          * installation, which could perhaps be useful sometimes.
5153          */
5154         gettimeofday(&tv, NULL);
5155         sysidentifier = ((uint64) tv.tv_sec) << 32;
5156         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
5157
5158         /* First timeline ID is always 1 */
5159         ThisTimeLineID = 1;
5160
5161         /* page buffer must be aligned suitably for O_DIRECT */
5162         buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5163         page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5164         memset(page, 0, XLOG_BLCKSZ);
5165
5166         /*
5167          * Set up information for the initial checkpoint record
5168          *
5169          * The initial checkpoint record is written to the beginning of the WAL
5170          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5171          * used, so that we can use 0/0 to mean "before any valid WAL segment".
5172          */
5173         checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
5174         checkPoint.ThisTimeLineID = ThisTimeLineID;
5175         checkPoint.PrevTimeLineID = ThisTimeLineID;
5176         checkPoint.fullPageWrites = fullPageWrites;
5177         checkPoint.nextXidEpoch = 0;
5178         checkPoint.nextXid = FirstNormalTransactionId;
5179         checkPoint.nextOid = FirstBootstrapObjectId;
5180         checkPoint.nextMulti = FirstMultiXactId;
5181         checkPoint.nextMultiOffset = 0;
5182         checkPoint.oldestXid = FirstNormalTransactionId;
5183         checkPoint.oldestXidDB = TemplateDbOid;
5184         checkPoint.oldestMulti = FirstMultiXactId;
5185         checkPoint.oldestMultiDB = TemplateDbOid;
5186         checkPoint.time = (pg_time_t) time(NULL);
5187         checkPoint.oldestActiveXid = InvalidTransactionId;
5188
5189         ShmemVariableCache->nextXid = checkPoint.nextXid;
5190         ShmemVariableCache->nextOid = checkPoint.nextOid;
5191         ShmemVariableCache->oidCount = 0;
5192         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5193         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5194         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
5195
5196         /* Set up the XLOG page header */
5197         page->xlp_magic = XLOG_PAGE_MAGIC;
5198         page->xlp_info = XLP_LONG_HEADER;
5199         page->xlp_tli = ThisTimeLineID;
5200         page->xlp_pageaddr = XLogSegSize;
5201         longpage = (XLogLongPageHeader) page;
5202         longpage->xlp_sysid = sysidentifier;
5203         longpage->xlp_seg_size = XLogSegSize;
5204         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5205
5206         /* Insert the initial checkpoint record */
5207         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
5208         record->xl_prev = 0;
5209         record->xl_xid = InvalidTransactionId;
5210         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
5211         record->xl_len = sizeof(checkPoint);
5212         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5213         record->xl_rmid = RM_XLOG_ID;
5214         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
5215
5216         INIT_CRC32(crc);
5217         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
5218         COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5219         FIN_CRC32(crc);
5220         record->xl_crc = crc;
5221
5222         /* Create first XLOG segment file */
5223         use_existent = false;
5224         openLogFile = XLogFileInit(1, &use_existent, false);
5225
5226         /* Write the first page with the initial record */
5227         errno = 0;
5228         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5229         {
5230                 /* if write didn't set errno, assume problem is no disk space */
5231                 if (errno == 0)
5232                         errno = ENOSPC;
5233                 ereport(PANIC,
5234                                 (errcode_for_file_access(),
5235                           errmsg("could not write bootstrap transaction log file: %m")));
5236         }
5237
5238         if (pg_fsync(openLogFile) != 0)
5239                 ereport(PANIC,
5240                                 (errcode_for_file_access(),
5241                           errmsg("could not fsync bootstrap transaction log file: %m")));
5242
5243         if (close(openLogFile))
5244                 ereport(PANIC,
5245                                 (errcode_for_file_access(),
5246                           errmsg("could not close bootstrap transaction log file: %m")));
5247
5248         openLogFile = -1;
5249
5250         /* Now create pg_control */
5251
5252         memset(ControlFile, 0, sizeof(ControlFileData));
5253         /* Initialize pg_control status fields */
5254         ControlFile->system_identifier = sysidentifier;
5255         ControlFile->state = DB_SHUTDOWNED;
5256         ControlFile->time = checkPoint.time;
5257         ControlFile->checkPoint = checkPoint.redo;
5258         ControlFile->checkPointCopy = checkPoint;
5259         ControlFile->unloggedLSN = 1;
5260
5261         /* Set important parameter values for use when replaying WAL */
5262         ControlFile->MaxConnections = MaxConnections;
5263         ControlFile->max_worker_processes = max_worker_processes;
5264         ControlFile->max_prepared_xacts = max_prepared_xacts;
5265         ControlFile->max_locks_per_xact = max_locks_per_xact;
5266         ControlFile->wal_level = wal_level;
5267         ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5268
5269         /* some additional ControlFile fields are set in WriteControlFile() */
5270
5271         WriteControlFile();
5272
5273         /* Bootstrap the commit log, too */
5274         BootStrapCLOG();
5275         BootStrapSUBTRANS();
5276         BootStrapMultiXact();
5277
5278         pfree(buffer);
5279 }
5280
5281 static char *
5282 str_time(pg_time_t tnow)
5283 {
5284         static char buf[128];
5285
5286         pg_strftime(buf, sizeof(buf),
5287                                 "%Y-%m-%d %H:%M:%S %Z",
5288                                 pg_localtime(&tnow, log_timezone));
5289
5290         return buf;
5291 }
5292
5293 /*
5294  * See if there is a recovery command file (recovery.conf), and if so
5295  * read in parameters for archive recovery and XLOG streaming.
5296  *
5297  * The file is parsed using the main configuration parser.
5298  */
5299 static void
5300 readRecoveryCommandFile(void)
5301 {
5302         FILE       *fd;
5303         TimeLineID      rtli = 0;
5304         bool            rtliGiven = false;
5305         ConfigVariable *item,
5306                            *head = NULL,
5307                            *tail = NULL;
5308
5309         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5310         if (fd == NULL)
5311         {
5312                 if (errno == ENOENT)
5313                         return;                         /* not there, so no archive recovery */
5314                 ereport(FATAL,
5315                                 (errcode_for_file_access(),
5316                                  errmsg("could not open recovery command file \"%s\": %m",
5317                                                 RECOVERY_COMMAND_FILE)));
5318         }
5319
5320         /*
5321          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5322          * no need to check the return value.
5323          */
5324         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5325
5326         FreeFile(fd);
5327
5328         for (item = head; item; item = item->next)
5329         {
5330                 if (strcmp(item->name, "restore_command") == 0)
5331                 {
5332                         recoveryRestoreCommand = pstrdup(item->value);
5333                         ereport(DEBUG2,
5334                                         (errmsg_internal("restore_command = '%s'",
5335                                                                          recoveryRestoreCommand)));
5336                 }
5337                 else if (strcmp(item->name, "recovery_end_command") == 0)
5338                 {
5339                         recoveryEndCommand = pstrdup(item->value);
5340                         ereport(DEBUG2,
5341                                         (errmsg_internal("recovery_end_command = '%s'",
5342                                                                          recoveryEndCommand)));
5343                 }
5344                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5345                 {
5346                         archiveCleanupCommand = pstrdup(item->value);
5347                         ereport(DEBUG2,
5348                                         (errmsg_internal("archive_cleanup_command = '%s'",
5349                                                                          archiveCleanupCommand)));
5350                 }
5351                 else if (strcmp(item->name, "pause_at_recovery_target") == 0)
5352                 {
5353                         if (!parse_bool(item->value, &recoveryPauseAtTarget))
5354                                 ereport(ERROR,
5355                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5356                                                  errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
5357                         ereport(DEBUG2,
5358                                         (errmsg_internal("pause_at_recovery_target = '%s'",
5359                                                                          item->value)));
5360                 }
5361                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5362                 {
5363                         rtliGiven = true;
5364                         if (strcmp(item->value, "latest") == 0)
5365                                 rtli = 0;
5366                         else
5367                         {
5368                                 errno = 0;
5369                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5370                                 if (errno == EINVAL || errno == ERANGE)
5371                                         ereport(FATAL,
5372                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5373                                                                         item->value)));
5374                         }
5375                         if (rtli)
5376                                 ereport(DEBUG2,
5377                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
5378                         else
5379                                 ereport(DEBUG2,
5380                                          (errmsg_internal("recovery_target_timeline = latest")));
5381                 }
5382                 else if (strcmp(item->name, "recovery_target_xid") == 0)
5383                 {
5384                         errno = 0;
5385                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5386                         if (errno == EINVAL || errno == ERANGE)
5387                                 ereport(FATAL,
5388                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
5389                                                  item->value)));
5390                         ereport(DEBUG2,
5391                                         (errmsg_internal("recovery_target_xid = %u",
5392                                                                          recoveryTargetXid)));
5393                         recoveryTarget = RECOVERY_TARGET_XID;
5394                 }
5395                 else if (strcmp(item->name, "recovery_target_time") == 0)
5396                 {
5397                         /*
5398                          * if recovery_target_xid or recovery_target_name specified, then
5399                          * this overrides recovery_target_time
5400                          */
5401                         if (recoveryTarget == RECOVERY_TARGET_XID ||
5402                                 recoveryTarget == RECOVERY_TARGET_NAME)
5403                                 continue;
5404                         recoveryTarget = RECOVERY_TARGET_TIME;
5405
5406                         /*
5407                          * Convert the time string given by the user to TimestampTz form.
5408                          */
5409                         recoveryTargetTime =
5410                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5411                                                                                                 CStringGetDatum(item->value),
5412                                                                                                 ObjectIdGetDatum(InvalidOid),
5413                                                                                                                 Int32GetDatum(-1)));
5414                         ereport(DEBUG2,
5415                                         (errmsg_internal("recovery_target_time = '%s'",
5416                                                                    timestamptz_to_str(recoveryTargetTime))));
5417                 }
5418                 else if (strcmp(item->name, "recovery_target_name") == 0)
5419                 {
5420                         /*
5421                          * if recovery_target_xid specified, then this overrides
5422                          * recovery_target_name
5423                          */
5424                         if (recoveryTarget == RECOVERY_TARGET_XID)
5425                                 continue;
5426                         recoveryTarget = RECOVERY_TARGET_NAME;
5427
5428                         recoveryTargetName = pstrdup(item->value);
5429                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5430                                 ereport(FATAL,
5431                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5432                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
5433                                                                 MAXFNAMELEN - 1)));
5434
5435                         ereport(DEBUG2,
5436                                         (errmsg_internal("recovery_target_name = '%s'",
5437                                                                          recoveryTargetName)));
5438                 }
5439                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5440                 {
5441                         /*
5442                          * does nothing if a recovery_target is not also set
5443                          */
5444                         if (!parse_bool(item->value, &recoveryTargetInclusive))
5445                                 ereport(ERROR,
5446                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5447                                                  errmsg("parameter \"%s\" requires a Boolean value",
5448                                                                 "recovery_target_inclusive")));
5449                         ereport(DEBUG2,
5450                                         (errmsg_internal("recovery_target_inclusive = %s",
5451                                                                          item->value)));
5452                 }
5453                 else if (strcmp(item->name, "standby_mode") == 0)
5454                 {
5455                         if (!parse_bool(item->value, &StandbyModeRequested))
5456                                 ereport(ERROR,
5457                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5458                                                  errmsg("parameter \"%s\" requires a Boolean value",
5459                                                                 "standby_mode")));
5460                         ereport(DEBUG2,
5461                                         (errmsg_internal("standby_mode = '%s'", item->value)));
5462                 }
5463                 else if (strcmp(item->name, "primary_conninfo") == 0)
5464                 {
5465                         PrimaryConnInfo = pstrdup(item->value);
5466                         ereport(DEBUG2,
5467                                         (errmsg_internal("primary_conninfo = '%s'",
5468                                                                          PrimaryConnInfo)));
5469                 }
5470                 else if (strcmp(item->name, "trigger_file") == 0)
5471                 {
5472                         TriggerFile = pstrdup(item->value);
5473                         ereport(DEBUG2,
5474                                         (errmsg_internal("trigger_file = '%s'",
5475                                                                          TriggerFile)));
5476                 }
5477                 else
5478                         ereport(FATAL,
5479                                         (errmsg("unrecognized recovery parameter \"%s\"",
5480                                                         item->name)));
5481         }
5482
5483         /*
5484          * Check for compulsory parameters
5485          */
5486         if (StandbyModeRequested)
5487         {
5488                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5489                         ereport(WARNING,
5490                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5491                                                         RECOVERY_COMMAND_FILE),
5492                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
5493         }
5494         else
5495         {
5496                 if (recoveryRestoreCommand == NULL)
5497                         ereport(FATAL,
5498                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5499                                                         RECOVERY_COMMAND_FILE)));
5500         }
5501
5502         /* Enable fetching from archive recovery area */
5503         ArchiveRecoveryRequested = true;
5504
5505         /*
5506          * If user specified recovery_target_timeline, validate it or compute the
5507          * "latest" value.      We can't do this until after we've gotten the restore
5508          * command and set InArchiveRecovery, because we need to fetch timeline
5509          * history files from the archive.
5510          */
5511         if (rtliGiven)
5512         {
5513                 if (rtli)
5514                 {
5515                         /* Timeline 1 does not have a history file, all else should */
5516                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5517                                 ereport(FATAL,
5518                                                 (errmsg("recovery target timeline %u does not exist",
5519                                                                 rtli)));
5520                         recoveryTargetTLI = rtli;
5521                         recoveryTargetIsLatest = false;
5522                 }
5523                 else
5524                 {
5525                         /* We start the "latest" search from pg_control's timeline */
5526                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5527                         recoveryTargetIsLatest = true;
5528                 }
5529         }
5530
5531         FreeConfigVariables(head);
5532 }
5533
5534 /*
5535  * Exit archive-recovery state
5536  */
5537 static void
5538 exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
5539 {
5540         char            recoveryPath[MAXPGPATH];
5541         char            xlogpath[MAXPGPATH];
5542
5543         /*
5544          * We are no longer in archive recovery state.
5545          */
5546         InArchiveRecovery = false;
5547
5548         /*
5549          * Update min recovery point one last time.
5550          */
5551         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5552
5553         /*
5554          * If the ending log segment is still open, close it (to avoid problems on
5555          * Windows with trying to rename or delete an open file).
5556          */
5557         if (readFile >= 0)
5558         {
5559                 close(readFile);
5560                 readFile = -1;
5561         }
5562
5563         /*
5564          * If we are establishing a new timeline, we have to copy data from the
5565          * last WAL segment of the old timeline to create a starting WAL segment
5566          * for the new timeline.
5567          *
5568          * Notify the archiver that the last WAL segment of the old timeline is
5569          * ready to copy to archival storage. Otherwise, it is not archived for a
5570          * while.
5571          */
5572         if (endTLI != ThisTimeLineID)
5573         {
5574                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo);
5575
5576                 if (XLogArchivingActive())
5577                 {
5578                         XLogFileName(xlogpath, endTLI, endLogSegNo);
5579                         XLogArchiveNotify(xlogpath);
5580                 }
5581         }
5582
5583         /*
5584          * Let's just make real sure there are not .ready or .done flags posted
5585          * for the new segment.
5586          */
5587         XLogFileName(xlogpath, ThisTimeLineID, endLogSegNo);
5588         XLogArchiveCleanup(xlogpath);
5589
5590         /*
5591          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5592          * of it.
5593          */
5594         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5595         unlink(recoveryPath);           /* ignore any error */
5596
5597         /* Get rid of any remaining recovered timeline-history file, too */
5598         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5599         unlink(recoveryPath);           /* ignore any error */
5600
5601         /*
5602          * Rename the config file out of the way, so that we don't accidentally
5603          * re-enter archive recovery mode in a subsequent crash.
5604          */
5605         unlink(RECOVERY_COMMAND_DONE);
5606         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5607                 ereport(FATAL,
5608                                 (errcode_for_file_access(),
5609                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5610                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5611
5612         ereport(LOG,
5613                         (errmsg("archive recovery complete")));
5614 }
5615
5616 /*
5617  * For point-in-time recovery, this function decides whether we want to
5618  * stop applying the XLOG at or after the current record.
5619  *
5620  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
5621  * *includeThis is set TRUE if we should apply this record before stopping.
5622  *
5623  * We also track the timestamp of the latest applied COMMIT/ABORT
5624  * record in XLogCtl->recoveryLastXTime, for logging purposes.
5625  * Also, some information is saved in recoveryStopXid et al for use in
5626  * annotating the new timeline's history file.
5627  */
5628 static bool
5629 recoveryStopsHere(XLogRecord *record, bool *includeThis)
5630 {
5631         bool            stopsHere;
5632         uint8           record_info;
5633         TimestampTz recordXtime;
5634         char            recordRPName[MAXFNAMELEN];
5635
5636         /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
5637         if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
5638                 return false;
5639         record_info = record->xl_info & ~XLR_INFO_MASK;
5640         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
5641         {
5642                 xl_xact_commit_compact *recordXactCommitData;
5643
5644                 recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
5645                 recordXtime = recordXactCommitData->xact_time;
5646         }
5647         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
5648         {
5649                 xl_xact_commit *recordXactCommitData;
5650
5651                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
5652                 recordXtime = recordXactCommitData->xact_time;
5653         }
5654         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
5655         {
5656                 xl_xact_abort *recordXactAbortData;
5657
5658                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
5659                 recordXtime = recordXactAbortData->xact_time;
5660         }
5661         else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5662         {
5663                 xl_restore_point *recordRestorePointData;
5664
5665                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5666                 recordXtime = recordRestorePointData->rp_time;
5667                 strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
5668         }
5669         else
5670                 return false;
5671
5672         /* Do we have a PITR target at all? */
5673         if (recoveryTarget == RECOVERY_TARGET_UNSET)
5674         {
5675                 /*
5676                  * Save timestamp of latest transaction commit/abort if this is a
5677                  * transaction record
5678                  */
5679                 if (record->xl_rmid == RM_XACT_ID)
5680                         SetLatestXTime(recordXtime);
5681                 return false;
5682         }
5683
5684         if (recoveryTarget == RECOVERY_TARGET_XID)
5685         {
5686                 /*
5687                  * There can be only one transaction end record with this exact
5688                  * transactionid
5689                  *
5690                  * when testing for an xid, we MUST test for equality only, since
5691                  * transactions are numbered in the order they start, not the order
5692                  * they complete. A higher numbered xid will complete before you about
5693                  * 50% of the time...
5694                  */
5695                 stopsHere = (record->xl_xid == recoveryTargetXid);
5696                 if (stopsHere)
5697                         *includeThis = recoveryTargetInclusive;
5698         }
5699         else if (recoveryTarget == RECOVERY_TARGET_NAME)
5700         {
5701                 /*
5702                  * There can be many restore points that share the same name, so we
5703                  * stop at the first one
5704                  */
5705                 stopsHere = (strcmp(recordRPName, recoveryTargetName) == 0);
5706
5707                 /*
5708                  * Ignore recoveryTargetInclusive because this is not a transaction
5709                  * record
5710                  */
5711                 *includeThis = false;
5712         }
5713         else
5714         {
5715                 /*
5716                  * There can be many transactions that share the same commit time, so
5717                  * we stop after the last one, if we are inclusive, or stop at the
5718                  * first one if we are exclusive
5719                  */
5720                 if (recoveryTargetInclusive)
5721                         stopsHere = (recordXtime > recoveryTargetTime);
5722                 else
5723                         stopsHere = (recordXtime >= recoveryTargetTime);
5724                 if (stopsHere)
5725                         *includeThis = false;
5726         }
5727
5728         if (stopsHere)
5729         {
5730                 recoveryStopXid = record->xl_xid;
5731                 recoveryStopTime = recordXtime;
5732                 recoveryStopAfter = *includeThis;
5733
5734                 if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5735                 {
5736                         if (recoveryStopAfter)
5737                                 ereport(LOG,
5738                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5739                                                                 recoveryStopXid,
5740                                                                 timestamptz_to_str(recoveryStopTime))));
5741                         else
5742                                 ereport(LOG,
5743                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
5744                                                                 recoveryStopXid,
5745                                                                 timestamptz_to_str(recoveryStopTime))));
5746                 }
5747                 else if (record_info == XLOG_XACT_ABORT)
5748                 {
5749                         if (recoveryStopAfter)
5750                                 ereport(LOG,
5751                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5752                                                                 recoveryStopXid,
5753                                                                 timestamptz_to_str(recoveryStopTime))));
5754                         else
5755                                 ereport(LOG,
5756                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
5757                                                                 recoveryStopXid,
5758                                                                 timestamptz_to_str(recoveryStopTime))));
5759                 }
5760                 else
5761                 {
5762                         strncpy(recoveryStopName, recordRPName, MAXFNAMELEN);
5763
5764                         ereport(LOG,
5765                                 (errmsg("recovery stopping at restore point \"%s\", time %s",
5766                                                 recoveryStopName,
5767                                                 timestamptz_to_str(recoveryStopTime))));
5768                 }
5769
5770                 /*
5771                  * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
5772                  * restore point since they are timestamped, though the latest
5773                  * transaction time is not updated.
5774                  */
5775                 if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
5776                         SetLatestXTime(recordXtime);
5777         }
5778         else if (record->xl_rmid == RM_XACT_ID)
5779                 SetLatestXTime(recordXtime);
5780
5781         return stopsHere;
5782 }
5783
5784 /*
5785  * Wait until shared recoveryPause flag is cleared.
5786  *
5787  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5788  * Probably not worth the trouble though.  This state shouldn't be one that
5789  * anyone cares about server power consumption in.
5790  */
5791 static void
5792 recoveryPausesHere(void)
5793 {
5794         /* Don't pause unless users can connect! */
5795         if (!LocalHotStandbyActive)
5796                 return;
5797
5798         ereport(LOG,
5799                         (errmsg("recovery has paused"),
5800                          errhint("Execute pg_xlog_replay_resume() to continue.")));
5801
5802         while (RecoveryIsPaused())
5803         {
5804                 pg_usleep(1000000L);    /* 1000 ms */
5805                 HandleStartupProcInterrupts();
5806         }
5807 }
5808
5809 bool
5810 RecoveryIsPaused(void)
5811 {
5812         /* use volatile pointer to prevent code rearrangement */
5813         volatile XLogCtlData *xlogctl = XLogCtl;
5814         bool            recoveryPause;
5815
5816         SpinLockAcquire(&xlogctl->info_lck);
5817         recoveryPause = xlogctl->recoveryPause;
5818         SpinLockRelease(&xlogctl->info_lck);
5819
5820         return recoveryPause;
5821 }
5822
5823 void
5824 SetRecoveryPause(bool recoveryPause)
5825 {
5826         /* use volatile pointer to prevent code rearrangement */
5827         volatile XLogCtlData *xlogctl = XLogCtl;
5828
5829         SpinLockAcquire(&xlogctl->info_lck);
5830         xlogctl->recoveryPause = recoveryPause;
5831         SpinLockRelease(&xlogctl->info_lck);
5832 }
5833
5834 /*
5835  * Save timestamp of latest processed commit/abort record.
5836  *
5837  * We keep this in XLogCtl, not a simple static variable, so that it can be
5838  * seen by processes other than the startup process.  Note in particular
5839  * that CreateRestartPoint is executed in the checkpointer.
5840  */
5841 static void
5842 SetLatestXTime(TimestampTz xtime)
5843 {
5844         /* use volatile pointer to prevent code rearrangement */
5845         volatile XLogCtlData *xlogctl = XLogCtl;
5846
5847         SpinLockAcquire(&xlogctl->info_lck);
5848         xlogctl->recoveryLastXTime = xtime;
5849         SpinLockRelease(&xlogctl->info_lck);
5850 }
5851
5852 /*
5853  * Fetch timestamp of latest processed commit/abort record.
5854  */
5855 TimestampTz
5856 GetLatestXTime(void)
5857 {
5858         /* use volatile pointer to prevent code rearrangement */
5859         volatile XLogCtlData *xlogctl = XLogCtl;
5860         TimestampTz xtime;
5861
5862         SpinLockAcquire(&xlogctl->info_lck);
5863         xtime = xlogctl->recoveryLastXTime;
5864         SpinLockRelease(&xlogctl->info_lck);
5865
5866         return xtime;
5867 }
5868
5869 /*
5870  * Save timestamp of the next chunk of WAL records to apply.
5871  *
5872  * We keep this in XLogCtl, not a simple static variable, so that it can be
5873  * seen by all backends.
5874  */
5875 static void
5876 SetCurrentChunkStartTime(TimestampTz xtime)
5877 {
5878         /* use volatile pointer to prevent code rearrangement */
5879         volatile XLogCtlData *xlogctl = XLogCtl;
5880
5881         SpinLockAcquire(&xlogctl->info_lck);
5882         xlogctl->currentChunkStartTime = xtime;
5883         SpinLockRelease(&xlogctl->info_lck);
5884 }
5885
5886 /*
5887  * Fetch timestamp of latest processed commit/abort record.
5888  * Startup process maintains an accurate local copy in XLogReceiptTime
5889  */
5890 TimestampTz
5891 GetCurrentChunkReplayStartTime(void)
5892 {
5893         /* use volatile pointer to prevent code rearrangement */
5894         volatile XLogCtlData *xlogctl = XLogCtl;
5895         TimestampTz xtime;
5896
5897         SpinLockAcquire(&xlogctl->info_lck);
5898         xtime = xlogctl->currentChunkStartTime;
5899         SpinLockRelease(&xlogctl->info_lck);
5900
5901         return xtime;
5902 }
5903
5904 /*
5905  * Returns time of receipt of current chunk of XLOG data, as well as
5906  * whether it was received from streaming replication or from archives.
5907  */
5908 void
5909 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
5910 {
5911         /*
5912          * This must be executed in the startup process, since we don't export the
5913          * relevant state to shared memory.
5914          */
5915         Assert(InRecovery);
5916
5917         *rtime = XLogReceiptTime;
5918         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
5919 }
5920
/*
 * Raise an ERROR when the integer setting currValue is lower than minValue,
 * the value the master server used.
 *
 * param_name is the name of the setting; being a parameter name, it does not
 * require translation.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((minValue) > (currValue)) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						(param_name), \
						(currValue), \
						(minValue)))); \
} while (0)
5937
5938 /*
5939  * Check to see if required parameters are set high enough on this server
5940  * for various aspects of recovery operation.
5941  */
5942 static void
5943 CheckRequiredParameterValues(void)
5944 {
5945         /*
5946          * For archive recovery, the WAL must be generated with at least 'archive'
5947          * wal_level.
5948          */
5949         if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
5950         {
5951                 ereport(WARNING,
5952                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
5953                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
5954         }
5955
5956         /*
5957          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
5958          * we must have at least as many backend slots as the primary.
5959          */
5960         if (InArchiveRecovery && EnableHotStandby)
5961         {
5962                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
5963                         ereport(ERROR,
5964                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
5965                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
5966
5967                 /* We ignore autovacuum_max_workers when we make this test. */
5968                 RecoveryRequiresIntParameter("max_connections",
5969                                                                          MaxConnections,
5970                                                                          ControlFile->MaxConnections);
5971                 RecoveryRequiresIntParameter("max_worker_processes",
5972                                                                          max_worker_processes,
5973                                                                          ControlFile->max_worker_processes);
5974                 RecoveryRequiresIntParameter("max_prepared_transactions",
5975                                                                          max_prepared_xacts,
5976                                                                          ControlFile->max_prepared_xacts);
5977                 RecoveryRequiresIntParameter("max_locks_per_transaction",
5978                                                                          max_locks_per_xact,
5979                                                                          ControlFile->max_locks_per_xact);
5980         }
5981 }
5982
5983 /*
5984  * This must be called ONCE during postmaster or standalone-backend startup
5985  */
5986 void
5987 StartupXLOG(void)
5988 {
5989         XLogCtlInsert *Insert;
5990         CheckPoint      checkPoint;
5991         bool            wasShutdown;
5992         bool            reachedStopPoint = false;
5993         bool            haveBackupLabel = false;
5994         XLogRecPtr      RecPtr,
5995                                 checkPointLoc,
5996                                 EndOfLog;
5997         XLogSegNo       endLogSegNo;
5998         TimeLineID      PrevTimeLineID;
5999         XLogRecord *record;
6000         TransactionId oldestActiveXID;
6001         bool            backupEndRequired = false;
6002         bool            backupFromStandby = false;
6003         DBState         dbstate_at_startup;
6004         XLogReaderState *xlogreader;
6005         XLogPageReadPrivate private;
6006         bool            fast_promoted = false;
6007
6008         /*
6009          * Read control file and check XLOG status looks valid.
6010          *
6011          * Note: in most control paths, *ControlFile is already valid and we need
6012          * not do ReadControlFile() here, but might as well do it to be sure.
6013          */
6014         ReadControlFile();
6015
6016         if (ControlFile->state < DB_SHUTDOWNED ||
6017                 ControlFile->state > DB_IN_PRODUCTION ||
6018                 !XRecOffIsValid(ControlFile->checkPoint))
6019                 ereport(FATAL,
6020                                 (errmsg("control file contains invalid data")));
6021
6022         if (ControlFile->state == DB_SHUTDOWNED)
6023         {
6024                 /* This is the expected case, so don't be chatty in standalone mode */
6025                 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6026                                 (errmsg("database system was shut down at %s",
6027                                                 str_time(ControlFile->time))));
6028         }
6029         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6030                 ereport(LOG,
6031                                 (errmsg("database system was shut down in recovery at %s",
6032                                                 str_time(ControlFile->time))));
6033         else if (ControlFile->state == DB_SHUTDOWNING)
6034                 ereport(LOG,
6035                                 (errmsg("database system shutdown was interrupted; last known up at %s",
6036                                                 str_time(ControlFile->time))));
6037         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6038                 ereport(LOG,
6039                    (errmsg("database system was interrupted while in recovery at %s",
6040                                    str_time(ControlFile->time)),
6041                         errhint("This probably means that some data is corrupted and"
6042                                         " you will have to use the last backup for recovery.")));
6043         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6044                 ereport(LOG,
6045                                 (errmsg("database system was interrupted while in recovery at log time %s",
6046                                                 str_time(ControlFile->checkPointCopy.time)),
6047                                  errhint("If this has occurred more than once some data might be corrupted"
6048                           " and you might need to choose an earlier recovery target.")));
6049         else if (ControlFile->state == DB_IN_PRODUCTION)
6050                 ereport(LOG,
6051                           (errmsg("database system was interrupted; last known up at %s",
6052                                           str_time(ControlFile->time))));
6053
6054         /* This is just to allow attaching to startup process with a debugger */
6055 #ifdef XLOG_REPLAY_DELAY
6056         if (ControlFile->state != DB_SHUTDOWNED)
6057                 pg_usleep(60000000L);
6058 #endif
6059
6060         /*
6061          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
6062          * someone has performed a copy for PITR, these directories may have been
6063          * excluded and need to be re-created.
6064          */
6065         ValidateXLOGDirectoryStructure();
6066
6067         /*
6068          * Clear out any old relcache cache files.      This is *necessary* if we do
6069          * any WAL replay, since that would probably result in the cache files
6070          * being out of sync with database reality.  In theory we could leave them
6071          * in place if the database had been cleanly shut down, but it seems
6072          * safest to just remove them always and let them be rebuilt during the
6073          * first backend startup.
6074          */
6075         RelationCacheInitFileRemove();
6076
6077         /*
6078          * Initialize on the assumption we want to recover to the latest timeline
6079          * that's active according to pg_control.
6080          */
6081         if (ControlFile->minRecoveryPointTLI >
6082                 ControlFile->checkPointCopy.ThisTimeLineID)
6083                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6084         else
6085                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6086
6087         /*
6088          * Check for recovery control file, and if so set up state for offline
6089          * recovery
6090          */
6091         readRecoveryCommandFile();
6092
6093         /*
6094          * Save archive_cleanup_command in shared memory so that other processes
6095          * can see it.
6096          */
6097         strncpy(XLogCtl->archiveCleanupCommand,
6098                         archiveCleanupCommand ? archiveCleanupCommand : "",
6099                         sizeof(XLogCtl->archiveCleanupCommand));
6100
6101         if (ArchiveRecoveryRequested)
6102         {
6103                 if (StandbyModeRequested)
6104                         ereport(LOG,
6105                                         (errmsg("entering standby mode")));
6106                 else if (recoveryTarget == RECOVERY_TARGET_XID)
6107                         ereport(LOG,
6108                                         (errmsg("starting point-in-time recovery to XID %u",
6109                                                         recoveryTargetXid)));
6110                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6111                         ereport(LOG,
6112                                         (errmsg("starting point-in-time recovery to %s",
6113                                                         timestamptz_to_str(recoveryTargetTime))));
6114                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6115                         ereport(LOG,
6116                                         (errmsg("starting point-in-time recovery to \"%s\"",
6117                                                         recoveryTargetName)));
6118                 else
6119                         ereport(LOG,
6120                                         (errmsg("starting archive recovery")));
6121         }
6122
6123         /*
6124          * Take ownership of the wakeup latch if we're going to sleep during
6125          * recovery.
6126          */
6127         if (StandbyModeRequested)
6128                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6129
6130         /* Set up XLOG reader facility */
6131         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6132         xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
6133         if (!xlogreader)
6134                 ereport(ERROR,
6135                                 (errcode(ERRCODE_OUT_OF_MEMORY),
6136                                  errmsg("out of memory"),
6137                         errdetail("Failed while allocating an XLog reading processor.")));
6138         xlogreader->system_identifier = ControlFile->system_identifier;
6139
6140         if (read_backup_label(&checkPointLoc, &backupEndRequired,
6141                                                   &backupFromStandby))
6142         {
6143                 /*
6144                  * Archive recovery was requested, and thanks to the backup label
6145                  * file, we know how far we need to replay to reach consistency. Enter
6146                  * archive recovery directly.
6147                  */
6148                 InArchiveRecovery = true;
6149                 if (StandbyModeRequested)
6150                         StandbyMode = true;
6151
6152                 /*
6153                  * When a backup_label file is present, we want to roll forward from
6154                  * the checkpoint it identifies, rather than using pg_control.
6155                  */
6156                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6157                 if (record != NULL)
6158                 {
6159                         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6160                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6161                         ereport(DEBUG1,
6162                                         (errmsg("checkpoint record is at %X/%X",
6163                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6164                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
6165
6166                         /*
6167                          * Make sure that REDO location exists. This may not be the case
6168                          * if there was a crash during an online backup, which left a
6169                          * backup_label around that references a WAL segment that's
6170                          * already been archived.
6171                          */
6172                         if (checkPoint.redo < checkPointLoc)
6173                         {
6174                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6175                                         ereport(FATAL,
6176                                                         (errmsg("could not find redo location referenced by checkpoint record"),
6177                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6178                         }
6179                 }
6180                 else
6181                 {
6182                         ereport(FATAL,
6183                                         (errmsg("could not locate required checkpoint record"),
6184                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6185                         wasShutdown = false;    /* keep compiler quiet */
6186                 }
6187                 /* set flag to delete it later */
6188                 haveBackupLabel = true;
6189         }
6190         else
6191         {
6192                 /*
6193                  * It's possible that archive recovery was requested, but we don't
6194                  * know how far we need to replay the WAL before we reach consistency.
6195                  * This can happen for example if a base backup is taken from a
6196                  * running server using an atomic filesystem snapshot, without calling
6197                  * pg_start/stop_backup. Or if you just kill a running master server
6198                  * and put it into archive recovery by creating a recovery.conf file.
6199                  *
6200                  * Our strategy in that case is to perform crash recovery first,
6201                  * replaying all the WAL present in pg_xlog, and only enter archive
6202                  * recovery after that.
6203                  *
6204                  * But usually we already know how far we need to replay the WAL (up
6205                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
6206                  * end-of-backup record), and we can enter archive recovery directly.
6207                  */
6208                 if (ArchiveRecoveryRequested &&
6209                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6210                          ControlFile->backupEndRequired ||
6211                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6212                          ControlFile->state == DB_SHUTDOWNED))
6213                 {
6214                         InArchiveRecovery = true;
6215                         if (StandbyModeRequested)
6216                                 StandbyMode = true;
6217                 }
6218
6219                 /*
6220                  * Get the last valid checkpoint record.  If the latest one according
6221                  * to pg_control is broken, try the next-to-last one.
6222                  */
6223                 checkPointLoc = ControlFile->checkPoint;
6224                 RedoStartLSN = ControlFile->checkPointCopy.redo;
6225                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6226                 if (record != NULL)
6227                 {
6228                         ereport(DEBUG1,
6229                                         (errmsg("checkpoint record is at %X/%X",
6230                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6231                 }
6232                 else if (StandbyMode)
6233                 {
6234                         /*
6235                          * The last valid checkpoint record required for a streaming
6236                          * recovery exists in neither the standby nor the primary.
6237                          */
6238                         ereport(PANIC,
6239                                         (errmsg("could not locate a valid checkpoint record")));
6240                 }
6241                 else
6242                 {
6243                         checkPointLoc = ControlFile->prevCheckPoint;
6244                         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
6245                         if (record != NULL)
6246                         {
6247                                 ereport(LOG,
6248                                                 (errmsg("using previous checkpoint record at %X/%X",
6249                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6250                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
6251                         }
6252                         else
6253                                 ereport(PANIC,
6254                                          (errmsg("could not locate a valid checkpoint record")));
6255                 }
6256                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6257                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6258         }
6259
6260         /*
6261          * If the location of the checkpoint record is not on the expected
6262          * timeline in the history of the requested timeline, we cannot proceed:
6263          * the backup is not part of the history of the requested timeline.
6264          */
6265         Assert(expectedTLEs);           /* was initialized by reading checkpoint
6266                                                                  * record */
6267         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6268                 checkPoint.ThisTimeLineID)
6269         {
6270                 XLogRecPtr      switchpoint;
6271
6272                 /*
6273                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
6274                  * not in expectedTLEs at all.
6275                  */
6276                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6277                 ereport(FATAL,
6278                                 (errmsg("requested timeline %u is not a child of this server's history",
6279                                                 recoveryTargetTLI),
6280                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6281                                                    (uint32) (ControlFile->checkPoint >> 32),
6282                                                    (uint32) ControlFile->checkPoint,
6283                                                    ControlFile->checkPointCopy.ThisTimeLineID,
6284                                                    (uint32) (switchpoint >> 32),
6285                                                    (uint32) switchpoint)));
6286         }
6287
6288         /*
6289          * The min recovery point should be part of the requested timeline's
6290          * history, too.
6291          */
6292         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6293           tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6294                 ControlFile->minRecoveryPointTLI)
6295                 ereport(FATAL,
6296                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6297                                                 recoveryTargetTLI,
6298                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
6299                                                 (uint32) ControlFile->minRecoveryPoint,
6300                                                 ControlFile->minRecoveryPointTLI)));
6301
6302         LastRec = RecPtr = checkPointLoc;
6303
6304         ereport(DEBUG1,
6305                         (errmsg("redo record is at %X/%X; shutdown %s",
6306                                   (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6307                                         wasShutdown ? "TRUE" : "FALSE")));
6308         ereport(DEBUG1,
6309                         (errmsg("next transaction ID: %u/%u; next OID: %u",
6310                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
6311                                         checkPoint.nextOid)));
6312         ereport(DEBUG1,
6313                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
6314                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6315         ereport(DEBUG1,
6316                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
6317                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
6318         ereport(DEBUG1,
6319                         (errmsg("oldest MultiXactId: %u, in database %u",
6320                                         checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6321         if (!TransactionIdIsNormal(checkPoint.nextXid))
6322                 ereport(PANIC,
6323                                 (errmsg("invalid next transaction ID")));
6324
6325         /* initialize shared memory variables from the checkpoint record */
6326         ShmemVariableCache->nextXid = checkPoint.nextXid;
6327         ShmemVariableCache->nextOid = checkPoint.nextOid;
6328         ShmemVariableCache->oidCount = 0;
6329         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6330         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6331         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
6332         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6333         XLogCtl->ckptXid = checkPoint.nextXid;
6334
6335         /*
6336          * Startup MultiXact.  We need to do this early for two reasons: one
6337          * is that we might try to access multixacts when we do tuple freezing,
6338          * and the other is we need its state initialized because we attempt
6339          * truncation during restartpoints.
6340          */
6341         StartupMultiXact();
6342
6343         /*
6344          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6345          * control file. On recovery, all unlogged relations are blown away, so
6346          * the unlogged LSN counter can be reset too.
6347          */
6348         if (ControlFile->state == DB_SHUTDOWNED)
6349                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6350         else
6351                 XLogCtl->unloggedLSN = 1;
6352
6353         /*
6354          * We must replay WAL entries using the same TimeLineID they were created
6355          * under, so temporarily adopt the TLI indicated by the checkpoint (see
6356          * also xlog_redo()).
6357          */
6358         ThisTimeLineID = checkPoint.ThisTimeLineID;
6359
6360         /*
6361          * Copy any missing timeline history files between 'now' and the recovery
6362          * target timeline from archive to pg_xlog. While we don't need those
6363          * files ourselves - the history file of the recovery target timeline
6364          * covers all the previous timelines in the history too - a cascading
6365          * standby server might be interested in them. Or, if you archive the WAL
6366          * from this server to a different archive than the master, it'd be good
6367          * for all the history files to get archived there after failover, so that
6368          * you can use one of the old timelines as a PITR target. Timeline history
6369          * files are small, so it's better to copy them unnecessarily than not
6370          * copy them and regret it later.
6371          */
6372         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6373
6374         lastFullPageWrites = checkPoint.fullPageWrites;
6375
6376         RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6377
6378         if (RecPtr < checkPoint.redo)
6379                 ereport(PANIC,
6380                                 (errmsg("invalid redo in checkpoint record")));
6381
6382         /*
6383          * Check whether we need to force recovery from WAL.  If it appears to
6384          * have been a clean shutdown and we did not have a recovery.conf file,
6385          * then assume no recovery needed.
6386          */
6387         if (checkPoint.redo < RecPtr)
6388         {
6389                 if (wasShutdown)
6390                         ereport(PANIC,
6391                                         (errmsg("invalid redo record in shutdown checkpoint")));
6392                 InRecovery = true;
6393         }
6394         else if (ControlFile->state != DB_SHUTDOWNED)
6395                 InRecovery = true;
6396         else if (ArchiveRecoveryRequested)
6397         {
6398                 /* force recovery due to presence of recovery.conf */
6399                 InRecovery = true;
6400         }
6401
6402         /* REDO */
6403         if (InRecovery)
6404         {
6405                 int                     rmid;
6406
6407                 /* use volatile pointer to prevent code rearrangement */
6408                 volatile XLogCtlData *xlogctl = XLogCtl;
6409
6410                 /*
6411                  * Update pg_control to show that we are recovering and to show the
6412                  * selected checkpoint as the place we are starting from. We also mark
6413                  * pg_control with any minimum recovery stop point obtained from a
6414                  * backup history file.
6415                  */
6416                 dbstate_at_startup = ControlFile->state;
6417                 if (InArchiveRecovery)
6418                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6419                 else
6420                 {
6421                         ereport(LOG,
6422                                         (errmsg("database system was not properly shut down; "
6423                                                         "automatic recovery in progress")));
6424                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6425                                 ereport(LOG,
6426                                                 (errmsg("crash recovery starts in timeline %u "
6427                                                                 "and has target timeline %u",
6428                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
6429                                                                 recoveryTargetTLI)));
6430                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6431                 }
6432                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6433                 ControlFile->checkPoint = checkPointLoc;
6434                 ControlFile->checkPointCopy = checkPoint;
6435                 if (InArchiveRecovery)
6436                 {
6437                         /* initialize minRecoveryPoint if not set yet */
6438                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
6439                         {
6440                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6441                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6442                         }
6443                 }
6444
6445                 /*
6446                  * Set backupStartPoint if we're starting recovery from a base backup.
6447                  *
6448                  * Set backupEndPoint and use minRecoveryPoint as the backup end
6449                  * location if we're starting recovery from a base backup which was
6450                  * taken from the standby. In this case, the database system status in
6451                  * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If it does not,
6452                  * the backup is corrupted, so we cancel recovery.
6453                  */
6454                 if (haveBackupLabel)
6455                 {
6456                         ControlFile->backupStartPoint = checkPoint.redo;
6457                         ControlFile->backupEndRequired = backupEndRequired;
6458
6459                         if (backupFromStandby)
6460                         {
6461                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
6462                                         ereport(FATAL,
6463                                                         (errmsg("backup_label contains data inconsistent with control file"),
6464                                                          errhint("This means that the backup is corrupted and you will "
6465                                                            "have to use another backup for recovery.")));
6466                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6467                         }
6468                 }
6469                 ControlFile->time = (pg_time_t) time(NULL);
6470                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6471                 UpdateControlFile();
6472
6473                 /* initialize our local copy of minRecoveryPoint */
6474                 minRecoveryPoint = ControlFile->minRecoveryPoint;
6475                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6476
6477                 /*
6478                  * Reset pgstat data, because it may be invalid after recovery.
6479                  */
6480                 pgstat_reset_all();
6481
6482                 /*
6483                  * If there was a backup label file, it's done its job and the info
6484                  * has now been propagated into pg_control.  We must get rid of the
6485                  * label file so that if we crash during recovery, we'll pick up at
6486                  * the latest recovery restartpoint instead of going all the way back
6487                  * to the backup start point.  It seems prudent though to just rename
6488                  * the file out of the way rather than delete it completely.
6489                  */
6490                 if (haveBackupLabel)
6491                 {
6492                         unlink(BACKUP_LABEL_OLD);
6493                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
6494                                 ereport(FATAL,
6495                                                 (errcode_for_file_access(),
6496                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
6497                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
6498                 }
6499
6500                 /* Check that the GUCs used to generate the WAL allow recovery */
6501                 CheckRequiredParameterValues();
6502
6503                 /*
6504                  * We're in recovery, so unlogged relations may be trashed and must be
6505                  * reset.  This should be done BEFORE allowing Hot Standby
6506                  * connections, so that read-only backends don't try to read whatever
6507                  * garbage is left over from before.
6508                  */
6509                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6510
6511                 /*
6512                  * Likewise, delete any saved transaction snapshot files that got left
6513                  * behind by crashed backends.
6514                  */
6515                 DeleteAllExportedSnapshotFiles();
6516
6517                 /*
6518                  * Initialize for Hot Standby, if enabled. We won't let backends in
6519                  * yet, not until we've reached the min recovery point specified in
6520                  * control file and we've established a recovery snapshot from a
6521                  * running-xacts WAL record.
6522                  */
6523                 if (ArchiveRecoveryRequested && EnableHotStandby)
6524                 {
6525                         TransactionId *xids;
6526                         int                     nxids;
6527
6528                         ereport(DEBUG1,
6529                                         (errmsg("initializing for hot standby")));
6530
6531                         InitRecoveryTransactionEnvironment();
6532
6533                         if (wasShutdown)
6534                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6535                         else
6536                                 oldestActiveXID = checkPoint.oldestActiveXid;
6537                         Assert(TransactionIdIsValid(oldestActiveXID));
6538
6539                         /* Tell procarray about the range of xids it has to deal with */
6540                         ProcArrayInitRecovery(ShmemVariableCache->nextXid);
6541
6542                         /*
6543                          * Startup commit log and subtrans only. MultiXact has already
6544                          * been started up and other SLRUs are not maintained during
6545                          * recovery and need not be started yet.
6546                          */
6547                         StartupCLOG();
6548                         StartupSUBTRANS(oldestActiveXID);
6549
6550                         /*
6551                          * If we're beginning at a shutdown checkpoint, we know that
6552                          * nothing was running on the master at this point. So fake-up an
6553                          * empty running-xacts record and use that here and now. Recover
6554                          * additional standby state for prepared transactions.
6555                          */
6556                         if (wasShutdown)
6557                         {
6558                                 RunningTransactionsData running;
6559                                 TransactionId latestCompletedXid;
6560
6561                                 /*
6562                                  * Construct a RunningTransactions snapshot representing a
6563                                  * shut down server, with only prepared transactions still
6564                                  * alive. We're never overflowed at this point because all
6565                                  * subxids are listed with their parent prepared transactions.
6566                                  */
6567                                 running.xcnt = nxids;
6568                                 running.subxcnt = 0;
6569                                 running.subxid_overflow = false;
6570                                 running.nextXid = checkPoint.nextXid;
6571                                 running.oldestRunningXid = oldestActiveXID;
6572                                 latestCompletedXid = checkPoint.nextXid;
6573                                 TransactionIdRetreat(latestCompletedXid);
6574                                 Assert(TransactionIdIsNormal(latestCompletedXid));
6575                                 running.latestCompletedXid = latestCompletedXid;
6576                                 running.xids = xids;
6577
6578                                 ProcArrayApplyRecoveryInfo(&running);
6579
6580                                 StandbyRecoverPreparedTransactions(false);
6581                         }
6582                 }
6583
6584                 /* Initialize resource managers */
6585                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6586                 {
6587                         if (RmgrTable[rmid].rm_startup != NULL)
6588                                 RmgrTable[rmid].rm_startup();
6589                 }
6590
6591                 /*
6592                  * Initialize shared replayEndRecPtr, lastReplayedEndRecPtr, and
6593                  * recoveryLastXTime.
6594                  *
6595                  * This is slightly confusing if we're starting from an online
6596                  * checkpoint; we've just read and replayed the checkpoint record, but
6597                  * we're going to start replay from its redo pointer, which precedes
6598                  * the location of the checkpoint record itself. So even though the
6599                  * last record we've replayed is indeed ReadRecPtr, we haven't
6600                  * replayed all the preceding records yet. That's OK for the current
6601                  * use of these variables.
6602                  */
6603                 SpinLockAcquire(&xlogctl->info_lck);
6604                 xlogctl->replayEndRecPtr = ReadRecPtr;
6605                 xlogctl->replayEndTLI = ThisTimeLineID;
6606                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
6607                 xlogctl->lastReplayedTLI = ThisTimeLineID;
6608                 xlogctl->recoveryLastXTime = 0;
6609                 xlogctl->currentChunkStartTime = 0;
6610                 xlogctl->recoveryPause = false;
6611                 SpinLockRelease(&xlogctl->info_lck);
6612
6613                 /* Also ensure XLogReceiptTime has a sane value */
6614                 XLogReceiptTime = GetCurrentTimestamp();
6615
6616                 /*
6617                  * Let postmaster know we've started redo now, so that it can launch
6618                  * checkpointer to perform restartpoints.  We don't bother during
6619                  * crash recovery as restartpoints can only be performed during
6620                  * archive recovery.  And we'd like to keep crash recovery simple, to
6621                  * avoid introducing bugs that could affect you when recovering after
6622                  * a crash.
6623                  *
6624                  * After this point, we can no longer assume that we're the only
6625                  * process in addition to postmaster!  Also, fsync requests are
6626                  * subsequently to be handled by the checkpointer, not locally.
6627                  */
6628                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
6629                 {
6630                         PublishStartupProcessInformation();
6631                         SetForwardFsyncRequests();
6632                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6633                         bgwriterLaunched = true;
6634                 }
6635
6636                 /*
6637                  * Allow read-only connections immediately if we're consistent
6638                  * already.
6639                  */
6640                 CheckRecoveryConsistency();
6641
6642                 /*
6643                  * Find the first record that logically follows the checkpoint --- it
6644                  * might physically precede it, though.
6645                  */
6646                 if (checkPoint.redo < RecPtr)
6647                 {
6648                         /* back up to find the record */
6649                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
6650                 }
6651                 else
6652                 {
6653                         /* just have to read next record after CheckPoint */
6654                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6655                 }
6656
6657                 if (record != NULL)
6658                 {
6659                         bool            recoveryContinue = true;
6660                         bool            recoveryApply = true;
6661                         ErrorContextCallback errcallback;
6662                         TimestampTz xtime;
6663
6664                         InRedo = true;
6665
6666                         ereport(LOG,
6667                                         (errmsg("redo starts at %X/%X",
6668                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6669
6670                         /*
6671                          * main redo apply loop
6672                          */
6673                         do
6674                         {
6675                                 bool            switchedTLI = false;
6676
6677 #ifdef WAL_DEBUG
6678                                 if (XLOG_DEBUG ||
6679                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6680                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6681                                 {
6682                                         StringInfoData buf;
6683
6684                                         initStringInfo(&buf);
6685                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6686                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
6687                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
6688                                         xlog_outrec(&buf, record);
6689                                         appendStringInfoString(&buf, " - ");
6690                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
6691                                                                                                            record->xl_info,
6692                                                                                                          XLogRecGetData(record));
6693                                         elog(LOG, "%s", buf.data);
6694                                         pfree(buf.data);
6695                                 }
6696 #endif
6697
6698                                 /* Handle interrupt signals of startup process */
6699                                 HandleStartupProcInterrupts();
6700
6701                                 /*
6702                                  * Pause WAL replay, if requested by a hot-standby session via
6703                                  * SetRecoveryPause().
6704                                  *
6705                                  * Note that we intentionally don't take the info_lck spinlock
6706                                  * here.  We might therefore read a slightly stale value of
6707                                  * the recoveryPause flag, but it can't be very stale (no
6708                                  * worse than the last spinlock we did acquire).  Since a
6709                                  * pause request is a pretty asynchronous thing anyway,
6710                                  * possibly responding to it one WAL record later than we
6711                                  * otherwise would is a minor issue, so it doesn't seem worth
6712                                  * adding another spinlock cycle to prevent that.
6713                                  */
6714                                 if (xlogctl->recoveryPause)
6715                                         recoveryPausesHere();
6716
6717                                 /*
6718                                  * Have we reached our recovery target?
6719                                  */
6720                                 if (recoveryStopsHere(record, &recoveryApply))
6721                                 {
6722                                         if (recoveryPauseAtTarget)
6723                                         {
6724                                                 SetRecoveryPause(true);
6725                                                 recoveryPausesHere();
6726                                         }
6727                                         reachedStopPoint = true;        /* see below */
6728                                         recoveryContinue = false;
6729
6730                                         /* Exit loop if we reached non-inclusive recovery target */
6731                                         if (!recoveryApply)
6732                                                 break;
6733                                 }
6734
6735                                 /* Setup error traceback support for ereport() */
6736                                 errcallback.callback = rm_redo_error_callback;
6737                                 errcallback.arg = (void *) record;
6738                                 errcallback.previous = error_context_stack;
6739                                 error_context_stack = &errcallback;
6740
6741                                 /*
6742                                  * ShmemVariableCache->nextXid must be beyond record's xid.
6743                                  *
6744                                  * We don't expect anyone else to modify nextXid, hence we
6745                                  * don't need to hold a lock while examining it.  We still
6746                                  * acquire the lock to modify it, though.
6747                                  */
6748                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
6749                                                                                                  ShmemVariableCache->nextXid))
6750                                 {
6751                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
6752                                         ShmemVariableCache->nextXid = record->xl_xid;
6753                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
6754                                         LWLockRelease(XidGenLock);
6755                                 }
6756
6757                                 /*
6758                                  * Before replaying this record, check if this record causes
6759                                  * the current timeline to change. The record is already
6760                                  * considered to be part of the new timeline, so we update
6761                                  * ThisTimeLineID before replaying it. That's important so
6762                                  * that replayEndTLI, which is recorded as the minimum
6763                                  * recovery point's TLI if recovery stops after this record,
6764                                  * is set correctly.
6765                                  */
6766                                 if (record->xl_rmid == RM_XLOG_ID)
6767                                 {
6768                                         TimeLineID      newTLI = ThisTimeLineID;
6769                                         TimeLineID      prevTLI = ThisTimeLineID;
6770                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
6771
6772                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
6773                                         {
6774                                                 CheckPoint      checkPoint;
6775
6776                                                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6777                                                 newTLI = checkPoint.ThisTimeLineID;
6778                                                 prevTLI = checkPoint.PrevTimeLineID;
6779                                         }
6780                                         else if (info == XLOG_END_OF_RECOVERY)
6781                                         {
6782                                                 xl_end_of_recovery xlrec;
6783
6784                                                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
6785                                                 newTLI = xlrec.ThisTimeLineID;
6786                                                 prevTLI = xlrec.PrevTimeLineID;
6787                                         }
6788
6789                                         if (newTLI != ThisTimeLineID)
6790                                         {
6791                                                 /* Check that it's OK to switch to this TLI */
6792                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
6793
6794                                                 /* Following WAL records should be run with new TLI */
6795                                                 ThisTimeLineID = newTLI;
6796                                                 switchedTLI = true;
6797                                         }
6798                                 }
6799
6800                                 /*
6801                                  * Update shared replayEndRecPtr before replaying this record,
6802                                  * so that XLogFlush will update minRecoveryPoint correctly.
6803                                  */
6804                                 SpinLockAcquire(&xlogctl->info_lck);
6805                                 xlogctl->replayEndRecPtr = EndRecPtr;
6806                                 xlogctl->replayEndTLI = ThisTimeLineID;
6807                                 SpinLockRelease(&xlogctl->info_lck);
6808
6809                                 /*
6810                                  * If we are attempting to enter Hot Standby mode, process
6811                                  * XIDs we see
6812                                  */
6813                                 if (standbyState >= STANDBY_INITIALIZED &&
6814                                         TransactionIdIsValid(record->xl_xid))
6815                                         RecordKnownAssignedTransactionIds(record->xl_xid);
6816
6817                                 /* Now apply the WAL record itself */
6818                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
6819
6820                                 /* Pop the error context stack */
6821                                 error_context_stack = errcallback.previous;
6822
6823                                 /*
6824                                  * Update lastReplayedEndRecPtr after this record has been
6825                                  * successfully replayed.
6826                                  */
6827                                 SpinLockAcquire(&xlogctl->info_lck);
6828                                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
6829                                 xlogctl->lastReplayedTLI = ThisTimeLineID;
6830                                 SpinLockRelease(&xlogctl->info_lck);
6831
6832                                 /* Remember this record as the last-applied one */
6833                                 LastRec = ReadRecPtr;
6834
6835                                 /* Allow read-only connections if we're consistent now */
6836                                 CheckRecoveryConsistency();
6837
6838                                 /*
6839                                  * If this record was a timeline switch, wake up any
6840                                  * walsenders to notice that we are on a new timeline.
6841                                  */
6842                                 if (switchedTLI && AllowCascadeReplication())
6843                                         WalSndWakeup();
6844
6845                                 /* Exit loop if we reached inclusive recovery target */
6846                                 if (!recoveryContinue)
6847                                         break;
6848
6849                                 /* Else, try to fetch the next WAL record */
6850                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6851                         } while (record != NULL);
6852
6853                         /*
6854                          * end of main redo apply loop
6855                          */
6856
6857                         ereport(LOG,
6858                                         (errmsg("redo done at %X/%X",
6859                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6860                         xtime = GetLatestXTime();
6861                         if (xtime)
6862                                 ereport(LOG,
6863                                          (errmsg("last completed transaction was at log time %s",
6864                                                          timestamptz_to_str(xtime))));
6865                         InRedo = false;
6866                 }
6867                 else
6868                 {
6869                         /* there are no WAL records following the checkpoint */
6870                         ereport(LOG,
6871                                         (errmsg("redo is not required")));
6872                 }
6873         }
6874
6875         /*
6876          * Kill WAL receiver, if it's still running, before we continue to write
6877          * the startup checkpoint record. It will trump over the checkpoint and
6878          * subsequent records if it's still alive when we start writing WAL.
6879          */
6880         ShutdownWalRcv();
6881
6882         /*
6883          * We don't need the latch anymore. It's not strictly necessary to disown
6884          * it, but let's do it for the sake of tidiness.
6885          */
6886         if (StandbyModeRequested)
6887                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
6888
6889         /*
6890          * We are now done reading the xlog from stream. Turn off streaming
6891          * recovery to force fetching the files (which would be required at end of
6892          * recovery, e.g., timeline history file) from archive or pg_xlog.
6893          */
6894         StandbyMode = false;
6895
6896         /*
6897          * Re-fetch the last valid or last applied record, so we can identify the
6898          * exact endpoint of what we consider the valid portion of WAL.
6899          */
6900         record = ReadRecord(xlogreader, LastRec, PANIC, false);
6901         EndOfLog = EndRecPtr;
6902         XLByteToPrevSeg(EndOfLog, endLogSegNo);
6903
6904         /*
6905          * Complain if we did not roll forward far enough to render the backup
6906          * dump consistent.  Note: it is indeed okay to look at the local variable
6907          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
6908          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
6909          * advanced beyond the WAL we processed.
6910          */
6911         if (InRecovery &&
6912                 (EndOfLog < minRecoveryPoint ||
6913                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
6914         {
6915                 if (reachedStopPoint)
6916                 {
6917                         /* stopped because of stop request */
6918                         ereport(FATAL,
6919                                         (errmsg("requested recovery stop point is before consistent recovery point")));
6920                 }
6921
6922                 /*
6923                  * Ran off end of WAL before reaching end-of-backup WAL record, or
6924                  * minRecoveryPoint. That's usually a bad sign, indicating that you
6925                  * tried to recover from an online backup but never called
6926                  * pg_stop_backup(), or you didn't archive all the WAL up to that
6927                  * point. However, this also happens in crash recovery, if the system
6928                  * crashes while an online backup is in progress. We must not treat
6929                  * that as an error, or the database will refuse to start up.
6930                  */
6931                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
6932                 {
6933                         if (ControlFile->backupEndRequired)
6934                                 ereport(FATAL,
6935                                                 (errmsg("WAL ends before end of online backup"),
6936                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
6937                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
6938                                 ereport(FATAL,
6939                                                 (errmsg("WAL ends before end of online backup"),
6940                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
6941                         else
6942                                 ereport(FATAL,
6943                                           (errmsg("WAL ends before consistent recovery point")));
6944                 }
6945         }
6946
6947         /*
6948          * Consider whether we need to assign a new timeline ID.
6949          *
6950          * If we are doing an archive recovery, we always assign a new ID.      This
6951          * handles a couple of issues.  If we stopped short of the end of WAL
6952          * during recovery, then we are clearly generating a new timeline and must
6953          * assign it a unique new ID.  Even if we ran to the end, modifying the
6954          * current last segment is problematic because it may result in trying to
6955          * overwrite an already-archived copy of that segment, and we encourage
6956          * DBAs to make their archive_commands reject that.  We can dodge the
6957          * problem by making the new active segment have a new timeline ID.
6958          *
6959          * In a normal crash recovery, we can just extend the timeline we were in.
6960          */
6961         PrevTimeLineID = ThisTimeLineID;
6962         if (ArchiveRecoveryRequested)
6963         {
6964                 char            reason[200];
6965
6966                 Assert(InArchiveRecovery);
6967
6968                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
6969                 ereport(LOG,
6970                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
6971
6972                 /*
6973                  * Create a comment for the history file to explain why and where
6974                  * timeline changed.
6975                  */
6976                 if (recoveryTarget == RECOVERY_TARGET_XID)
6977                         snprintf(reason, sizeof(reason),
6978                                          "%s transaction %u",
6979                                          recoveryStopAfter ? "after" : "before",
6980                                          recoveryStopXid);
6981                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6982                         snprintf(reason, sizeof(reason),
6983                                          "%s %s\n",
6984                                          recoveryStopAfter ? "after" : "before",
6985                                          timestamptz_to_str(recoveryStopTime));
6986                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6987                         snprintf(reason, sizeof(reason),
6988                                          "at restore point \"%s\"",
6989                                          recoveryStopName);
6990                 else
6991                         snprintf(reason, sizeof(reason), "no recovery target specified");
6992
6993                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
6994                                                          EndRecPtr, reason);
6995         }
6996
6997         /* Save the selected TimeLineID in shared memory, too */
6998         XLogCtl->ThisTimeLineID = ThisTimeLineID;
6999         XLogCtl->PrevTimeLineID = PrevTimeLineID;
7000
7001         /*
7002          * We are now done reading the old WAL.  Turn off archive fetching if it
7003          * was active, and make a writable copy of the last WAL segment. (Note
7004          * that we also have a copy of the last block of the old WAL in readBuf;
7005          * we will use that below.)
7006          */
7007         if (ArchiveRecoveryRequested)
7008                 exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
7009
7010         /*
7011          * Prepare to write WAL starting at EndOfLog position, and init xlog
7012          * buffer cache using the block containing the last record from the
7013          * previous incarnation.
7014          */
7015         openLogSegNo = endLogSegNo;
7016         openLogFile = XLogFileOpen(openLogSegNo);
7017         openLogOff = 0;
7018         Insert = &XLogCtl->Insert;
7019         Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7020         Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7021
7022         /*
7023          * Tricky point here: readBuf contains the *last* block that the LastRec
7024          * record spans, not the one it starts in.      The last block is indeed the
7025          * one we want to use.
7026          */
7027         if (EndOfLog % XLOG_BLCKSZ != 0)
7028         {
7029                 char       *page;
7030                 int                     len;
7031                 int                     firstIdx;
7032                 XLogRecPtr      pageBeginPtr;
7033
7034                 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7035                 Assert(readOff == pageBeginPtr % XLogSegSize);
7036
7037                 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7038
7039                 /* Copy the valid part of the last block, and zero the rest */
7040                 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7041                 len = EndOfLog % XLOG_BLCKSZ;
7042                 memcpy(page, xlogreader->readBuf, len);
7043                 memset(page + len, 0, XLOG_BLCKSZ - len);
7044
7045                 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7046                 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7047         }
7048         else
7049         {
7050                 /*
7051                  * There is no partial block to copy. Just set InitializedUpTo,
7052                  * and let the first attempt to insert a log record to initialize
7053                  * the next buffer.
7054                  */
7055                 XLogCtl->InitializedUpTo = EndOfLog;
7056         }
7057
7058         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7059
7060         XLogCtl->LogwrtResult = LogwrtResult;
7061
7062         XLogCtl->LogwrtRqst.Write = EndOfLog;
7063         XLogCtl->LogwrtRqst.Flush = EndOfLog;
7064
7065         /* Pre-scan prepared transactions to find out the range of XIDs present */
7066         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7067
7068         /*
7069          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7070          * record before resource manager writes cleanup WAL records or checkpoint
7071          * record is written.
7072          */
7073         Insert->fullPageWrites = lastFullPageWrites;
7074         LocalSetXLogInsertAllowed();
7075         UpdateFullPageWrites();
7076         LocalXLogInsertAllowed = -1;
7077
7078         if (InRecovery)
7079         {
7080                 int                     rmid;
7081
7082                 /*
7083                  * Resource managers might need to write WAL records, eg, to record
7084                  * index cleanup actions.  So temporarily enable XLogInsertAllowed in
7085                  * this process only.
7086                  */
7087                 LocalSetXLogInsertAllowed();
7088
7089                 /*
7090                  * Allow resource managers to do any required cleanup.
7091                  */
7092                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7093                 {
7094                         if (RmgrTable[rmid].rm_cleanup != NULL)
7095                                 RmgrTable[rmid].rm_cleanup();
7096                 }
7097
7098                 /* Disallow XLogInsert again */
7099                 LocalXLogInsertAllowed = -1;
7100
7101                 /*
7102                  * Perform a checkpoint to update all our recovery activity to disk.
7103                  *
7104                  * Note that we write a shutdown checkpoint rather than an on-line
7105                  * one. This is not particularly critical, but since we may be
7106                  * assigning a new TLI, using a shutdown checkpoint allows us to have
7107                  * the rule that TLI only changes in shutdown checkpoints, which
7108                  * allows some extra error checking in xlog_redo.
7109                  *
7110                  * In fast promotion, only create a lightweight end-of-recovery record
7111                  * instead of a full checkpoint. A checkpoint is requested later,
7112                  * after we're fully out of recovery mode and already accepting
7113                  * queries.
7114                  */
7115                 if (bgwriterLaunched)
7116                 {
7117                         if (fast_promote)
7118                         {
7119                                 checkPointLoc = ControlFile->prevCheckPoint;
7120
7121                                 /*
7122                                  * Confirm the last checkpoint is available for us to recover
7123                                  * from if we fail. Note that we don't check for the secondary
7124                                  * checkpoint since that isn't available in most base backups.
7125                                  */
7126                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7127                                 if (record != NULL)
7128                                 {
7129                                         fast_promoted = true;
7130
7131                                         /*
7132                                          * Insert a special WAL record to mark the end of
7133                                          * recovery, since we aren't doing a checkpoint. That
7134                                          * means that the checkpointer process may likely be in
7135                                          * the middle of a time-smoothed restartpoint and could
7136                                          * continue to be for minutes after this. That sounds
7137                                          * strange, but the effect is roughly the same and it
7138                                          * would be stranger to try to come out of the
7139                                          * restartpoint and then checkpoint. We request a
7140                                          * checkpoint later anyway, just for safety.
7141                                          */
7142                                         CreateEndOfRecoveryRecord();
7143                                 }
7144                         }
7145
7146                         if (!fast_promoted)
7147                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7148                                                                   CHECKPOINT_IMMEDIATE |
7149                                                                   CHECKPOINT_WAIT);
7150                 }
7151                 else
7152                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7153
7154                 /*
7155                  * And finally, execute the recovery_end_command, if any.
7156                  */
7157                 if (recoveryEndCommand)
7158                         ExecuteRecoveryCommand(recoveryEndCommand,
7159                                                                    "recovery_end_command",
7160                                                                    true);
7161         }
7162
7163         /*
7164          * Preallocate additional log files, if wanted.
7165          */
7166         PreallocXlogFiles(EndOfLog);
7167
7168         /*
7169          * Reset initial contents of unlogged relations.  This has to be done
7170          * AFTER recovery is complete so that any unlogged relations created
7171          * during recovery also get picked up.
7172          */
7173         if (InRecovery)
7174                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7175
7176         /*
7177          * Okay, we're officially UP.
7178          */
7179         InRecovery = false;
7180
7181         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7182         ControlFile->state = DB_IN_PRODUCTION;
7183         ControlFile->time = (pg_time_t) time(NULL);
7184         UpdateControlFile();
7185         LWLockRelease(ControlFileLock);
7186
7187         /* start the archive_timeout timer running */
7188         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7189
7190         /* also initialize latestCompletedXid, to nextXid - 1 */
7191         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7192         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7193         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7194         LWLockRelease(ProcArrayLock);
7195
7196         /*
7197          * Start up the commit log and subtrans, if not already done for hot
7198          * standby.
7199          */
7200         if (standbyState == STANDBY_DISABLED)
7201         {
7202                 StartupCLOG();
7203                 StartupSUBTRANS(oldestActiveXID);
7204         }
7205
7206         /*
7207          * Perform end of recovery actions for any SLRUs that need it.
7208          */
7209         TrimCLOG();
7210         TrimMultiXact();
7211
7212         /* Reload shared-memory state for prepared transactions */
7213         RecoverPreparedTransactions();
7214
7215         /*
7216          * Shutdown the recovery environment. This must occur after
7217          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
7218          */
7219         if (standbyState != STANDBY_DISABLED)
7220                 ShutdownRecoveryTransactionEnvironment();
7221
7222         /* Shut down xlogreader */
7223         if (readFile >= 0)
7224         {
7225                 close(readFile);
7226                 readFile = -1;
7227         }
7228         XLogReaderFree(xlogreader);
7229
7230         /*
7231          * If any of the critical GUCs have changed, log them before we allow
7232          * backends to write WAL.
7233          */
7234         LocalSetXLogInsertAllowed();
7235         XLogReportParameters();
7236
7237         /*
7238          * All done.  Allow backends to write WAL.      (Although the bool flag is
7239          * probably atomic in itself, we use the info_lck here to ensure that
7240          * there are no race conditions concerning visibility of other recent
7241          * updates to shared memory.)
7242          */
7243         {
7244                 /* use volatile pointer to prevent code rearrangement */
7245                 volatile XLogCtlData *xlogctl = XLogCtl;
7246
7247                 SpinLockAcquire(&xlogctl->info_lck);
7248                 xlogctl->SharedRecoveryInProgress = false;
7249                 SpinLockRelease(&xlogctl->info_lck);
7250         }
7251
7252         /*
7253          * If there were cascading standby servers connected to us, nudge any wal
7254          * sender processes to notice that we've been promoted.
7255          */
7256         WalSndWakeup();
7257
7258         /*
7259          * If this was a fast promotion, request an (online) checkpoint now. This
7260          * isn't required for consistency, but the last restartpoint might be far
7261          * back, and in case of a crash, recovering from it might take a longer
7262          * than is appropriate now that we're not in standby mode anymore.
7263          */
7264         if (fast_promoted)
7265                 RequestCheckpoint(CHECKPOINT_FORCE);
7266 }
7267
7268 /*
7269  * Checks if recovery has reached a consistent state. When consistency is
7270  * reached and we have a valid starting standby snapshot, tell postmaster
7271  * that it can start accepting read-only connections.
      *
      * Takes no arguments and returns nothing; all results are side effects:
      *   - once EndRecPtr has reached ControlFile->backupEndPoint, the backup
      *     markers in the control file are cleared and the file is rewritten;
      *   - reachedConsistency is set (and a LOG message emitted) the first
      *     time the consistency conditions below hold;
      *   - SharedHotStandbyActive and LocalHotStandbyActive are set and the
      *     postmaster is signalled, at most once in this process.
      *
      * The redo loop calls this after every replayed record, so each stage is
      * guarded by a condition that turns false once the stage has run.
7272  */
7273 static void
7274 CheckRecoveryConsistency(void)
7275 {
7276         /*
7277          * During crash recovery, we don't reach a consistent state until we've
7278          * replayed all the WAL.  An invalid minRecoveryPoint is how that case
              * is recognized here, and nothing below is attempted for it.
7279          */
7280         if (XLogRecPtrIsInvalid(minRecoveryPoint))
7281                 return;
7282
7283         /*
7284          * Have we reached the point where our base backup was completed?
7285          */
7286         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7287                 ControlFile->backupEndPoint <= EndRecPtr)
7288         {
7289                 /*
7290                  * We have reached the end of base backup, as indicated by pg_control.
7291                  * The data on disk is now consistent. Reset backupStartPoint and
7292                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
7293                  * allow starting up at an earlier point even if recovery is stopped
7294                  * and restarted soon after this.
                      *
                      * Note that it is ControlFile->minRecoveryPoint that is advanced
                      * here (and only forward); the separate minRecoveryPoint variable
                      * tested in the consistency check below is not assigned in this
                      * function.
7295                  */
7296                 elog(DEBUG1, "end of backup reached");
7297
7298                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7299
7300                 if (ControlFile->minRecoveryPoint < EndRecPtr)
7301                         ControlFile->minRecoveryPoint = EndRecPtr;
7302
7303                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
7304                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
7305                 ControlFile->backupEndRequired = false;
7306                 UpdateControlFile();
7307
7308                 LWLockRelease(ControlFileLock);
7309         }
7310
7311         /*
7312          * Have we passed our safe starting point? Note that minRecoveryPoint is
7313          * known to be incorrectly set if ControlFile->backupEndRequired, until
7314          * the end-of-backup WAL record (presumably XLOG_BACKUP_END) arrives to
7315          * advise us of the correct minRecoveryPoint. All we know prior to that
7316          * is that we're not consistent yet.
              *
              * backupStartPoint must also have been cleared, as is done above once
              * the backup end point has been reached.  The !reachedConsistency
              * test makes this stage run only once.
              *
              * NOTE(review): XLogCtl->lastReplayedEndRecPtr is read here without
              * holding info_lck.  The only writer visible in this part of the file
              * is the redo loop, which calls this function itself, so this is
              * presumably safe -- confirm there are no other writers.
7317          */
7318         if (!reachedConsistency && !ControlFile->backupEndRequired &&
7319                 minRecoveryPoint <= XLogCtl->lastReplayedEndRecPtr &&
7320                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7321         {
7322                 /*
7323                  * Check to see if the XLOG sequence contained any unresolved
7324                  * references to uninitialized pages.
7325                  */
7326                 XLogCheckInvalidPages();
7327
7328                 reachedConsistency = true;
7329                 ereport(LOG,
7330                                 (errmsg("consistent recovery state reached at %X/%X",
7331                                                 (uint32) (XLogCtl->lastReplayedEndRecPtr >> 32),
7332                                                 (uint32) XLogCtl->lastReplayedEndRecPtr)));
7333         }
7334
7335         /*
7336          * Have we got a valid starting snapshot that will allow queries to be
7337          * run? If so, we can tell postmaster that the database is consistent now,
7338          * enabling connections.
              *
              * This is tested independently of the stage above, so it also fires
              * on a later call if the snapshot becomes ready only after
              * consistency was reached.  Nothing is signalled unless we are
              * running under a postmaster, and the !LocalHotStandbyActive test
              * keeps the signal from being sent more than once.
7339          */
7340         if (standbyState == STANDBY_SNAPSHOT_READY &&
7341                 !LocalHotStandbyActive &&
7342                 reachedConsistency &&
7343                 IsUnderPostmaster)
7344         {
7345                 /* use volatile pointer to prevent code rearrangement */
7346                 volatile XLogCtlData *xlogctl = XLogCtl;
7347
                     /* publish the shared flag under info_lck before signalling */
7348                 SpinLockAcquire(&xlogctl->info_lck);
7349                 xlogctl->SharedHotStandbyActive = true;
7350                 SpinLockRelease(&xlogctl->info_lck);
7351
7352                 LocalHotStandbyActive = true;
7353
7354                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7355         }
7356 }
7357
7358 /*
7359  * Is the system still in recovery?
7360  *
7361  * Unlike testing InRecovery, this works in any process that's connected to
7362  * shared memory.
7363  *
7364  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7365  * variables the first time we see that recovery is finished.
      *
      * Returns true while SharedRecoveryInProgress in shared memory is still
      * set, and false once it has been seen cleared.  That answer is cached in
      * LocalRecoveryInProgress, so every later call in the same process returns
      * false without looking at shared memory again.
7366  */
7367 bool
7368 RecoveryInProgress(void)
7369 {
7370         /*
7371          * We check shared state each time only until we leave recovery mode. We
7372          * can't re-enter recovery, so there's no need to keep checking after the
7373          * shared variable has once been seen false.
7374          */
7375         if (!LocalRecoveryInProgress)
7376                 return false;
7377         else
7378         {
7379                 /*
7380                  * use volatile pointer to make sure we make a fresh read of the
7381                  * shared variable.
                      *
                      * The flag is read without taking info_lck; the ordering that
                      * matters is supplied by the memory barrier below instead.
7382                  */
7383                 volatile XLogCtlData *xlogctl = XLogCtl;
7384
7385                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7386
7387                 /*
7388                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
7389                  * is finished. InitPostgres() relies upon this behaviour to ensure
7390                  * that InitXLOGAccess() is called at backend startup.  (If you change
7391                  * this, see also LocalSetXLogInsertAllowed.)
                      *
                      * Because the cached flag is now false, this branch is taken at
                      * most once per process.
7392                  */
7393                 if (!LocalRecoveryInProgress)
7394                 {
7395                         /*
7396                          * If we just exited recovery, make sure we read TimeLineID and
7397                          * RedoRecPtr after SharedRecoveryInProgress (for machines with
7398                          * weak memory ordering).
                              *
                              * The barrier must stay between the flag read above and the
                              * InitXLOGAccess() call; do not reorder these statements.
7399                          */
7400                         pg_memory_barrier();
7401                         InitXLOGAccess();
7402                 }
7403                 /*
7404                  * Note: We don't need a memory barrier when we're still in recovery.
7405                  * We might exit recovery immediately after return, so the caller
7406                  * can't rely on 'true' meaning that we're still in recovery anyway.
7407                  */
7408
7409                 return LocalRecoveryInProgress;
7410         }
7411 }
7412
7413 /*
7414  * Is HotStandby active yet? This is only important in special backends
7415  * since normal backends won't ever be able to connect until this returns
7416  * true. Postmaster knows this by way of signal, not via shared memory.
7417  *
7418  * Unlike testing standbyState, this works in any process that's connected to
7419  * shared memory.
7420  */
7421 bool
7422 HotStandbyActive(void)
7423 {
7424         /*
7425          * We check shared state each time only until Hot Standby is active. We
7426          * can't de-activate Hot Standby, so there's no need to keep checking
7427          * after the shared variable has once been seen true.
7428          */
7429         if (LocalHotStandbyActive)
7430                 return true;
7431         else
7432         {
7433                 /* use volatile pointer to prevent code rearrangement */
7434                 volatile XLogCtlData *xlogctl = XLogCtl;
7435
7436                 /* spinlock is essential on machines with weak memory ordering! */
7437                 SpinLockAcquire(&xlogctl->info_lck);
7438                 LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
7439                 SpinLockRelease(&xlogctl->info_lck);
7440
7441                 return LocalHotStandbyActive;
7442         }
7443 }
7444
7445 /*
7446  * Is this process allowed to insert new WAL records?
7447  *
7448  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7449  * But we also have provisions for forcing the result "true" or "false"
7450  * within specific processes regardless of the global state.
7451  */
7452 bool
7453 XLogInsertAllowed(void)
7454 {
7455         /*
7456          * If value is "unconditionally true" or "unconditionally false", just
7457          * return it.  This provides the normal fast path once recovery is known
7458          * done.
7459          */
7460         if (LocalXLogInsertAllowed >= 0)
7461                 return (bool) LocalXLogInsertAllowed;
7462
7463         /*
7464          * Else, must check to see if we're still in recovery.
7465          */
7466         if (RecoveryInProgress())
7467                 return false;
7468
7469         /*
7470          * On exit from recovery, reset to "unconditionally true", since there is
7471          * no need to keep checking.
7472          */
7473         LocalXLogInsertAllowed = 1;
7474         return true;
7475 }
7476
7477 /*
7478  * Make XLogInsertAllowed() return true in the current process only.
7479  *
7480  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
7481  * and even call LocalSetXLogInsertAllowed() again after that.
7482  */
7483 static void
7484 LocalSetXLogInsertAllowed(void)
7485 {
7486         Assert(LocalXLogInsertAllowed == -1);
7487         LocalXLogInsertAllowed = 1;
7488
7489         /* Initialize as RecoveryInProgress() would do when switching state */
7490         InitXLOGAccess();
7491 }
7492
7493 /*
7494  * Subroutine to try to fetch and validate a prior checkpoint record.
7495  *
7496  * whichChkpt identifies the checkpoint (merely for reporting purposes).
7497  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7498  */
7499 static XLogRecord *
7500 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
7501                                          int whichChkpt, bool report)
7502 {
7503         XLogRecord *record;
7504
7505         if (!XRecOffIsValid(RecPtr))
7506         {
7507                 if (!report)
7508                         return NULL;
7509
7510                 switch (whichChkpt)
7511                 {
7512                         case 1:
7513                                 ereport(LOG,
7514                                 (errmsg("invalid primary checkpoint link in control file")));
7515                                 break;
7516                         case 2:
7517                                 ereport(LOG,
7518                                                 (errmsg("invalid secondary checkpoint link in control file")));
7519                                 break;
7520                         default:
7521                                 ereport(LOG,
7522                                    (errmsg("invalid checkpoint link in backup_label file")));
7523                                 break;
7524                 }
7525                 return NULL;
7526         }
7527
7528         record = ReadRecord(xlogreader, RecPtr, LOG, true);
7529
7530         if (record == NULL)
7531         {
7532                 if (!report)
7533                         return NULL;
7534
7535                 switch (whichChkpt)
7536                 {
7537                         case 1:
7538                                 ereport(LOG,
7539                                                 (errmsg("invalid primary checkpoint record")));
7540                                 break;
7541                         case 2:
7542                                 ereport(LOG,
7543                                                 (errmsg("invalid secondary checkpoint record")));
7544                                 break;
7545                         default:
7546                                 ereport(LOG,
7547                                                 (errmsg("invalid checkpoint record")));
7548                                 break;
7549                 }
7550                 return NULL;
7551         }
7552         if (record->xl_rmid != RM_XLOG_ID)
7553         {
7554                 switch (whichChkpt)
7555                 {
7556                         case 1:
7557                                 ereport(LOG,
7558                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
7559                                 break;
7560                         case 2:
7561                                 ereport(LOG,
7562                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
7563                                 break;
7564                         default:
7565                                 ereport(LOG,
7566                                 (errmsg("invalid resource manager ID in checkpoint record")));
7567                                 break;
7568                 }
7569                 return NULL;
7570         }
7571         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
7572                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
7573         {
7574                 switch (whichChkpt)
7575                 {
7576                         case 1:
7577                                 ereport(LOG,
7578                                    (errmsg("invalid xl_info in primary checkpoint record")));
7579                                 break;
7580                         case 2:
7581                                 ereport(LOG,
7582                                  (errmsg("invalid xl_info in secondary checkpoint record")));
7583                                 break;
7584                         default:
7585                                 ereport(LOG,
7586                                                 (errmsg("invalid xl_info in checkpoint record")));
7587                                 break;
7588                 }
7589                 return NULL;
7590         }
7591         if (record->xl_len != sizeof(CheckPoint) ||
7592                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
7593         {
7594                 switch (whichChkpt)
7595                 {
7596                         case 1:
7597                                 ereport(LOG,
7598                                         (errmsg("invalid length of primary checkpoint record")));
7599                                 break;
7600                         case 2:
7601                                 ereport(LOG,
7602                                   (errmsg("invalid length of secondary checkpoint record")));
7603                                 break;
7604                         default:
7605                                 ereport(LOG,
7606                                                 (errmsg("invalid length of checkpoint record")));
7607                                 break;
7608                 }
7609                 return NULL;
7610         }
7611         return record;
7612 }
7613
7614 /*
7615  * This must be called during startup of a backend process, except that
7616  * it need not be called in a standalone backend (which does StartupXLOG
7617  * instead).  We need to initialize the local copies of ThisTimeLineID and
7618  * RedoRecPtr.
7619  *
7620  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
7621  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
7622  * unnecessary however, since the postmaster itself never touches XLOG anyway.
7623  */
7624 void
7625 InitXLOGAccess(void)
7626 {
7627         /* ThisTimeLineID doesn't change so we need no lock to copy it */
7628         ThisTimeLineID = XLogCtl->ThisTimeLineID;
7629         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
7630
7631         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
7632         (void) GetRedoRecPtr();
7633 }
7634
7635 /*
7636  * Return the current Redo pointer from shared memory.
7637  *
7638  * As a side-effect, the local RedoRecPtr copy is updated.
7639  */
7640 XLogRecPtr
7641 GetRedoRecPtr(void)
7642 {
7643         /* use volatile pointer to prevent code rearrangement */
7644         volatile XLogCtlData *xlogctl = XLogCtl;
7645         XLogRecPtr ptr;
7646
7647         /*
7648          * The possibly not up-to-date copy in XlogCtl is enough. Even if we
7649          * grabbed a WAL insertion slot to read the master copy, someone might
7650          * update it just after we've released the lock.
7651          */
7652         SpinLockAcquire(&xlogctl->info_lck);
7653         ptr = xlogctl->RedoRecPtr;
7654         SpinLockRelease(&xlogctl->info_lck);
7655
7656         if (RedoRecPtr < ptr)
7657                 RedoRecPtr = ptr;
7658
7659         return RedoRecPtr;
7660 }
7661
7662 /*
7663  * GetInsertRecPtr -- Returns the current insert position.
7664  *
7665  * NOTE: The value *actually* returned is the position of the last full
7666  * xlog page. It lags behind the real insert position by at most 1 page.
7667  * For that, we don't need to scan through WAL insertion slots, and an
7668  * approximation is enough for the current usage of this function.
7669  */
7670 XLogRecPtr
7671 GetInsertRecPtr(void)
7672 {
7673         /* use volatile pointer to prevent code rearrangement */
7674         volatile XLogCtlData *xlogctl = XLogCtl;
7675         XLogRecPtr      recptr;
7676
7677         SpinLockAcquire(&xlogctl->info_lck);
7678         recptr = xlogctl->LogwrtRqst.Write;
7679         SpinLockRelease(&xlogctl->info_lck);
7680
7681         return recptr;
7682 }
7683
7684 /*
7685  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
7686  * position known to be fsync'd to disk.
7687  */
7688 XLogRecPtr
7689 GetFlushRecPtr(void)
7690 {
7691         /* use volatile pointer to prevent code rearrangement */
7692         volatile XLogCtlData *xlogctl = XLogCtl;
7693         XLogRecPtr      recptr;
7694
7695         SpinLockAcquire(&xlogctl->info_lck);
7696         recptr = xlogctl->LogwrtResult.Flush;
7697         SpinLockRelease(&xlogctl->info_lck);
7698
7699         return recptr;
7700 }
7701
7702 /*
7703  * Get the time of the last xlog segment switch
7704  */
7705 pg_time_t
7706 GetLastSegSwitchTime(void)
7707 {
7708         pg_time_t       result;
7709
7710         /* Need WALWriteLock, but shared lock is sufficient */
7711         LWLockAcquire(WALWriteLock, LW_SHARED);
7712         result = XLogCtl->lastSegSwitchTime;
7713         LWLockRelease(WALWriteLock);
7714
7715         return result;
7716 }
7717
7718 /*
7719  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
7720  *
7721  * This is exported for use by code that would like to have 64-bit XIDs.
7722  * We don't really support such things, but all XIDs within the system
7723  * can be presumed "close to" the result, and thus the epoch associated
7724  * with them can be determined.
7725  */
7726 void
7727 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
7728 {
7729         uint32          ckptXidEpoch;
7730         TransactionId ckptXid;
7731         TransactionId nextXid;
7732
7733         /* Must read checkpoint info first, else have race condition */
7734         {
7735                 /* use volatile pointer to prevent code rearrangement */
7736                 volatile XLogCtlData *xlogctl = XLogCtl;
7737
7738                 SpinLockAcquire(&xlogctl->info_lck);
7739                 ckptXidEpoch = xlogctl->ckptXidEpoch;
7740                 ckptXid = xlogctl->ckptXid;
7741                 SpinLockRelease(&xlogctl->info_lck);
7742         }
7743
7744         /* Now fetch current nextXid */
7745         nextXid = ReadNewTransactionId();
7746
7747         /*
7748          * nextXid is certainly logically later than ckptXid.  So if it's
7749          * numerically less, it must have wrapped into the next epoch.
7750          */
7751         if (nextXid < ckptXid)
7752                 ckptXidEpoch++;
7753
7754         *xid = nextXid;
7755         *epoch = ckptXidEpoch;
7756 }
7757
7758 /*
7759  * This must be called ONCE during postmaster or standalone-backend shutdown
7760  */
7761 void
7762 ShutdownXLOG(int code, Datum arg)
7763 {
7764         /* Don't be chatty in standalone mode */
7765         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
7766                         (errmsg("shutting down")));
7767
7768         if (RecoveryInProgress())
7769                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7770         else
7771         {
7772                 /*
7773                  * If archiving is enabled, rotate the last XLOG file so that all the
7774                  * remaining records are archived (postmaster wakes up the archiver
7775                  * process one more time at the end of shutdown). The checkpoint
7776                  * record will go to the next XLOG file and won't be archived (yet).
7777                  */
7778                 if (XLogArchivingActive() && XLogArchiveCommandSet())
7779                         RequestXLogSwitch();
7780
7781                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7782         }
7783         ShutdownCLOG();
7784         ShutdownSUBTRANS();
7785         ShutdownMultiXact();
7786
7787         /* Don't be chatty in standalone mode */
7788         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
7789                         (errmsg("database system is shut down")));
7790 }
7791
7792 /*
7793  * Log start of a checkpoint.
7794  */
7795 static void
7796 LogCheckpointStart(int flags, bool restartpoint)
7797 {
7798         const char *msg;
7799
7800         /*
7801          * XXX: This is hopelessly untranslatable. We could call gettext_noop for
7802          * the main message, but what about all the flags?
7803          */
7804         if (restartpoint)
7805                 msg = "restartpoint starting:%s%s%s%s%s%s%s";
7806         else
7807                 msg = "checkpoint starting:%s%s%s%s%s%s%s";
7808
7809         elog(LOG, msg,
7810                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
7811                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
7812                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
7813                  (flags & CHECKPOINT_FORCE) ? " force" : "",
7814                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
7815                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
7816                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
7817 }
7818
7819 /*
7820  * Log end of a checkpoint.
7821  */
7822 static void
7823 LogCheckpointEnd(bool restartpoint)
7824 {
7825         long            write_secs,
7826                                 sync_secs,
7827                                 total_secs,
7828                                 longest_secs,
7829                                 average_secs;
7830         int                     write_usecs,
7831                                 sync_usecs,
7832                                 total_usecs,
7833                                 longest_usecs,
7834                                 average_usecs;
7835         uint64          average_sync_time;
7836
7837         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
7838
7839         TimestampDifference(CheckpointStats.ckpt_write_t,
7840                                                 CheckpointStats.ckpt_sync_t,
7841                                                 &write_secs, &write_usecs);
7842
7843         TimestampDifference(CheckpointStats.ckpt_sync_t,
7844                                                 CheckpointStats.ckpt_sync_end_t,
7845                                                 &sync_secs, &sync_usecs);
7846
7847         /* Accumulate checkpoint timing summary data, in milliseconds. */
7848         BgWriterStats.m_checkpoint_write_time +=
7849                 write_secs * 1000 + write_usecs / 1000;
7850         BgWriterStats.m_checkpoint_sync_time +=
7851                 sync_secs * 1000 + sync_usecs / 1000;
7852
7853         /*
7854          * All of the published timing statistics are accounted for.  Only
7855          * continue if a log message is to be written.
7856          */
7857         if (!log_checkpoints)
7858                 return;
7859
7860         TimestampDifference(CheckpointStats.ckpt_start_t,
7861                                                 CheckpointStats.ckpt_end_t,
7862                                                 &total_secs, &total_usecs);
7863
7864         /*
7865          * Timing values returned from CheckpointStats are in microseconds.
7866          * Convert to the second plus microsecond form that TimestampDifference
7867          * returns for homogeneous printing.
7868          */
7869         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
7870         longest_usecs = CheckpointStats.ckpt_longest_sync -
7871                 (uint64) longest_secs *1000000;
7872
7873         average_sync_time = 0;
7874         if (CheckpointStats.ckpt_sync_rels > 0)
7875                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
7876                         CheckpointStats.ckpt_sync_rels;
7877         average_secs = (long) (average_sync_time / 1000000);
7878         average_usecs = average_sync_time - (uint64) average_secs *1000000;
7879
7880         if (restartpoint)
7881                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
7882                          "%d transaction log file(s) added, %d removed, %d recycled; "
7883                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
7884                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
7885                          CheckpointStats.ckpt_bufs_written,
7886                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7887                          CheckpointStats.ckpt_segs_added,
7888                          CheckpointStats.ckpt_segs_removed,
7889                          CheckpointStats.ckpt_segs_recycled,
7890                          write_secs, write_usecs / 1000,
7891                          sync_secs, sync_usecs / 1000,
7892                          total_secs, total_usecs / 1000,
7893                          CheckpointStats.ckpt_sync_rels,
7894                          longest_secs, longest_usecs / 1000,
7895                          average_secs, average_usecs / 1000);
7896         else
7897                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
7898                          "%d transaction log file(s) added, %d removed, %d recycled; "
7899                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
7900                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
7901                          CheckpointStats.ckpt_bufs_written,
7902                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7903                          CheckpointStats.ckpt_segs_added,
7904                          CheckpointStats.ckpt_segs_removed,
7905                          CheckpointStats.ckpt_segs_recycled,
7906                          write_secs, write_usecs / 1000,
7907                          sync_secs, sync_usecs / 1000,
7908                          total_secs, total_usecs / 1000,
7909                          CheckpointStats.ckpt_sync_rels,
7910                          longest_secs, longest_usecs / 1000,
7911                          average_secs, average_usecs / 1000);
7912 }
7913
7914 /*
7915  * Perform a checkpoint --- either during shutdown, or on-the-fly
7916  *
7917  * flags is a bitwise OR of the following:
7918  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
7919  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
7920  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
7921  *              ignoring checkpoint_completion_target parameter.
7922  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
7923  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
7924  *              CHECKPOINT_END_OF_RECOVERY).
7925  *
7926  * Note: flags contains other bits, of interest here only for logging purposes.
7927  * In particular note that this routine is synchronous and does not pay
7928  * attention to CHECKPOINT_WAIT.
7929  *
7930  * If !shutdown then we are writing an online checkpoint. This is a very special
7931  * kind of operation and WAL record because the checkpoint action occurs over
7932  * a period of time yet logically occurs at just a single LSN. The logical
7933  * position of the WAL record (redo ptr) is the same or earlier than the
7934  * physical position. When we replay WAL we locate the checkpoint via its
7935  * physical position then read the redo ptr and actually start replay at the
7936  * earlier logical position. Note that we don't write *anything* to WAL at
7937  * the logical position, so that location could be any other kind of WAL record.
7938  * All of this mechanism allows us to continue working while we checkpoint.
7939  * As a result, timing of actions is critical here; note that this
7940  * function will likely take minutes to execute on a busy system.
7941  */
7942 void
7943 CreateCheckPoint(int flags)
7944 {
7945         /* use volatile pointer to prevent code rearrangement */
7946         volatile XLogCtlData *xlogctl = XLogCtl;
7947         bool            shutdown;
7948         CheckPoint      checkPoint;
7949         XLogRecPtr      recptr;
7950         XLogCtlInsert *Insert = &XLogCtl->Insert;
7951         XLogRecData rdata;
7952         uint32          freespace;
7953         XLogSegNo       _logSegNo;
7954         XLogRecPtr      curInsert;
7955         VirtualTransactionId *vxids;
7956         int                     nvxids;
7957
7958         /*
7959          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
7960          * issued at a different time.
7961          */
7962         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
7963                 shutdown = true;
7964         else
7965                 shutdown = false;
7966
7967         /* sanity check */
7968         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
7969                 elog(ERROR, "can't create a checkpoint during recovery");
7970
7971         /*
7972          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
7973          * (This is just pro forma, since in the present system structure there is
7974          * only one process that is allowed to issue checkpoints at any given
7975          * time.)
7976          */
7977         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
7978
7979         /*
7980          * Prepare to accumulate statistics.
7981          *
7982          * Note: because it is possible for log_checkpoints to change while a
7983          * checkpoint proceeds, we always accumulate stats, even if
7984          * log_checkpoints is currently off.
7985          */
7986         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7987         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7988
7989         /*
7990          * Use a critical section to force system panic if we have trouble.
7991          */
7992         START_CRIT_SECTION();
7993
7994         if (shutdown)
7995         {
7996                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7997                 ControlFile->state = DB_SHUTDOWNING;
7998                 ControlFile->time = (pg_time_t) time(NULL);
7999                 UpdateControlFile();
8000                 LWLockRelease(ControlFileLock);
8001         }
8002
8003         /*
8004          * Let smgr prepare for checkpoint; this has to happen before we determine
8005          * the REDO pointer.  Note that smgr must not do anything that'd have to
8006          * be undone if we decide no checkpoint is needed.
8007          */
8008         smgrpreckpt();
8009
8010         /* Begin filling in the checkpoint WAL record */
8011         MemSet(&checkPoint, 0, sizeof(checkPoint));
8012         checkPoint.time = (pg_time_t) time(NULL);
8013
8014         /*
8015          * For Hot Standby, derive the oldestActiveXid before we fix the redo
8016          * pointer. This allows us to begin accumulating changes to assemble our
8017          * starting snapshot of locks and transactions.
8018          */
8019         if (!shutdown && XLogStandbyInfoActive())
8020                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8021         else
8022                 checkPoint.oldestActiveXid = InvalidTransactionId;
8023
8024         /*
8025          * We must block concurrent insertions while examining insert state to
8026          * determine the checkpoint REDO pointer.
8027          */
8028         WALInsertSlotAcquire(true);
8029         curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8030
8031         /*
8032          * If this isn't a shutdown or forced checkpoint, and we have not inserted
8033          * any XLOG records since the start of the last checkpoint, skip the
8034          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
8035          * when the system is idle. That wastes log space, and more importantly it
8036          * exposes us to possible loss of both current and previous checkpoint
8037          * records if the machine crashes just as we're writing the update.
8038          * (Perhaps it'd make even more sense to checkpoint only when the previous
8039          * checkpoint record is in a different xlog page?)
8040          *
8041          * We have to make two tests to determine that nothing has happened since
8042          * the start of the last checkpoint: current insertion point must match
8043          * the end of the last checkpoint record, and its redo pointer must point
8044          * to itself.
8045          */
8046         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8047                                   CHECKPOINT_FORCE)) == 0)
8048         {
8049                 if (curInsert == ControlFile->checkPoint +
8050                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
8051                         ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
8052                 {
8053                         WALInsertSlotRelease();
8054                         LWLockRelease(CheckpointLock);
8055                         END_CRIT_SECTION();
8056                         return;
8057                 }
8058         }
8059
8060         /*
8061          * An end-of-recovery checkpoint is created before anyone is allowed to
8062          * write WAL. To allow us to write the checkpoint record, temporarily
8063          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
8064          * initialized, which we need here and in AdvanceXLInsertBuffer.)
8065          */
8066         if (flags & CHECKPOINT_END_OF_RECOVERY)
8067                 LocalSetXLogInsertAllowed();
8068
8069         checkPoint.ThisTimeLineID = ThisTimeLineID;
8070         if (flags & CHECKPOINT_END_OF_RECOVERY)
8071                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8072         else
8073                 checkPoint.PrevTimeLineID = ThisTimeLineID;
8074
8075         checkPoint.fullPageWrites = Insert->fullPageWrites;
8076
8077         /*
8078          * Compute new REDO record ptr = location of next XLOG record.
8079          *
8080          * NB: this is NOT necessarily where the checkpoint record itself will be,
8081          * since other backends may insert more XLOG records while we're off doing
8082          * the buffer flush work.  Those XLOG records are logically after the
8083          * checkpoint, even though physically before it.  Got that?
8084          */
8085         freespace = INSERT_FREESPACE(curInsert);
8086         if (freespace == 0)
8087         {
8088                 if (curInsert % XLogSegSize == 0)
8089                         curInsert += SizeOfXLogLongPHD;
8090                 else
8091                         curInsert += SizeOfXLogShortPHD;
8092         }
8093         checkPoint.redo = curInsert;
8094
8095         /*
8096          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8097          * must be done while holding the insertion slots.
8098          *
8099          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8100          * pointing past where it really needs to point.  This is okay; the only
8101          * consequence is that XLogInsert might back up whole buffers that it
8102          * didn't really need to.  We can't postpone advancing RedoRecPtr because
8103          * XLogInserts that happen while we are dumping buffers must assume that
8104          * their buffer changes are not included in the checkpoint.
8105          */
8106         RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
8107
8108         /*
8109          * Now we can release the WAL insertion slots, allowing other xacts to
8110          * proceed while we are flushing disk buffers.
8111          */
8112         WALInsertSlotRelease();
8113
8114         /* Update the info_lck-protected copy of RedoRecPtr as well */
8115         SpinLockAcquire(&xlogctl->info_lck);
8116         xlogctl->RedoRecPtr = checkPoint.redo;
8117         SpinLockRelease(&xlogctl->info_lck);
8118
8119         /*
8120          * If enabled, log checkpoint start.  We postpone this until now so as not
8121          * to log anything if we decided to skip the checkpoint.
8122          */
8123         if (log_checkpoints)
8124                 LogCheckpointStart(flags, false);
8125
8126         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8127
8128         /*
8129          * In some cases there are groups of actions that must all occur on one
8130          * side or the other of a checkpoint record. Before flushing the
8131          * checkpoint record we must explicitly wait for any backend currently
8132          * performing those groups of actions.
8133          *
8134          * One example is end of transaction, so we must wait for any transactions
8135          * that are currently in commit critical sections.      If an xact inserted
8136          * its commit record into XLOG just before the REDO point, then a crash
8137          * restart from the REDO point would not replay that record, which means
8138          * that our flushing had better include the xact's update of pg_clog.  So
8139          * we wait till he's out of his commit critical section before proceeding.
8140          * See notes in RecordTransactionCommit().
8141          *
8142          * Because we've already released the insertion slots, this test is a bit
8143          * fuzzy: it is possible that we will wait for xacts we didn't really need
8144          * to wait for.  But the delay should be short and it seems better to make
8145          * checkpoint take a bit longer than to hold off insertions longer than
8146          * necessary.
8147          * (In fact, the whole reason we have this issue is that xact.c does
8148          * commit record XLOG insertion and clog update as two separate steps
8149          * protected by different locks, but again that seems best on grounds of
8150          * minimizing lock contention.)
8151          *
8152          * A transaction that has not yet set delayChkpt when we look cannot be at
8153          * risk, since he's not inserted his commit record yet; and one that's
8154          * already cleared it is not at risk either, since he's done fixing clog
8155          * and we will correctly flush the update below.  So we cannot miss any
8156          * xacts we need to wait for.
8157          */
8158         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8159         if (nvxids > 0)
8160         {
8161                 do
8162                 {
8163                         pg_usleep(10000L);      /* wait for 10 msec */
8164                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8165         }
8166         pfree(vxids);
8167
8168         /*
8169          * Get the other info we need for the checkpoint record.
8170          */
8171         LWLockAcquire(XidGenLock, LW_SHARED);
8172         checkPoint.nextXid = ShmemVariableCache->nextXid;
8173         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8174         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8175         LWLockRelease(XidGenLock);
8176
8177         /* Increase XID epoch if we've wrapped around since last checkpoint */
8178         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8179         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8180                 checkPoint.nextXidEpoch++;
8181
8182         LWLockAcquire(OidGenLock, LW_SHARED);
8183         checkPoint.nextOid = ShmemVariableCache->nextOid;
8184         if (!shutdown)
8185                 checkPoint.nextOid += ShmemVariableCache->oidCount;
8186         LWLockRelease(OidGenLock);
8187
8188         MultiXactGetCheckptMulti(shutdown,
8189                                                          &checkPoint.nextMulti,
8190                                                          &checkPoint.nextMultiOffset,
8191                                                          &checkPoint.oldestMulti,
8192                                                          &checkPoint.oldestMultiDB);
8193
8194         /*
8195          * Having constructed the checkpoint record, ensure all shmem disk buffers
8196          * and commit-log buffers are flushed to disk.
8197          *
8198          * This I/O could fail for various reasons.  If so, we will fail to
8199          * complete the checkpoint, but there is no reason to force a system
8200          * panic. Accordingly, exit critical section while doing it.
8201          */
8202         END_CRIT_SECTION();
8203
8204         CheckPointGuts(checkPoint.redo, flags);
8205
8206         /*
8207          * Take a snapshot of running transactions and write this to WAL. This
8208          * allows us to reconstruct the state of running transactions during
8209          * archive recovery, if required. Skip, if this info disabled.
8210          *
8211          * If we are shutting down, or Startup process is completing crash
8212          * recovery we don't need to write running xact data.
8213          */
8214         if (!shutdown && XLogStandbyInfoActive())
8215                 LogStandbySnapshot();
8216
8217         START_CRIT_SECTION();
8218
8219         /*
8220          * Now insert the checkpoint record into XLOG.
8221          */
8222         rdata.data = (char *) (&checkPoint);
8223         rdata.len = sizeof(checkPoint);
8224         rdata.buffer = InvalidBuffer;
8225         rdata.next = NULL;
8226
8227         recptr = XLogInsert(RM_XLOG_ID,
8228                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
8229                                                 XLOG_CHECKPOINT_ONLINE,
8230                                                 &rdata);
8231
8232         XLogFlush(recptr);
8233
8234         /*
8235          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
8236          * overwritten at next startup.  No-one should even try, this just allows
8237          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
8238          * to just temporarily disable writing until the system has exited
8239          * recovery.
8240          */
8241         if (shutdown)
8242         {
8243                 if (flags & CHECKPOINT_END_OF_RECOVERY)
8244                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
8245                 else
8246                         LocalXLogInsertAllowed = 0; /* never again write WAL */
8247         }
8248
8249         /*
8250          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
8251          * = end of actual checkpoint record.
8252          */
8253         if (shutdown && checkPoint.redo != ProcLastRecPtr)
8254                 ereport(PANIC,
8255                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
8256
8257         /*
8258          * Select point at which we can truncate the log, which we base on the
8259          * prior checkpoint's earliest info.
8260          */
8261         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8262
8263         /*
8264          * Update the control file.
8265          */
8266         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8267         if (shutdown)
8268                 ControlFile->state = DB_SHUTDOWNED;
8269         ControlFile->prevCheckPoint = ControlFile->checkPoint;
8270         ControlFile->checkPoint = ProcLastRecPtr;
8271         ControlFile->checkPointCopy = checkPoint;
8272         ControlFile->time = (pg_time_t) time(NULL);
8273         /* crash recovery should always recover to the end of WAL */
8274         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
8275         ControlFile->minRecoveryPointTLI = 0;
8276
8277         /*
8278          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
8279          * unused on non-shutdown checkpoints, but seems useful to store it always
8280          * for debugging purposes.
8281          */
8282         SpinLockAcquire(&XLogCtl->ulsn_lck);
8283         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
8284         SpinLockRelease(&XLogCtl->ulsn_lck);
8285
8286         UpdateControlFile();
8287         LWLockRelease(ControlFileLock);
8288
8289         /* Update shared-memory copy of checkpoint XID/epoch */
8290         {
8291                 /* use volatile pointer to prevent code rearrangement */
8292                 volatile XLogCtlData *xlogctl = XLogCtl;
8293
8294                 SpinLockAcquire(&xlogctl->info_lck);
8295                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
8296                 xlogctl->ckptXid = checkPoint.nextXid;
8297                 SpinLockRelease(&xlogctl->info_lck);
8298         }
8299
8300         /*
8301          * We are now done with critical updates; no need for system panic if we
8302          * have trouble while fooling with old log segments.
8303          */
8304         END_CRIT_SECTION();
8305
8306         /*
8307          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
8308          */
8309         smgrpostckpt();
8310
8311         /*
8312          * Delete old log files (those no longer needed even for previous
8313          * checkpoint or the standbys in XLOG streaming).
8314          */
8315         if (_logSegNo)
8316         {
8317                 KeepLogSeg(recptr, &_logSegNo);
8318                 _logSegNo--;
8319                 RemoveOldXlogFiles(_logSegNo, recptr);
8320         }
8321
8322         /*
8323          * Make more log segments if needed.  (Do this after recycling old log
8324          * segments, since that may supply some of the needed files.)
8325          */
8326         if (!shutdown)
8327                 PreallocXlogFiles(recptr);
8328
8329         /*
8330          * Truncate pg_subtrans if possible.  We can throw away all data before
8331          * the oldest XMIN of any running transaction.  No future transaction will
8332          * attempt to reference any pg_subtrans entry older than that (see Asserts
8333          * in subtrans.c).      During recovery, though, we mustn't do this because
8334          * StartupSUBTRANS hasn't been called yet.
8335          */
8336         if (!RecoveryInProgress())
8337                 TruncateSUBTRANS(GetOldestXmin(true, false));
8338
8339         /* Real work is done, but log and update stats before releasing lock. */
8340         LogCheckpointEnd(false);
8341
8342         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
8343                                                                          NBuffers,
8344                                                                          CheckpointStats.ckpt_segs_added,
8345                                                                          CheckpointStats.ckpt_segs_removed,
8346                                                                          CheckpointStats.ckpt_segs_recycled);
8347
8348         LWLockRelease(CheckpointLock);
8349 }
8350
8351 /*
8352  * Mark the end of recovery in WAL though without running a full checkpoint.
8353  * We can expect that a restartpoint is likely to be in progress as we
8354  * do this, though we are unwilling to wait for it to complete. So be
8355  * careful to avoid taking the CheckpointLock anywhere here.
8356  *
8357  * CreateRestartPoint() allows for the case where recovery may end before
8358  * the restartpoint completes so there is no concern of concurrent behaviour.
8359  */
8360 void
8361 CreateEndOfRecoveryRecord(void)
8362 {
8363         xl_end_of_recovery xlrec;
8364         XLogRecData rdata;
8365         XLogRecPtr      recptr;
8366
8367         /* sanity check */
8368         if (!RecoveryInProgress())
8369                 elog(ERROR, "can only be used to end recovery");
8370
8371         xlrec.end_time = time(NULL);
8372
8373         WALInsertSlotAcquire(true);
8374         xlrec.ThisTimeLineID = ThisTimeLineID;
8375         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8376         WALInsertSlotRelease();
8377
8378         LocalSetXLogInsertAllowed();
8379
8380         START_CRIT_SECTION();
8381
8382         rdata.data = (char *) &xlrec;
8383         rdata.len = sizeof(xl_end_of_recovery);
8384         rdata.buffer = InvalidBuffer;
8385         rdata.next = NULL;
8386
8387         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
8388
8389         XLogFlush(recptr);
8390
8391         /*
8392          * Update the control file so that crash recovery can follow the timeline
8393          * changes to this point.
8394          */
8395         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8396         ControlFile->time = (pg_time_t) xlrec.end_time;
8397         ControlFile->minRecoveryPoint = recptr;
8398         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8399         UpdateControlFile();
8400         LWLockRelease(ControlFileLock);
8401
8402         END_CRIT_SECTION();
8403
8404         LocalXLogInsertAllowed = -1;    /* return to "check" state */
8405 }
8406
8407 /*
8408  * Flush all data in shared memory to disk, and fsync
8409  *
8410  * This is the common code shared between regular checkpoints and
8411  * recovery restartpoints.
 *
 * checkPointRedo is handed to CheckPointTwoPhase() and flags is handed to
 * CheckPointBuffers(); neither is interpreted here.  The per-subsystem
 * calls are made in a fixed order, with the two-phase state file
 * checkpoint deliberately placed last (see the comment in the body).
8412  */
8413 static void
8414 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
8415 {
8416         CheckPointCLOG();
8417         CheckPointSUBTRANS();
8418         CheckPointMultiXact();
8419         CheckPointPredicate();
8420         CheckPointRelationMap();
8421         CheckPointBuffers(flags);       /* performs all required fsyncs */
8422         /* We deliberately delay 2PC checkpointing as long as possible */
8423         CheckPointTwoPhase(checkPointRedo);
8424 }
8425
8426 /*
8427  * Save a checkpoint for recovery restart if appropriate
8428  *
8429  * This function is called each time a checkpoint record is read from XLOG.
8430  * It must determine whether the checkpoint represents a safe restartpoint or
8431  * not.  If so, the checkpoint record is stashed in shared memory so that
8432  * CreateRestartPoint can consult it.  (Note that the latter function is
8433  * executed by the checkpointer, while this one will be executed by the
8434  * startup process.)
8435  */
8436 static void
8437 RecoveryRestartPoint(const CheckPoint *checkPoint)
8438 {
8439         int                     rmid;
8440
8441         /* use volatile pointer to prevent code rearrangement */
8442         volatile XLogCtlData *xlogctl = XLogCtl;
8443
8444         /*
8445          * Is it safe to restartpoint?  We must ask each of the resource managers
8446          * whether they have any partial state information that might prevent a
8447          * correct restart from this point.  If so, we skip this opportunity, but
8448          * return at the next checkpoint record for another try.
8449          */
8450         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
8451         {
8452                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
8453                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
8454                         {
8455                                 elog(trace_recovery(DEBUG2),
8456                                          "RM %d not safe to record restart point at %X/%X",
8457                                          rmid,
8458                                          (uint32) (checkPoint->redo >> 32),
8459                                          (uint32) checkPoint->redo);
8460                                 return;
8461                         }
8462         }
8463
8464         /*
8465          * Also refrain from creating a restartpoint if we have seen any
8466          * references to non-existent pages. Restarting recovery from the
8467          * restartpoint would not see the references, so we would lose the
8468          * cross-check that the pages belonged to a relation that was dropped
8469          * later.
8470          */
8471         if (XLogHaveInvalidPages())
8472         {
8473                 elog(trace_recovery(DEBUG2),
8474                          "could not record restart point at %X/%X because there "
8475                          "are unresolved references to invalid pages",
8476                          (uint32) (checkPoint->redo >> 32),
8477                          (uint32) checkPoint->redo);
8478                 return;
8479         }
8480
8481         /*
8482          * Copy the checkpoint record to shared memory, so that checkpointer can
8483          * work out the next time it wants to perform a restartpoint.
8484          */
8485         SpinLockAcquire(&xlogctl->info_lck);
8486         xlogctl->lastCheckPointRecPtr = ReadRecPtr;
8487         xlogctl->lastCheckPoint = *checkPoint;
8488         SpinLockRelease(&xlogctl->info_lck);
8489 }
8490
8491 /*
8492  * Establish a restartpoint if possible.
8493  *
8494  * This is similar to CreateCheckPoint, but is used during WAL recovery
8495  * to establish a point from which recovery can roll forward without
8496  * replaying the entire recovery log.
8497  *
8498  * Returns true if a new restartpoint was established. We can only establish
8499  * a restartpoint if we have replayed a safe checkpoint record since last
8500  * restartpoint.
 *
 * flags is the set of checkpoint request bits; it is handed to
 * LogCheckpointStart() and CheckPointGuts(), and CHECKPOINT_IS_SHUTDOWN
 * additionally causes pg_control to be marked DB_SHUTDOWNED_IN_RECOVERY.
 *
 * Returns false, after releasing CheckpointLock, if recovery has already
 * ended or if no checkpoint record with a redo pointer newer than the one
 * in pg_control has been stashed by RecoveryRestartPoint().
8501  */
8502 bool
8503 CreateRestartPoint(int flags)
8504 {
8505         XLogRecPtr      lastCheckPointRecPtr;
8506         CheckPoint      lastCheckPoint;
8507         XLogSegNo       _logSegNo;
8508         TimestampTz xtime;
8509
8510         /* use volatile pointer to prevent code rearrangement */
8511         volatile XLogCtlData *xlogctl = XLogCtl;
8512
8513         /*
8514          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
8515          * happens at a time.
8516          */
8517         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8518
8519         /* Get a local copy of the last safe checkpoint record. */
8520         SpinLockAcquire(&xlogctl->info_lck);
8521         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
8522         lastCheckPoint = xlogctl->lastCheckPoint;
8523         SpinLockRelease(&xlogctl->info_lck);
8524
8525         /*
8526          * Check that we're still in recovery mode. It's ok if we exit recovery
8527          * mode after this check, the restart point is valid anyway.
8528          */
8529         if (!RecoveryInProgress())
8530         {
8531                 ereport(DEBUG2,
8532                           (errmsg("skipping restartpoint, recovery has already ended")));
8533                 LWLockRelease(CheckpointLock);
8534                 return false;
8535         }
8536
8537         /*
8538          * If the last checkpoint record we've replayed is already our last
8539          * restartpoint, we can't perform a new restart point. We still update
8540          * minRecoveryPoint in that case, so that if this is a shutdown restart
8541          * point, we won't start up earlier than before. That's not strictly
8542          * necessary, but when hot standby is enabled, it would be rather weird if
8543          * the database opened up for read-only connections at a point-in-time
8544          * before the last shutdown. Such time travel is still possible in case of
8545          * immediate shutdown, though.
8546          *
8547          * We don't explicitly advance minRecoveryPoint when we do create a
8548          * restartpoint. It's assumed that flushing the buffers will do that as a
8549          * side-effect.
8550          */
8551         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
8552                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
8553         {
8554                 ereport(DEBUG2,
8555                                 (errmsg("skipping restartpoint, already performed at %X/%X",
8556                                                 (uint32) (lastCheckPoint.redo >> 32),
8557                                                 (uint32) lastCheckPoint.redo)));
8558
8559                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
8560                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8561                 {
8562                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8563                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8564                         ControlFile->time = (pg_time_t) time(NULL);
8565                         UpdateControlFile();
8566                         LWLockRelease(ControlFileLock);
8567                 }
8568                 LWLockRelease(CheckpointLock);
8569                 return false;
8570         }
8571
8572         /*
8573          * Update the shared RedoRecPtr so that the startup process can calculate
8574          * the number of segments replayed since last restartpoint, and request a
8575          * restartpoint if it exceeds checkpoint_segments.
8576          *
8577          * Like in CreateCheckPoint(), hold off insertions to update it, although
8578          * during recovery this is just pro forma, because no WAL insertions are
8579          * happening.
8580          */
8581         WALInsertSlotAcquire(true);
8582         xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
8583         WALInsertSlotRelease();
8584
8585         /* Also update the info_lck-protected copy */
8586         SpinLockAcquire(&xlogctl->info_lck);
8587         xlogctl->RedoRecPtr = lastCheckPoint.redo;
8588         SpinLockRelease(&xlogctl->info_lck);
8589
8590         /*
8591          * Prepare to accumulate statistics.
8592          *
8593          * Note: because it is possible for log_checkpoints to change while a
8594          * checkpoint proceeds, we always accumulate stats, even if
8595          * log_checkpoints is currently off.
8596          */
8597         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8598         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8599
8600         if (log_checkpoints)
8601                 LogCheckpointStart(flags, true);
8602
8603         CheckPointGuts(lastCheckPoint.redo, flags);
8604
8605         /*
8606          * Select point at which we can truncate the xlog, which we base on the
8607          * prior checkpoint's earliest info.
8608          */
8609         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8610
8611         /*
8612          * Update pg_control, using current time.  Check that it still shows
8613          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
8614          * this is a quick hack to make sure nothing really bad happens if somehow
8615          * we get here after the end-of-recovery checkpoint.
8616          */
8617         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8618         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
8619                 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
8620         {
8621                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
8622                 ControlFile->checkPoint = lastCheckPointRecPtr;
8623                 ControlFile->checkPointCopy = lastCheckPoint;
8624                 ControlFile->time = (pg_time_t) time(NULL);
8625                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8626                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8627                 UpdateControlFile();
8628         }
8629         LWLockRelease(ControlFileLock);
8630
8631         /*
8632          * Due to an historical accident multixact truncations are not WAL-logged,
8633          * but just performed every time the mxact horizon is increased. So, unless
8634          * we explicitly execute truncations on a standby it will never clean out
8635          * /pg_multixact which obviously is bad, both because it uses space and
8636          * because we can wrap around into pre-existing data...
8637          *
8638          * We can only do the truncation here, after the UpdateControlFile()
8639          * above, because we've now safely established a restart point, that
8640          * guarantees we will not need to access those multis.
8641          *
8642          * It's probably worth improving this.
8643          */
8644         TruncateMultiXact(lastCheckPoint.oldestMulti);
8645
8646         /*
8647          * Delete old log files (those no longer needed even for previous
8648          * checkpoint/restartpoint) to prevent the disk holding the xlog from
8649          * growing full.
8650          */
8651         if (_logSegNo)
8652         {
8653                 XLogRecPtr      receivePtr;
8654                 XLogRecPtr      replayPtr;
8655                 TimeLineID      replayTLI;
8656                 XLogRecPtr      endptr;
8657
8658                 /*
8659                  * Get the current end of xlog replayed or received, whichever is
8660                  * later.
8661                  */
8662                 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
8663                 replayPtr = GetXLogReplayRecPtr(&replayTLI);
8664                 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
8665
                /* Let wal_keep_segments hold the cutoff back, then step to the segment before it */
8666                 KeepLogSeg(endptr, &_logSegNo);
8667                 _logSegNo--;
8668
8669                 /*
8670                  * Try to recycle segments on a useful timeline. If we've been promoted
8671                  * since the beginning of this restartpoint, use the new timeline
8672                  * chosen at end of recovery (RecoveryInProgress() sets ThisTimeLineID
8673                  * in that case). If we're still in recovery, use the timeline we're
8674                  * currently replaying.
8675                  *
8676                  * There is no guarantee that the WAL segments will be useful on the
8677                  * current timeline; if recovery proceeds to a new timeline right
8678                  * after this, the pre-allocated WAL segments on this timeline will
8679                  * not be used, and will go wasted until recycled on the next
8680                  * restartpoint. We'll live with that.
8681                  */
8682                 if (RecoveryInProgress())
8683                         ThisTimeLineID = replayTLI;
8684
8685                 RemoveOldXlogFiles(_logSegNo, endptr);
8686
8687                 /*
8688                  * Make more log segments if needed.  (Do this after recycling old log
8689                  * segments, since that may supply some of the needed files.)
8690                  */
8691                 PreallocXlogFiles(endptr);
8692
8693                 /*
8694                  * ThisTimeLineID is normally not set when we're still in recovery.
8695                  * However, recycling/preallocating segments above needed
8696                  * ThisTimeLineID to determine which timeline to install the segments
8697                  * on. Reset it now, to restore the normal state of affairs for
8698                  * debugging purposes.
8699                  */
8700                 if (RecoveryInProgress())
8701                         ThisTimeLineID = 0;
8702         }
8703
8704         /*
8705          * Truncate pg_subtrans if possible.  We can throw away all data before
8706          * the oldest XMIN of any running transaction.  No future transaction will
8707          * attempt to reference any pg_subtrans entry older than that (see Asserts
8708          * in subtrans.c).      When hot standby is disabled, though, we mustn't do
8709          * this because StartupSUBTRANS hasn't been called yet.
8710          */
8711         if (EnableHotStandby)
8712                 TruncateSUBTRANS(GetOldestXmin(true, false));
8713
8714         /* Real work is done, but log and update before releasing lock. */
8715         LogCheckpointEnd(true);
8716
        /* The errdetail is attached only when GetLatestXTime() returned a nonzero time */
8717         xtime = GetLatestXTime();
8718         ereport((log_checkpoints ? LOG : DEBUG2),
8719                         (errmsg("recovery restart point at %X/%X",
8720                  (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
8721                    xtime ? errdetail("last completed transaction was at log time %s",
8722                                                          timestamptz_to_str(xtime)) : 0));
8723
8724         LWLockRelease(CheckpointLock);
8725
8726         /*
8727          * Finally, execute archive_cleanup_command, if any.
8728          */
8729         if (XLogCtl->archiveCleanupCommand[0])
8730                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
8731                                                            "archive_cleanup_command",
8732                                                            false);
8733
8734         return true;
8735 }
8736
8737 /*
8738  * Retreat *logSegNo to the last segment that we need to retain because of
8739  * wal_keep_segments. This is calculated by subtracting wal_keep_segments
8740  * from the given xlog location, recptr.
8741  */
8742 static void
8743 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
8744 {
8745         XLogSegNo       segno;
8746
8747         if (wal_keep_segments == 0)
8748                 return;
8749
8750         XLByteToSeg(recptr, segno);
8751
8752         /* avoid underflow, don't go below 1 */
8753         if (segno <= wal_keep_segments)
8754                 segno = 1;
8755         else
8756                 segno = segno - wal_keep_segments;
8757
8758         /* don't delete WAL segments newer than the calculated segment */
8759         if (segno < *logSegNo)
8760                 *logSegNo = segno;
8761 }
8762
8763 /*
8764  * Write a NEXTOID log record
8765  */
8766 void
8767 XLogPutNextOid(Oid nextOid)
8768 {
8769         XLogRecData rdata;
8770
8771         rdata.data = (char *) (&nextOid);
8772         rdata.len = sizeof(Oid);
8773         rdata.buffer = InvalidBuffer;
8774         rdata.next = NULL;
8775         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
8776
8777         /*
8778          * We need not flush the NEXTOID record immediately, because any of the
8779          * just-allocated OIDs could only reach disk as part of a tuple insert or
8780          * update that would have its own XLOG record that must follow the NEXTOID
8781          * record.      Therefore, the standard buffer LSN interlock applied to those
8782          * records will ensure no such OID reaches disk before the NEXTOID record
8783          * does.
8784          *
8785          * Note, however, that the above statement only covers state "within" the
8786          * database.  When we use a generated OID as a file or directory name, we
8787          * are in a sense violating the basic WAL rule, because that filesystem
8788          * change may reach disk before the NEXTOID WAL record does.  The impact
8789          * of this is that if a database crash occurs immediately afterward, we
8790          * might after restart re-generate the same OID and find that it conflicts
8791          * with the leftover file or directory.  But since for safety's sake we
8792          * always loop until finding a nonconflicting filename, this poses no real
8793          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
8794          */
8795 }
8796
8797 /*
8798  * Write an XLOG SWITCH record.
8799  *
8800  * Here we just blindly issue an XLogInsert request for the record.
8801  * All the magic happens inside XLogInsert.
8802  *
8803  * The return value is either the end+1 address of the switch record,
8804  * or the end+1 address of the prior segment if we did not need to
8805  * write a switch record because we are already at segment start.
8806  */
8807 XLogRecPtr
8808 RequestXLogSwitch(void)
8809 {
8810         XLogRecPtr      RecPtr;
8811         XLogRecData rdata;
8812
8813         /* XLOG SWITCH, alone among xlog record types, has no data */
8814         rdata.buffer = InvalidBuffer;
8815         rdata.data = NULL;
8816         rdata.len = 0;
8817         rdata.next = NULL;
8818
8819         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
8820
8821         return RecPtr;
8822 }
8823
8824 /*
8825  * Write a RESTORE POINT record
8826  */
8827 XLogRecPtr
8828 XLogRestorePoint(const char *rpName)
8829 {
8830         XLogRecPtr      RecPtr;
8831         XLogRecData rdata;
8832         xl_restore_point xlrec;
8833
8834         xlrec.rp_time = GetCurrentTimestamp();
8835         strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
8836
8837         rdata.buffer = InvalidBuffer;
8838         rdata.data = (char *) &xlrec;
8839         rdata.len = sizeof(xl_restore_point);
8840         rdata.next = NULL;
8841
8842         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
8843
8844         ereport(LOG,
8845                         (errmsg("restore point \"%s\" created at %X/%X",
8846                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
8847
8848         return RecPtr;
8849 }
8850
8851 /*
8852  * Write a backup block if needed when we are setting a hint. Note that
8853  * this may be called for a variety of page types, not just heaps.
8854  *
8855  * Callable while holding just share lock on the buffer content.
8856  *
8857  * We can't use the plain backup block mechanism since that relies on the
8858  * Buffer being exclusively locked. Since some modifications (setting LSN, hint
8859  * bits) are allowed in a sharelocked buffer that can lead to wal checksum
8860  * failures. So instead we copy the page and insert the copied data as normal
8861  * record data.
8862  *
8863  * We only need to do something if page has not yet been full page written in
8864  * this checkpoint round. The LSN of the inserted wal record is returned if we
8865  * had to write, InvalidXLogRecPtr otherwise.
8866  *
8867  * It is possible that multiple concurrent backends could attempt to write WAL
8868  * records. In that case, multiple copies of the same block would be recorded
8869  * in separate WAL records by different backends, though that is still OK from
8870  * a correctness perspective.
8871  */
8872 XLogRecPtr
8873 XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
8874 {
8875         XLogRecPtr      recptr = InvalidXLogRecPtr;
8876         XLogRecPtr      lsn;
8877         XLogRecData rdata[2];
8878         BkpBlock        bkpb;
8879
8880         /*
8881          * Ensure no checkpoint can change our view of RedoRecPtr.
8882          */
8883         Assert(MyPgXact->delayChkpt);
8884
8885         /*
8886          * Update RedoRecPtr so XLogCheckBuffer can make the right decision
8887          */
8888         GetRedoRecPtr();
8889
8890         /*
8891          * Setup phony rdata element for use within XLogCheckBuffer only. We reuse
8892          * and reset rdata for any actual WAL record insert.
8893          */
8894         rdata[0].buffer = buffer;
8895         rdata[0].buffer_std = buffer_std;
8896
8897         /*
8898          * Check buffer while not holding an exclusive lock.
8899          */
8900         if (XLogCheckBuffer(rdata, false, &lsn, &bkpb))
8901         {
8902                 char            copied_buffer[BLCKSZ];
8903                 char       *origdata = (char *) BufferGetBlock(buffer);
8904
8905                 /*
8906                  * Copy buffer so we don't have to worry about concurrent hint bit or
8907                  * lsn updates. We assume pd_lower/upper cannot be changed without an
8908                  * exclusive lock, so the contents bkp are not racy.
8909                  *
8910                  * With buffer_std set to false, XLogCheckBuffer() sets hole_length and
8911                  * hole_offset to 0; so the following code is safe for either case.
8912                  */
8913                 memcpy(copied_buffer, origdata, bkpb.hole_offset);
8914                 memcpy(copied_buffer + bkpb.hole_offset,
8915                            origdata + bkpb.hole_offset + bkpb.hole_length,
8916                            BLCKSZ - bkpb.hole_offset - bkpb.hole_length);
8917
8918                 /*
8919                  * Header for backup block.
8920                  */
8921                 rdata[0].data = (char *) &bkpb;
8922                 rdata[0].len = sizeof(BkpBlock);
8923                 rdata[0].buffer = InvalidBuffer;
8924                 rdata[0].next = &(rdata[1]);
8925
8926                 /*
8927                  * Save copy of the buffer.
8928                  */
8929                 rdata[1].data = copied_buffer;
8930                 rdata[1].len = BLCKSZ - bkpb.hole_length;
8931                 rdata[1].buffer = InvalidBuffer;
8932                 rdata[1].next = NULL;
8933
8934                 recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI, rdata);
8935         }
8936
8937         return recptr;
8938 }
8939
8940 /*
8941  * Check if any of the GUC parameters that are critical for hot standby
8942  * have changed, and update the value in pg_control file if necessary.
8943  */
8944 static void
8945 XLogReportParameters(void)
8946 {
8947         if (wal_level != ControlFile->wal_level ||
8948                 MaxConnections != ControlFile->MaxConnections ||
8949                 max_worker_processes != ControlFile->max_worker_processes ||
8950                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
8951                 max_locks_per_xact != ControlFile->max_locks_per_xact)
8952         {
8953                 /*
8954                  * The change in number of backend slots doesn't need to be WAL-logged
8955                  * if archiving is not enabled, as you can't start archive recovery
8956                  * with wal_level=minimal anyway. We don't really care about the
8957                  * values in pg_control either if wal_level=minimal, but seems better
8958                  * to keep them up-to-date to avoid confusion.
8959                  */
8960                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
8961                 {
8962                         XLogRecData rdata;
8963                         xl_parameter_change xlrec;
8964
8965                         xlrec.MaxConnections = MaxConnections;
8966                         xlrec.max_worker_processes = max_worker_processes;
8967                         xlrec.max_prepared_xacts = max_prepared_xacts;
8968                         xlrec.max_locks_per_xact = max_locks_per_xact;
8969                         xlrec.wal_level = wal_level;
8970
8971                         rdata.buffer = InvalidBuffer;
8972                         rdata.data = (char *) &xlrec;
8973                         rdata.len = sizeof(xlrec);
8974                         rdata.next = NULL;
8975
8976                         XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
8977                 }
8978
8979                 ControlFile->MaxConnections = MaxConnections;
8980                 ControlFile->max_worker_processes = max_worker_processes;
8981                 ControlFile->max_prepared_xacts = max_prepared_xacts;
8982                 ControlFile->max_locks_per_xact = max_locks_per_xact;
8983                 ControlFile->wal_level = wal_level;
8984                 UpdateControlFile();
8985         }
8986 }
8987
8988 /*
8989  * Update full_page_writes in shared memory, and write an
8990  * XLOG_FPW_CHANGE record if necessary.
8991  *
8992  * Note: this function assumes there is no other process running
8993  * concurrently that could update it.
8994  */
8995 void
8996 UpdateFullPageWrites(void)
8997 {
8998         XLogCtlInsert *Insert = &XLogCtl->Insert;
8999
9000         /*
9001          * Do nothing if full_page_writes has not been changed.
9002          *
9003          * It's safe to check the shared full_page_writes without the lock,
9004          * because we assume that there is no concurrently running process which
9005          * can update it.
9006          */
9007         if (fullPageWrites == Insert->fullPageWrites)
9008                 return;
9009
9010         START_CRIT_SECTION();
9011
9012         /*
9013          * It's always safe to take full page images, even when not strictly
9014          * required, but not the other round. So if we're setting full_page_writes
9015          * to true, first set it true and then write the WAL record. If we're
9016          * setting it to false, first write the WAL record and then set the global
9017          * flag.
9018          */
9019         if (fullPageWrites)
9020         {
9021                 WALInsertSlotAcquire(true);
9022                 Insert->fullPageWrites = true;
9023                 WALInsertSlotRelease();
9024         }
9025
9026         /*
9027          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
9028          * full_page_writes during archive recovery, if required.
9029          */
9030         if (XLogStandbyInfoActive() && !RecoveryInProgress())
9031         {
9032                 XLogRecData rdata;
9033
9034                 rdata.data = (char *) (&fullPageWrites);
9035                 rdata.len = sizeof(bool);
9036                 rdata.buffer = InvalidBuffer;
9037                 rdata.next = NULL;
9038
9039                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
9040         }
9041
9042         if (!fullPageWrites)
9043         {
9044                 WALInsertSlotAcquire(true);
9045                 Insert->fullPageWrites = false;
9046                 WALInsertSlotRelease();
9047         }
9048         END_CRIT_SECTION();
9049 }
9050
9051 /*
9052  * Check that it's OK to switch to new timeline during recovery.
9053  *
9054  * 'lsn' is the address of the shutdown checkpoint record we're about to
9055  * replay. (Currently, timeline can only change at a shutdown checkpoint).
9056  */
9057 static void
9058 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9059 {
9060         /* Check that the record agrees on what the current (old) timeline is */
9061         if (prevTLI != ThisTimeLineID)
9062                 ereport(PANIC,
9063                                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9064                                                 prevTLI, ThisTimeLineID)));
9065
9066         /*
9067          * The new timeline better be in the list of timelines we expect to see,
9068          * according to the timeline history. It should also not decrease.
9069          */
9070         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9071                 ereport(PANIC,
9072                  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9073                                  newTLI, ThisTimeLineID)));
9074
9075         /*
9076          * If we have not yet reached min recovery point, and we're about to
9077          * switch to a timeline greater than the timeline of the min recovery
9078          * point: trouble. After switching to the new timeline, we could not
9079          * possibly visit the min recovery point on the correct timeline anymore.
9080          * This can happen if there is a newer timeline in the archive that
9081          * branched before the timeline the min recovery point is on, and you
9082          * attempt to do PITR to the new timeline.
9083          */
9084         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9085                 lsn < minRecoveryPoint &&
9086                 newTLI > minRecoveryPointTLI)
9087                 ereport(PANIC,
9088                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9089                                                 newTLI,
9090                                                 (uint32) (minRecoveryPoint >> 32),
9091                                                 (uint32) minRecoveryPoint,
9092                                                 minRecoveryPointTLI)));
9093
9094         /* Looks good */
9095 }
9096
9097 /*
9098  * XLOG resource manager's routines
9099  *
9100  * Definitions of info values are in include/catalog/pg_control.h, though
9101  * not all record types are related to control file updates.
9102  */
9103 void
9104 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
9105 {
9106         uint8           info = record->xl_info & ~XLR_INFO_MASK;
9107
9108         /* Backup blocks are not used by XLOG rmgr */
9109         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
9110
9111         if (info == XLOG_NEXTOID)
9112         {
9113                 Oid                     nextOid;
9114
9115                 /*
9116                  * We used to try to take the maximum of ShmemVariableCache->nextOid
9117                  * and the recorded nextOid, but that fails if the OID counter wraps
9118                  * around.      Since no OID allocation should be happening during replay
9119                  * anyway, better to just believe the record exactly.  We still take
9120                  * OidGenLock while setting the variable, just in case.
9121                  */
9122                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9123                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9124                 ShmemVariableCache->nextOid = nextOid;
9125                 ShmemVariableCache->oidCount = 0;
9126                 LWLockRelease(OidGenLock);
9127         }
9128         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9129         {
9130                 CheckPoint      checkPoint;
9131
9132                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9133                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
9134                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9135                 ShmemVariableCache->nextXid = checkPoint.nextXid;
9136                 LWLockRelease(XidGenLock);
9137                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9138                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9139                 ShmemVariableCache->oidCount = 0;
9140                 LWLockRelease(OidGenLock);
9141                 MultiXactSetNextMXact(checkPoint.nextMulti,
9142                                                           checkPoint.nextMultiOffset);
9143                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9144                 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
9145
9146                 /*
9147                  * If we see a shutdown checkpoint while waiting for an end-of-backup
9148                  * record, the backup was canceled and the end-of-backup record will
9149                  * never arrive.
9150                  */
9151                 if (ArchiveRecoveryRequested &&
9152                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9153                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9154                         ereport(PANIC,
9155                         (errmsg("online backup was canceled, recovery cannot continue")));
9156
9157                 /*
9158                  * If we see a shutdown checkpoint, we know that nothing was running
9159                  * on the master at this point. So fake-up an empty running-xacts
9160                  * record and use that here and now. Recover additional standby state
9161                  * for prepared transactions.
9162                  */
9163                 if (standbyState >= STANDBY_INITIALIZED)
9164                 {
9165                         TransactionId *xids;
9166                         int                     nxids;
9167                         TransactionId oldestActiveXID;
9168                         TransactionId latestCompletedXid;
9169                         RunningTransactionsData running;
9170
9171                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9172
9173                         /*
9174                          * Construct a RunningTransactions snapshot representing a shut
9175                          * down server, with only prepared transactions still alive. We're
9176                          * never overflowed at this point because all subxids are listed
9177                          * with their parent prepared transactions.
9178                          */
9179                         running.xcnt = nxids;
9180                         running.subxcnt = 0;
9181                         running.subxid_overflow = false;
9182                         running.nextXid = checkPoint.nextXid;
9183                         running.oldestRunningXid = oldestActiveXID;
9184                         latestCompletedXid = checkPoint.nextXid;
9185                         TransactionIdRetreat(latestCompletedXid);
9186                         Assert(TransactionIdIsNormal(latestCompletedXid));
9187                         running.latestCompletedXid = latestCompletedXid;
9188                         running.xids = xids;
9189
9190                         ProcArrayApplyRecoveryInfo(&running);
9191
9192                         StandbyRecoverPreparedTransactions(true);
9193                 }
9194
9195                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9196                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9197                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9198
9199                 /* Update shared-memory copy of checkpoint XID/epoch */
9200                 {
9201                         /* use volatile pointer to prevent code rearrangement */
9202                         volatile XLogCtlData *xlogctl = XLogCtl;
9203
9204                         SpinLockAcquire(&xlogctl->info_lck);
9205                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9206                         xlogctl->ckptXid = checkPoint.nextXid;
9207                         SpinLockRelease(&xlogctl->info_lck);
9208                 }
9209
9210                 /*
9211                  * We should've already switched to the new TLI before replaying this
9212                  * record.
9213                  */
9214                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9215                         ereport(PANIC,
9216                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9217                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9218
9219                 RecoveryRestartPoint(&checkPoint);
9220         }
9221         else if (info == XLOG_CHECKPOINT_ONLINE)
9222         {
9223                 CheckPoint      checkPoint;
9224
9225                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9226                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
9227                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9228                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
9229                                                                   checkPoint.nextXid))
9230                         ShmemVariableCache->nextXid = checkPoint.nextXid;
9231                 LWLockRelease(XidGenLock);
9232                 /* ... but still treat OID counter as exact */
9233                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9234                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9235                 ShmemVariableCache->oidCount = 0;
9236                 LWLockRelease(OidGenLock);
9237                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9238                                                                   checkPoint.nextMultiOffset);
9239                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9240                                                                   checkPoint.oldestXid))
9241                         SetTransactionIdLimit(checkPoint.oldestXid,
9242                                                                   checkPoint.oldestXidDB);
9243                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9244                                                            checkPoint.oldestMultiDB);
9245
9246                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9247                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9248                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9249
9250                 /* Update shared-memory copy of checkpoint XID/epoch */
9251                 {
9252                         /* use volatile pointer to prevent code rearrangement */
9253                         volatile XLogCtlData *xlogctl = XLogCtl;
9254
9255                         SpinLockAcquire(&xlogctl->info_lck);
9256                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9257                         xlogctl->ckptXid = checkPoint.nextXid;
9258                         SpinLockRelease(&xlogctl->info_lck);
9259                 }
9260
9261                 /* TLI should not change in an on-line checkpoint */
9262                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9263                         ereport(PANIC,
9264                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9265                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9266
9267                 RecoveryRestartPoint(&checkPoint);
9268         }
9269         else if (info == XLOG_END_OF_RECOVERY)
9270         {
9271                 xl_end_of_recovery xlrec;
9272
9273                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9274
9275                 /*
9276                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
9277                  * but this case is rarer and harder to test, so the benefit doesn't
9278                  * outweigh the potential extra cost of maintenance.
9279                  */
9280
9281                 /*
9282                  * We should've already switched to the new TLI before replaying this
9283                  * record.
9284                  */
9285                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
9286                         ereport(PANIC,
9287                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9288                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
9289         }
9290         else if (info == XLOG_NOOP)
9291         {
9292                 /* nothing to do here */
9293         }
9294         else if (info == XLOG_SWITCH)
9295         {
9296                 /* nothing to do here */
9297         }
9298         else if (info == XLOG_RESTORE_POINT)
9299         {
9300                 /* nothing to do here */
9301         }
9302         else if (info == XLOG_FPI)
9303         {
9304                 char       *data;
9305                 BkpBlock        bkpb;
9306
9307                 /*
9308                  * Full-page image (FPI) records contain a backup block stored "inline"
9309                  * in the normal data since the locking when writing hint records isn't
9310                  * sufficient to use the normal backup block mechanism, which assumes
9311                  * exclusive lock on the buffer supplied.
9312                  *
9313                  * Since the only change in these backup block are hint bits, there
9314                  * are no recovery conflicts generated.
9315                  *
9316                  * This also means there is no corresponding API call for this, so an
9317                  * smgr implementation has no need to implement anything. Which means
9318                  * nothing is needed in md.c etc
9319                  */
9320                 data = XLogRecGetData(record);
9321                 memcpy(&bkpb, data, sizeof(BkpBlock));
9322                 data += sizeof(BkpBlock);
9323
9324                 RestoreBackupBlockContents(lsn, bkpb, data, false, false);
9325         }
9326         else if (info == XLOG_BACKUP_END)
9327         {
9328                 XLogRecPtr      startpoint;
9329
9330                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
9331
9332                 if (ControlFile->backupStartPoint == startpoint)
9333                 {
9334                         /*
9335                          * We have reached the end of base backup, the point where
9336                          * pg_stop_backup() was done. The data on disk is now consistent.
9337                          * Reset backupStartPoint, and update minRecoveryPoint to make
9338                          * sure we don't allow starting up at an earlier point even if
9339                          * recovery is stopped and restarted soon after this.
9340                          */
9341                         elog(DEBUG1, "end of backup reached");
9342
9343                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9344
9345                         if (ControlFile->minRecoveryPoint < lsn)
9346                         {
9347                                 ControlFile->minRecoveryPoint = lsn;
9348                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9349                         }
9350                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
9351                         ControlFile->backupEndRequired = false;
9352                         UpdateControlFile();
9353
9354                         LWLockRelease(ControlFileLock);
9355                 }
9356         }
9357         else if (info == XLOG_PARAMETER_CHANGE)
9358         {
9359                 xl_parameter_change xlrec;
9360
9361                 /* Update our copy of the parameters in pg_control */
9362                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
9363
9364                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9365                 ControlFile->MaxConnections = xlrec.MaxConnections;
9366                 ControlFile->max_worker_processes = xlrec.max_worker_processes;
9367                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
9368                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
9369                 ControlFile->wal_level = xlrec.wal_level;
9370
9371                 /*
9372                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
9373                  * recover back up to this point before allowing hot standby again.
9374                  * This is particularly important if wal_level was set to 'archive'
9375                  * before, and is now 'hot_standby', to ensure you don't run queries
9376                  * against the WAL preceding the wal_level change. Same applies to
9377                  * decreasing max_* settings.
9378                  */
9379                 minRecoveryPoint = ControlFile->minRecoveryPoint;
9380                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9381                 if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
9382                 {
9383                         ControlFile->minRecoveryPoint = lsn;
9384                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9385                 }
9386
9387                 UpdateControlFile();
9388                 LWLockRelease(ControlFileLock);
9389
9390                 /* Check to see if any changes to max_connections give problems */
9391                 CheckRequiredParameterValues();
9392         }
9393         else if (info == XLOG_FPW_CHANGE)
9394         {
9395                 /* use volatile pointer to prevent code rearrangement */
9396                 volatile XLogCtlData *xlogctl = XLogCtl;
9397                 bool            fpw;
9398
9399                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
9400
9401                 /*
9402                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
9403                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
9404                  * full_page_writes has been disabled during online backup.
9405                  */
9406                 if (!fpw)
9407                 {
9408                         SpinLockAcquire(&xlogctl->info_lck);
9409                         if (xlogctl->lastFpwDisableRecPtr < ReadRecPtr)
9410                                 xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
9411                         SpinLockRelease(&xlogctl->info_lck);
9412                 }
9413
9414                 /* Keep track of full_page_writes */
9415                 lastFullPageWrites = fpw;
9416         }
9417 }
9418
#ifdef WAL_DEBUG

/*
 * Append a description of a WAL record's header to 'buf': the previous
 * record pointer, the xid, the data length, one marker per backup block
 * flag set in xl_info, and the name of the owning resource manager.
 */
static void
xlog_outrec(StringInfo buf, XLogRecord *record)
{
        uint32          prev_hi = (uint32) (record->xl_prev >> 32);
        uint32          prev_lo = (uint32) record->xl_prev;
        int                     blk;

        appendStringInfo(buf, "prev %X/%X; xid %u",
                                         prev_hi, prev_lo, record->xl_xid);

        appendStringInfo(buf, "; len %u", record->xl_len);

        /* Note each backup block slot flagged in the record's info bits */
        for (blk = 0; blk < XLR_MAX_BKP_BLOCKS; blk++)
        {
                if (!(record->xl_info & XLR_BKP_BLOCK(blk)))
                        continue;

                appendStringInfo(buf, "; bkpb%d", blk);
        }

        appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
}
#endif   /* WAL_DEBUG */
9443
9444
9445 /*
9446  * Return the (possible) sync flag used for opening a file, depending on the
9447  * value of the GUC wal_sync_method.
9448  */
9449 static int
9450 get_sync_bit(int method)
9451 {
9452         int                     o_direct_flag = 0;
9453
9454         /* If fsync is disabled, never open in sync mode */
9455         if (!enableFsync)
9456                 return 0;
9457
9458         /*
9459          * Optimize writes by bypassing kernel cache with O_DIRECT when using
9460          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
9461          * disabled, otherwise the archive command or walsender process will read
9462          * the WAL soon after writing it, which is guaranteed to cause a physical
9463          * read if we bypassed the kernel cache. We also skip the
9464          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
9465          * reason.
9466          *
9467          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
9468          * written by walreceiver is normally read by the startup process soon
9469          * after its written. Also, walreceiver performs unaligned writes, which
9470          * don't work with O_DIRECT, so it is required for correctness too.
9471          */
9472         if (!XLogIsNeeded() && !AmWalReceiverProcess())
9473                 o_direct_flag = PG_O_DIRECT;
9474
9475         switch (method)
9476         {
9477                         /*
9478                          * enum values for all sync options are defined even if they are
9479                          * not supported on the current platform.  But if not, they are
9480                          * not included in the enum option array, and therefore will never
9481                          * be seen here.
9482                          */
9483                 case SYNC_METHOD_FSYNC:
9484                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9485                 case SYNC_METHOD_FDATASYNC:
9486                         return 0;
9487 #ifdef OPEN_SYNC_FLAG
9488                 case SYNC_METHOD_OPEN:
9489                         return OPEN_SYNC_FLAG | o_direct_flag;
9490 #endif
9491 #ifdef OPEN_DATASYNC_FLAG
9492                 case SYNC_METHOD_OPEN_DSYNC:
9493                         return OPEN_DATASYNC_FLAG | o_direct_flag;
9494 #endif
9495                 default:
9496                         /* can't happen (unless we are out of sync with option array) */
9497                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
9498                         return 0;                       /* silence warning */
9499         }
9500 }
9501
9502 /*
9503  * GUC support
9504  */
9505 void
9506 assign_xlog_sync_method(int new_sync_method, void *extra)
9507 {
9508         if (sync_method != new_sync_method)
9509         {
9510                 /*
9511                  * To ensure that no blocks escape unsynced, force an fsync on the
9512                  * currently open log segment (if any).  Also, if the open flag is
9513                  * changing, close the log file so it will be reopened (with new flag
9514                  * bit) at next use.
9515                  */
9516                 if (openLogFile >= 0)
9517                 {
9518                         if (pg_fsync(openLogFile) != 0)
9519                                 ereport(PANIC,
9520                                                 (errcode_for_file_access(),
9521                                                  errmsg("could not fsync log segment %s: %m",
9522                                                           XLogFileNameP(ThisTimeLineID, openLogSegNo))));
9523                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
9524                                 XLogFileClose();
9525                 }
9526         }
9527 }
9528
9529
9530 /*
9531  * Issue appropriate kind of fsync (if any) for an XLOG output file.
9532  *
9533  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
9534  * 'log' and 'seg' are for error reporting purposes.
9535  */
9536 void
9537 issue_xlog_fsync(int fd, XLogSegNo segno)
9538 {
9539         switch (sync_method)
9540         {
9541                 case SYNC_METHOD_FSYNC:
9542                         if (pg_fsync_no_writethrough(fd) != 0)
9543                                 ereport(PANIC,
9544                                                 (errcode_for_file_access(),
9545                                                  errmsg("could not fsync log file %s: %m",
9546                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9547                         break;
9548 #ifdef HAVE_FSYNC_WRITETHROUGH
9549                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9550                         if (pg_fsync_writethrough(fd) != 0)
9551                                 ereport(PANIC,
9552                                                 (errcode_for_file_access(),
9553                                           errmsg("could not fsync write-through log file %s: %m",
9554                                                          XLogFileNameP(ThisTimeLineID, segno))));
9555                         break;
9556 #endif
9557 #ifdef HAVE_FDATASYNC
9558                 case SYNC_METHOD_FDATASYNC:
9559                         if (pg_fdatasync(fd) != 0)
9560                                 ereport(PANIC,
9561                                                 (errcode_for_file_access(),
9562                                                  errmsg("could not fdatasync log file %s: %m",
9563                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9564                         break;
9565 #endif
9566                 case SYNC_METHOD_OPEN:
9567                 case SYNC_METHOD_OPEN_DSYNC:
9568                         /* write synced it already */
9569                         break;
9570                 default:
9571                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
9572                         break;
9573         }
9574 }
9575
9576 /*
9577  * Return the filename of given log segment, as a palloc'd string.
9578  */
9579 char *
9580 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
9581 {
9582         char       *result = palloc(MAXFNAMELEN);
9583
9584         XLogFileName(result, tli, segno);
9585         return result;
9586 }
9587
9588 /*
9589  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
9590  * function. It creates the necessary starting checkpoint and constructs the
9591  * backup label file.
9592  *
9593  * There are two kind of backups: exclusive and non-exclusive. An exclusive
9594  * backup is started with pg_start_backup(), and there can be only one active
9595  * at a time. The backup label file of an exclusive backup is written to
9596  * $PGDATA/backup_label, and it is removed by pg_stop_backup().
9597  *
9598  * A non-exclusive backup is used for the streaming base backups (see
9599  * src/backend/replication/basebackup.c). The difference to exclusive backups
9600  * is that the backup label file is not written to disk. Instead, its would-be
9601  * contents are returned in *labelfile, and the caller is responsible for
9602  * including it in the backup archive as 'backup_label'. There can be many
9603  * non-exclusive backups active at the same time, and they don't conflict
9604  * with an exclusive backup either.
9605  *
9606  * Returns the minimum WAL position that must be present to restore from this
9607  * backup, and the corresponding timeline ID in *starttli_p.
9608  *
9609  * Every successfully started non-exclusive backup must be stopped by calling
9610  * do_pg_stop_backup() or do_pg_abort_backup().
9611  */
9612 XLogRecPtr
9613 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
9614                                    char **labelfile)
9615 {
9616         bool            exclusive = (labelfile == NULL);
9617         bool            backup_started_in_recovery = false;
9618         XLogRecPtr      checkpointloc;
9619         XLogRecPtr      startpoint;
9620         TimeLineID      starttli;
9621         pg_time_t       stamp_time;
9622         char            strfbuf[128];
9623         char            xlogfilename[MAXFNAMELEN];
9624         XLogSegNo       _logSegNo;
9625         struct stat stat_buf;
9626         FILE       *fp;
9627         StringInfoData labelfbuf;
9628
9629         backup_started_in_recovery = RecoveryInProgress();
9630
9631         if (!superuser() && !has_rolreplication(GetUserId()))
9632                 ereport(ERROR,
9633                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
9634                    errmsg("must be superuser or replication role to run a backup")));
9635
9636         /*
9637          * Currently only non-exclusive backup can be taken during recovery.
9638          */
9639         if (backup_started_in_recovery && exclusive)
9640                 ereport(ERROR,
9641                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9642                                  errmsg("recovery is in progress"),
9643                                  errhint("WAL control functions cannot be executed during recovery.")));
9644
9645         /*
9646          * During recovery, we don't need to check WAL level. Because, if WAL
9647          * level is not sufficient, it's impossible to get here during recovery.
9648          */
9649         if (!backup_started_in_recovery && !XLogIsNeeded())
9650                 ereport(ERROR,
9651                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9652                           errmsg("WAL level not sufficient for making an online backup"),
9653                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
9654
9655         if (strlen(backupidstr) > MAXPGPATH)
9656                 ereport(ERROR,
9657                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
9658                                  errmsg("backup label too long (max %d bytes)",
9659                                                 MAXPGPATH)));
9660
9661         /*
9662          * Mark backup active in shared memory.  We must do full-page WAL writes
9663          * during an on-line backup even if not doing so at other times, because
9664          * it's quite possible for the backup dump to obtain a "torn" (partially
9665          * written) copy of a database page if it reads the page concurrently with
9666          * our write to the same page.  This can be fixed as long as the first
9667          * write to the page in the WAL sequence is a full-page write. Hence, we
9668          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
9669          * are no dirty pages in shared memory that might get dumped while the
9670          * backup is in progress without having a corresponding WAL record.  (Once
9671          * the backup is complete, we need not force full-page writes anymore,
9672          * since we expect that any pages not modified during the backup interval
9673          * must have been correctly captured by the backup.)
9674          *
9675          * Note that forcePageWrites has no effect during an online backup from
9676          * the standby.
9677          *
9678          * We must hold all the insertion slots to change the value of
9679          * forcePageWrites, to ensure adequate interlocking against XLogInsert().
9680          */
9681         WALInsertSlotAcquire(true);
9682         if (exclusive)
9683         {
9684                 if (XLogCtl->Insert.exclusiveBackup)
9685                 {
9686                         WALInsertSlotRelease();
9687                         ereport(ERROR,
9688                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9689                                          errmsg("a backup is already in progress"),
9690                                          errhint("Run pg_stop_backup() and try again.")));
9691                 }
9692                 XLogCtl->Insert.exclusiveBackup = true;
9693         }
9694         else
9695                 XLogCtl->Insert.nonExclusiveBackups++;
9696         XLogCtl->Insert.forcePageWrites = true;
9697         WALInsertSlotRelease();
9698
9699         /* Ensure we release forcePageWrites if fail below */
9700         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9701         {
9702                 bool            gotUniqueStartpoint = false;
9703
9704                 /*
9705                  * Force an XLOG file switch before the checkpoint, to ensure that the
9706                  * WAL segment the checkpoint is written to doesn't contain pages with
9707                  * old timeline IDs.  That would otherwise happen if you called
9708                  * pg_start_backup() right after restoring from a PITR archive: the
9709                  * first WAL segment containing the startup checkpoint has pages in
9710                  * the beginning with the old timeline ID.      That can cause trouble at
9711                  * recovery: we won't have a history file covering the old timeline if
9712                  * pg_xlog directory was not included in the base backup and the WAL
9713                  * archive was cleared too before starting the backup.
9714                  *
9715                  * This also ensures that we have emitted a WAL page header that has
9716                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
9717                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
9718                  * compress out removable backup blocks, it won't remove any that
9719                  * occur after this point.
9720                  *
9721                  * During recovery, we skip forcing XLOG file switch, which means that
9722                  * the backup taken during recovery is not available for the special
9723                  * recovery case described above.
9724                  */
9725                 if (!backup_started_in_recovery)
9726                         RequestXLogSwitch();
9727
9728                 do
9729                 {
9730                         bool            checkpointfpw;
9731
9732                         /*
9733                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
9734                          * page problems, this guarantees that two successive backup runs
9735                          * will have different checkpoint positions and hence different
9736                          * history file names, even if nothing happened in between.
9737                          *
9738                          * During recovery, establish a restartpoint if possible. We use
9739                          * the last restartpoint as the backup starting checkpoint. This
9740                          * means that two successive backup runs can have same checkpoint
9741                          * positions.
9742                          *
9743                          * Since the fact that we are executing do_pg_start_backup()
9744                          * during recovery means that checkpointer is running, we can use
9745                          * RequestCheckpoint() to establish a restartpoint.
9746                          *
9747                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
9748                          * passing fast = true).  Otherwise this can take awhile.
9749                          */
9750                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
9751                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
9752
9753                         /*
9754                          * Now we need to fetch the checkpoint record location, and also
9755                          * its REDO pointer.  The oldest point in WAL that would be needed
9756                          * to restore starting from the checkpoint is precisely the REDO
9757                          * pointer.
9758                          */
9759                         LWLockAcquire(ControlFileLock, LW_SHARED);
9760                         checkpointloc = ControlFile->checkPoint;
9761                         startpoint = ControlFile->checkPointCopy.redo;
9762                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
9763                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
9764                         LWLockRelease(ControlFileLock);
9765
9766                         if (backup_started_in_recovery)
9767                         {
9768                                 /* use volatile pointer to prevent code rearrangement */
9769                                 volatile XLogCtlData *xlogctl = XLogCtl;
9770                                 XLogRecPtr      recptr;
9771
9772                                 /*
9773                                  * Check to see if all WAL replayed during online backup
9774                                  * (i.e., since last restartpoint used as backup starting
9775                                  * checkpoint) contain full-page writes.
9776                                  */
9777                                 SpinLockAcquire(&xlogctl->info_lck);
9778                                 recptr = xlogctl->lastFpwDisableRecPtr;
9779                                 SpinLockRelease(&xlogctl->info_lck);
9780
9781                                 if (!checkpointfpw || startpoint <= recptr)
9782                                         ereport(ERROR,
9783                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9784                                                    errmsg("WAL generated with full_page_writes=off was replayed "
9785                                                                   "since last restartpoint"),
9786                                                    errhint("This means that the backup being taken on the standby "
9787                                                                    "is corrupt and should not be used. "
9788                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
9789                                                                    "and then try an online backup again.")));
9790
9791                                 /*
9792                                  * During recovery, since we don't use the end-of-backup WAL
9793                                  * record and don't write the backup history file, the
9794                                  * starting WAL location doesn't need to be unique. This means
9795                                  * that two base backups started at the same time might use
9796                                  * the same checkpoint as starting locations.
9797                                  */
9798                                 gotUniqueStartpoint = true;
9799                         }
9800
9801                         /*
9802                          * If two base backups are started at the same time (in WAL sender
9803                          * processes), we need to make sure that they use different
9804                          * checkpoints as starting locations, because we use the starting
9805                          * WAL location as a unique identifier for the base backup in the
9806                          * end-of-backup WAL record and when we write the backup history
9807                          * file. Perhaps it would be better generate a separate unique ID
9808                          * for each backup instead of forcing another checkpoint, but
9809                          * taking a checkpoint right after another is not that expensive
9810                          * either because only few buffers have been dirtied yet.
9811                          */
9812                         WALInsertSlotAcquire(true);
9813                         if (XLogCtl->Insert.lastBackupStart < startpoint)
9814                         {
9815                                 XLogCtl->Insert.lastBackupStart = startpoint;
9816                                 gotUniqueStartpoint = true;
9817                         }
9818                         WALInsertSlotRelease();
9819                 } while (!gotUniqueStartpoint);
9820
9821                 XLByteToSeg(startpoint, _logSegNo);
9822                 XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);
9823
9824                 /*
9825                  * Construct backup label file
9826                  */
9827                 initStringInfo(&labelfbuf);
9828
9829                 /* Use the log timezone here, not the session timezone */
9830                 stamp_time = (pg_time_t) time(NULL);
9831                 pg_strftime(strfbuf, sizeof(strfbuf),
9832                                         "%Y-%m-%d %H:%M:%S %Z",
9833                                         pg_localtime(&stamp_time, log_timezone));
9834                 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
9835                          (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
9836                 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
9837                                          (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
9838                 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
9839                                                  exclusive ? "pg_start_backup" : "streamed");
9840                 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
9841                                                  backup_started_in_recovery ? "standby" : "master");
9842                 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
9843                 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
9844
9845                 /*
9846                  * Okay, write the file, or return its contents to caller.
9847                  */
9848                 if (exclusive)
9849                 {
9850                         /*
9851                          * Check for existing backup label --- implies a backup is already
9852                          * running.  (XXX given that we checked exclusiveBackup above,
9853                          * maybe it would be OK to just unlink any such label file?)
9854                          */
9855                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
9856                         {
9857                                 if (errno != ENOENT)
9858                                         ereport(ERROR,
9859                                                         (errcode_for_file_access(),
9860                                                          errmsg("could not stat file \"%s\": %m",
9861                                                                         BACKUP_LABEL_FILE)));
9862                         }
9863                         else
9864                                 ereport(ERROR,
9865                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9866                                                  errmsg("a backup is already in progress"),
9867                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
9868                                                                  BACKUP_LABEL_FILE)));
9869
9870                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
9871
9872                         if (!fp)
9873                                 ereport(ERROR,
9874                                                 (errcode_for_file_access(),
9875                                                  errmsg("could not create file \"%s\": %m",
9876                                                                 BACKUP_LABEL_FILE)));
9877                         if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
9878                                 fflush(fp) != 0 ||
9879                                 pg_fsync(fileno(fp)) != 0 ||
9880                                 ferror(fp) ||
9881                                 FreeFile(fp))
9882                                 ereport(ERROR,
9883                                                 (errcode_for_file_access(),
9884                                                  errmsg("could not write file \"%s\": %m",
9885                                                                 BACKUP_LABEL_FILE)));
9886                         pfree(labelfbuf.data);
9887                 }
9888                 else
9889                         *labelfile = labelfbuf.data;
9890         }
9891         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9892
9893         /*
9894          * We're done.  As a convenience, return the starting WAL location.
9895          */
9896         if (starttli_p)
9897                 *starttli_p = starttli;
9898         return startpoint;
9899 }
9900
9901 /* Error cleanup callback for pg_start_backup */
9902 static void
9903 pg_start_backup_callback(int code, Datum arg)
9904 {
9905         bool            exclusive = DatumGetBool(arg);
9906
9907         /* Update backup counters and forcePageWrites on failure */
9908         WALInsertSlotAcquire(true);
9909         if (exclusive)
9910         {
9911                 Assert(XLogCtl->Insert.exclusiveBackup);
9912                 XLogCtl->Insert.exclusiveBackup = false;
9913         }
9914         else
9915         {
9916                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9917                 XLogCtl->Insert.nonExclusiveBackups--;
9918         }
9919
9920         if (!XLogCtl->Insert.exclusiveBackup &&
9921                 XLogCtl->Insert.nonExclusiveBackups == 0)
9922         {
9923                 XLogCtl->Insert.forcePageWrites = false;
9924         }
9925         WALInsertSlotRelease();
9926 }
9927
9928 /*
9929  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
9930  * function.
9931  *
9932  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
9933  * the non-exclusive backup specified by 'labelfile'.
9934  *
9935  * Returns the last WAL position that must be present to restore from this
9936  * backup, and the corresponding timeline ID in *stoptli_p.
9937  */
9938 XLogRecPtr
9939 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
9940 {
9941         bool            exclusive = (labelfile == NULL);
9942         bool            backup_started_in_recovery = false;
9943         XLogRecPtr      startpoint;
9944         XLogRecPtr      stoppoint;
9945         TimeLineID      stoptli;
9946         XLogRecData rdata;
9947         pg_time_t       stamp_time;
9948         char            strfbuf[128];
9949         char            histfilepath[MAXPGPATH];
9950         char            startxlogfilename[MAXFNAMELEN];
9951         char            stopxlogfilename[MAXFNAMELEN];
9952         char            lastxlogfilename[MAXFNAMELEN];
9953         char            histfilename[MAXFNAMELEN];
9954         char            backupfrom[20];
9955         XLogSegNo       _logSegNo;
9956         FILE       *lfp;
9957         FILE       *fp;
9958         char            ch;
9959         int                     seconds_before_warning;
9960         int                     waits = 0;
9961         bool            reported_waiting = false;
9962         char       *remaining;
9963         char       *ptr;
9964         uint32          hi,
9965                                 lo;
9966
9967         backup_started_in_recovery = RecoveryInProgress();
9968
9969         if (!superuser() && !has_rolreplication(GetUserId()))
9970                 ereport(ERROR,
9971                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
9972                  (errmsg("must be superuser or replication role to run a backup"))));
9973
9974         /*
9975          * Currently only non-exclusive backup can be taken during recovery.
9976          */
9977         if (backup_started_in_recovery && exclusive)
9978                 ereport(ERROR,
9979                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9980                                  errmsg("recovery is in progress"),
9981                                  errhint("WAL control functions cannot be executed during recovery.")));
9982
9983         /*
9984          * During recovery, we don't need to check WAL level. Because, if WAL
9985          * level is not sufficient, it's impossible to get here during recovery.
9986          */
9987         if (!backup_started_in_recovery && !XLogIsNeeded())
9988                 ereport(ERROR,
9989                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9990                           errmsg("WAL level not sufficient for making an online backup"),
9991                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
9992
9993         /*
9994          * OK to update backup counters and forcePageWrites
9995          */
9996         WALInsertSlotAcquire(true);
9997         if (exclusive)
9998                 XLogCtl->Insert.exclusiveBackup = false;
9999         else
10000         {
10001                 /*
10002                  * The user-visible pg_start/stop_backup() functions that operate on
10003                  * exclusive backups can be called at any time, but for non-exclusive
10004                  * backups, it is expected that each do_pg_start_backup() call is
10005                  * matched by exactly one do_pg_stop_backup() call.
10006                  */
10007                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10008                 XLogCtl->Insert.nonExclusiveBackups--;
10009         }
10010
10011         if (!XLogCtl->Insert.exclusiveBackup &&
10012                 XLogCtl->Insert.nonExclusiveBackups == 0)
10013         {
10014                 XLogCtl->Insert.forcePageWrites = false;
10015         }
10016         WALInsertSlotRelease();
10017
10018         if (exclusive)
10019         {
10020                 /*
10021                  * Read the existing label file into memory.
10022                  */
10023                 struct stat statbuf;
10024                 int                     r;
10025
10026                 if (stat(BACKUP_LABEL_FILE, &statbuf))
10027                 {
10028                         if (errno != ENOENT)
10029                                 ereport(ERROR,
10030                                                 (errcode_for_file_access(),
10031                                                  errmsg("could not stat file \"%s\": %m",
10032                                                                 BACKUP_LABEL_FILE)));
10033                         ereport(ERROR,
10034                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10035                                          errmsg("a backup is not in progress")));
10036                 }
10037
10038                 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10039                 if (!lfp)
10040                 {
10041                         ereport(ERROR,
10042                                         (errcode_for_file_access(),
10043                                          errmsg("could not read file \"%s\": %m",
10044                                                         BACKUP_LABEL_FILE)));
10045                 }
10046                 labelfile = palloc(statbuf.st_size + 1);
10047                 r = fread(labelfile, statbuf.st_size, 1, lfp);
10048                 labelfile[statbuf.st_size] = '\0';
10049
10050                 /*
10051                  * Close and remove the backup label file
10052                  */
10053                 if (r != 1 || ferror(lfp) || FreeFile(lfp))
10054                         ereport(ERROR,
10055                                         (errcode_for_file_access(),
10056                                          errmsg("could not read file \"%s\": %m",
10057                                                         BACKUP_LABEL_FILE)));
10058                 if (unlink(BACKUP_LABEL_FILE) != 0)
10059                         ereport(ERROR,
10060                                         (errcode_for_file_access(),
10061                                          errmsg("could not remove file \"%s\": %m",
10062                                                         BACKUP_LABEL_FILE)));
10063         }
10064
10065         /*
10066          * Read and parse the START WAL LOCATION line (this code is pretty crude,
10067          * but we are not expecting any variability in the file format).
10068          */
10069         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
10070                            &hi, &lo, startxlogfilename,
10071                            &ch) != 4 || ch != '\n')
10072                 ereport(ERROR,
10073                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10074                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10075         startpoint = ((uint64) hi) << 32 | lo;
10076         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
10077
10078         /*
10079          * Parse the BACKUP FROM line. If we are taking an online backup from the
10080          * standby, we confirm that the standby has not been promoted during the
10081          * backup.
10082          */
10083         ptr = strstr(remaining, "BACKUP FROM:");
10084         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
10085                 ereport(ERROR,
10086                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10087                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10088         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
10089                 ereport(ERROR,
10090                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10091                                  errmsg("the standby was promoted during online backup"),
10092                                  errhint("This means that the backup being taken is corrupt "
10093                                                  "and should not be used. "
10094                                                  "Try taking another online backup.")));
10095
10096         /*
10097          * During recovery, we don't write an end-of-backup record. We assume that
10098          * pg_control was backed up last and its minimum recovery point can be
10099          * available as the backup end location. Since we don't have an
10100          * end-of-backup record, we use the pg_control value to check whether
10101          * we've reached the end of backup when starting recovery from this
10102          * backup. We have no way of checking if pg_control wasn't backed up last
10103          * however.
10104          *
10105          * We don't force a switch to new WAL file and wait for all the required
10106          * files to be archived. This is okay if we use the backup to start the
10107          * standby. But, if it's for an archive recovery, to ensure all the
10108          * required files are available, a user should wait for them to be
10109          * archived, or include them into the backup.
10110          *
10111          * We return the current minimum recovery point as the backup end
10112          * location. Note that it can be greater than the exact backup end
10113          * location if the minimum recovery point is updated after the backup of
10114          * pg_control. This is harmless for current uses.
10115          *
10116          * XXX currently a backup history file is for informational and debug
10117          * purposes only. It's not essential for an online backup. Furthermore,
10118          * even if it's created, it will not be archived during recovery because
10119          * an archiver is not invoked. So it doesn't seem worthwhile to write a
10120          * backup history file during recovery.
10121          */
10122         if (backup_started_in_recovery)
10123         {
10124                 /* use volatile pointer to prevent code rearrangement */
10125                 volatile XLogCtlData *xlogctl = XLogCtl;
10126                 XLogRecPtr      recptr;
10127
10128                 /*
10129                  * Check to see if all WAL replayed during online backup contain
10130                  * full-page writes.
10131                  */
10132                 SpinLockAcquire(&xlogctl->info_lck);
10133                 recptr = xlogctl->lastFpwDisableRecPtr;
10134                 SpinLockRelease(&xlogctl->info_lck);
10135
10136                 if (startpoint <= recptr)
10137                         ereport(ERROR,
10138                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10139                            errmsg("WAL generated with full_page_writes=off was replayed "
10140                                           "during online backup"),
10141                          errhint("This means that the backup being taken on the standby "
10142                                          "is corrupt and should not be used. "
10143                                  "Enable full_page_writes and run CHECKPOINT on the master, "
10144                                          "and then try an online backup again.")));
10145
10146
10147                 LWLockAcquire(ControlFileLock, LW_SHARED);
10148                 stoppoint = ControlFile->minRecoveryPoint;
10149                 stoptli = ControlFile->minRecoveryPointTLI;
10150                 LWLockRelease(ControlFileLock);
10151
10152                 if (stoptli_p)
10153                         *stoptli_p = stoptli;
10154                 return stoppoint;
10155         }
10156
10157         /*
10158          * Write the backup-end xlog record
10159          */
10160         rdata.data = (char *) (&startpoint);
10161         rdata.len = sizeof(startpoint);
10162         rdata.buffer = InvalidBuffer;
10163         rdata.next = NULL;
10164         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
10165         stoptli = ThisTimeLineID;
10166
10167         /*
10168          * Force a switch to a new xlog segment file, so that the backup is valid
10169          * as soon as archiver moves out the current segment file.
10170          */
10171         RequestXLogSwitch();
10172
10173         XLByteToPrevSeg(stoppoint, _logSegNo);
10174         XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
10175
10176         /* Use the log timezone here, not the session timezone */
10177         stamp_time = (pg_time_t) time(NULL);
10178         pg_strftime(strfbuf, sizeof(strfbuf),
10179                                 "%Y-%m-%d %H:%M:%S %Z",
10180                                 pg_localtime(&stamp_time, log_timezone));
10181
10182         /*
10183          * Write the backup history file
10184          */
10185         XLByteToSeg(startpoint, _logSegNo);
10186         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
10187                                                   (uint32) (startpoint % XLogSegSize));
10188         fp = AllocateFile(histfilepath, "w");
10189         if (!fp)
10190                 ereport(ERROR,
10191                                 (errcode_for_file_access(),
10192                                  errmsg("could not create file \"%s\": %m",
10193                                                 histfilepath)));
10194         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
10195                 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
10196         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
10197                         (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
10198         /* transfer remaining lines from label to history file */
10199         fprintf(fp, "%s", remaining);
10200         fprintf(fp, "STOP TIME: %s\n", strfbuf);
10201         if (fflush(fp) || ferror(fp) || FreeFile(fp))
10202                 ereport(ERROR,
10203                                 (errcode_for_file_access(),
10204                                  errmsg("could not write file \"%s\": %m",
10205                                                 histfilepath)));
10206
10207         /*
10208          * Clean out any no-longer-needed history files.  As a side effect, this
10209          * will post a .ready file for the newly created history file, notifying
10210          * the archiver that history file may be archived immediately.
10211          */
10212         CleanupBackupHistory();
10213
10214         /*
10215          * If archiving is enabled, wait for all the required WAL files to be
10216          * archived before returning. If archiving isn't enabled, the required WAL
10217          * needs to be transported via streaming replication (hopefully with
10218          * wal_keep_segments set high enough), or some more exotic mechanism like
10219          * polling and copying files from pg_xlog with script. We have no
10220          * knowledge of those mechanisms, so it's up to the user to ensure that he
10221          * gets all the required WAL.
10222          *
10223          * We wait until both the last WAL file filled during backup and the
10224          * history file have been archived, and assume that the alphabetic sorting
10225          * property of the WAL files ensures any earlier WAL files are safely
10226          * archived as well.
10227          *
10228          * We wait forever, since archive_command is supposed to work and we
10229          * assume the admin wanted his backup to work completely. If you don't
10230          * wish to wait, you can set statement_timeout.  Also, some notices are
10231          * issued to clue in anyone who might be doing this interactively.
10232          */
10233         if (waitforarchive && XLogArchivingActive())
10234         {
10235                 XLByteToPrevSeg(stoppoint, _logSegNo);
10236                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
10237
10238                 XLByteToSeg(startpoint, _logSegNo);
10239                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
10240                                                           (uint32) (startpoint % XLogSegSize));
10241
10242                 seconds_before_warning = 60;
10243                 waits = 0;
10244
10245                 while (XLogArchiveIsBusy(lastxlogfilename) ||
10246                            XLogArchiveIsBusy(histfilename))
10247                 {
10248                         CHECK_FOR_INTERRUPTS();
10249
10250                         if (!reported_waiting && waits > 5)
10251                         {
10252                                 ereport(NOTICE,
10253                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
10254                                 reported_waiting = true;
10255                         }
10256
10257                         pg_usleep(1000000L);
10258
10259                         if (++waits >= seconds_before_warning)
10260                         {
10261                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
10262                                 ereport(WARNING,
10263                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
10264                                                                 waits),
10265                                                  errhint("Check that your archive_command is executing properly.  "
10266                                                                  "pg_stop_backup can be canceled safely, "
10267                                                                  "but the database backup will not be usable without all the WAL segments.")));
10268                         }
10269                 }
10270
10271                 ereport(NOTICE,
10272                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
10273         }
10274         else if (waitforarchive)
10275                 ereport(NOTICE,
10276                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
10277
10278         /*
10279          * We're done.  As a convenience, return the ending WAL location.
10280          */
10281         if (stoptli_p)
10282                 *stoptli_p = stoptli;
10283         return stoppoint;
10284 }
10285
10286
10287 /*
10288  * do_pg_abort_backup: abort a running backup
10289  *
10290  * This does just the most basic steps of do_pg_stop_backup(), by taking the
10291  * system out of backup mode, thus making it a lot more safe to call from
10292  * an error handler.
10293  *
10294  * NB: This is only for aborting a non-exclusive backup that doesn't write
10295  * backup_label. A backup started with pg_stop_backup() needs to be finished
10296  * with pg_stop_backup().
10297  */
10298 void
10299 do_pg_abort_backup(void)
10300 {
10301         WALInsertSlotAcquire(true);
10302         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10303         XLogCtl->Insert.nonExclusiveBackups--;
10304
10305         if (!XLogCtl->Insert.exclusiveBackup &&
10306                 XLogCtl->Insert.nonExclusiveBackups == 0)
10307         {
10308                 XLogCtl->Insert.forcePageWrites = false;
10309         }
10310         WALInsertSlotRelease();
10311 }
10312
10313 /*
10314  * Get latest redo apply position.
10315  *
10316  * Exported to allow WALReceiver to read the pointer directly.
10317  */
10318 XLogRecPtr
10319 GetXLogReplayRecPtr(TimeLineID *replayTLI)
10320 {
10321         /* use volatile pointer to prevent code rearrangement */
10322         volatile XLogCtlData *xlogctl = XLogCtl;
10323         XLogRecPtr      recptr;
10324         TimeLineID      tli;
10325
10326         SpinLockAcquire(&xlogctl->info_lck);
10327         recptr = xlogctl->lastReplayedEndRecPtr;
10328         tli = xlogctl->lastReplayedTLI;
10329         SpinLockRelease(&xlogctl->info_lck);
10330
10331         if (replayTLI)
10332                 *replayTLI = tli;
10333         return recptr;
10334 }
10335
10336 /*
10337  * Get latest WAL insert pointer
10338  */
10339 XLogRecPtr
10340 GetXLogInsertRecPtr(void)
10341 {
10342         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
10343         uint64          current_bytepos;
10344
10345         SpinLockAcquire(&Insert->insertpos_lck);
10346         current_bytepos = Insert->CurrBytePos;
10347         SpinLockRelease(&Insert->insertpos_lck);
10348
10349         return XLogBytePosToRecPtr(current_bytepos);
10350 }
10351
10352 /*
10353  * Get latest WAL write pointer
10354  */
10355 XLogRecPtr
10356 GetXLogWriteRecPtr(void)
10357 {
10358         {
10359                 /* use volatile pointer to prevent code rearrangement */
10360                 volatile XLogCtlData *xlogctl = XLogCtl;
10361
10362                 SpinLockAcquire(&xlogctl->info_lck);
10363                 LogwrtResult = xlogctl->LogwrtResult;
10364                 SpinLockRelease(&xlogctl->info_lck);
10365         }
10366
10367         return LogwrtResult.Write;
10368 }
10369
10370 /*
10371  * Returns the redo pointer of the last checkpoint or restartpoint. This is
10372  * the oldest point in WAL that we still need, if we have to restart recovery.
10373  */
10374 void
10375 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
10376 {
10377         LWLockAcquire(ControlFileLock, LW_SHARED);
10378         *oldrecptr = ControlFile->checkPointCopy.redo;
10379         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
10380         LWLockRelease(ControlFileLock);
10381 }
10382
10383 /*
10384  * read_backup_label: check to see if a backup_label file is present
10385  *
10386  * If we see a backup_label during recovery, we assume that we are recovering
10387  * from a backup dump file, and we therefore roll forward from the checkpoint
10388  * identified by the label file, NOT what pg_control says.      This avoids the
10389  * problem that pg_control might have been archived one or more checkpoints
10390  * later than the start of the dump, and so if we rely on it as the start
10391  * point, we will fail to restore a consistent database state.
10392  *
10393  * Returns TRUE if a backup_label was found (and fills the checkpoint
10394  * location and its REDO location into *checkPointLoc and RedoStartLSN,
10395  * respectively); returns FALSE if not. If this backup_label came from a
10396  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
10397  * was created during recovery, *backupFromStandby is set to TRUE.
10398  */
10399 static bool
10400 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
10401                                   bool *backupFromStandby)
10402 {
10403         char            startxlogfilename[MAXFNAMELEN];
10404         TimeLineID      tli;
10405         FILE       *lfp;
10406         char            ch;
10407         char            backuptype[20];
10408         char            backupfrom[20];
10409         uint32          hi,
10410                                 lo;
10411
10412         *backupEndRequired = false;
10413         *backupFromStandby = false;
10414
10415         /*
10416          * See if label file is present
10417          */
10418         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10419         if (!lfp)
10420         {
10421                 if (errno != ENOENT)
10422                         ereport(FATAL,
10423                                         (errcode_for_file_access(),
10424                                          errmsg("could not read file \"%s\": %m",
10425                                                         BACKUP_LABEL_FILE)));
10426                 return false;                   /* it's not there, all is fine */
10427         }
10428
10429         /*
10430          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
10431          * is pretty crude, but we are not expecting any variability in the file
10432          * format).
10433          */
10434         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
10435                            &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
10436                 ereport(FATAL,
10437                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10438                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10439         RedoStartLSN = ((uint64) hi) << 32 | lo;
10440         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
10441                            &hi, &lo, &ch) != 3 || ch != '\n')
10442                 ereport(FATAL,
10443                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10444                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10445         *checkPointLoc = ((uint64) hi) << 32 | lo;
10446
10447         /*
10448          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
10449          * from an older backup anyway, but since the information on it is not
10450          * strictly required, don't error out if it's missing for some reason.
10451          */
10452         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
10453         {
10454                 if (strcmp(backuptype, "streamed") == 0)
10455                         *backupEndRequired = true;
10456         }
10457
10458         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
10459         {
10460                 if (strcmp(backupfrom, "standby") == 0)
10461                         *backupFromStandby = true;
10462         }
10463
10464         if (ferror(lfp) || FreeFile(lfp))
10465                 ereport(FATAL,
10466                                 (errcode_for_file_access(),
10467                                  errmsg("could not read file \"%s\": %m",
10468                                                 BACKUP_LABEL_FILE)));
10469
10470         return true;
10471 }
10472
10473 /*
10474  * Error context callback for errors occurring during rm_redo().
10475  */
10476 static void
10477 rm_redo_error_callback(void *arg)
10478 {
10479         XLogRecord *record = (XLogRecord *) arg;
10480         StringInfoData buf;
10481
10482         initStringInfo(&buf);
10483         RmgrTable[record->xl_rmid].rm_desc(&buf,
10484                                                                            record->xl_info,
10485                                                                            XLogRecGetData(record));
10486
10487         /* don't bother emitting empty description */
10488         if (buf.len > 0)
10489                 errcontext("xlog redo %s", buf.data);
10490
10491         pfree(buf.data);
10492 }
10493
10494 /*
10495  * BackupInProgress: check if online backup mode is active
10496  *
10497  * This is done by checking for existence of the "backup_label" file.
10498  */
10499 bool
10500 BackupInProgress(void)
10501 {
10502         struct stat stat_buf;
10503
10504         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
10505 }
10506
10507 /*
10508  * CancelBackup: rename the "backup_label" file to cancel backup mode
10509  *
10510  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
10511  * Note that this will render an online backup in progress useless.
10512  * To correctly finish an online backup, pg_stop_backup must be called.
10513  */
10514 void
10515 CancelBackup(void)
10516 {
10517         struct stat stat_buf;
10518
10519         /* if the file is not there, return */
10520         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
10521                 return;
10522
10523         /* remove leftover file from previously canceled backup if it exists */
10524         unlink(BACKUP_LABEL_OLD);
10525
10526         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
10527         {
10528                 ereport(LOG,
10529                                 (errmsg("online backup mode canceled"),
10530                                  errdetail("\"%s\" was renamed to \"%s\".",
10531                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10532         }
10533         else
10534         {
10535                 ereport(WARNING,
10536                                 (errcode_for_file_access(),
10537                                  errmsg("online backup mode was not canceled"),
10538                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
10539                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10540         }
10541 }
10542
10543 /*
10544  * Read the XLOG page containing RecPtr into readBuf (if not read already).
10545  * Returns number of bytes read, if the page is read successfully, or -1
10546  * in case of errors.  When errors occur, they are ereport'ed, but only
10547  * if they have not been previously reported.
10548  *
10549  * This is responsible for restoring files from archive as needed, as well
10550  * as for waiting for the requested WAL record to arrive in standby mode.
10551  *
10552  * 'emode' specifies the log level used for reporting "file not found" or
10553  * "end of WAL" situations in archive recovery, or in standby mode when a
10554  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
10555  * false in those situations, on higher log levels the ereport() won't
10556  * return.
10557  *
10558  * In standby mode, if after a successful return of XLogPageRead() the
10559  * caller finds the record it's interested in to be broken, it should
10560  * ereport the error with the level determined by
10561  * emode_for_corrupt_record(), and then set lastSourceFailed
10562  * and call XLogPageRead() again with the same arguments. This lets
10563  * XLogPageRead() to try fetching the record from another source, or to
10564  * sleep and retry.
10565  */
10566 static int
10567 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
10568                          XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
10569 {
10570         XLogPageReadPrivate *private =
10571         (XLogPageReadPrivate *) xlogreader->private_data;
10572         int                     emode = private->emode;
10573         uint32          targetPageOff;
10574         XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
10575
10576         XLByteToSeg(targetPagePtr, targetSegNo);
10577         targetPageOff = targetPagePtr % XLogSegSize;
10578
10579         /*
10580          * See if we need to switch to a new segment because the requested record
10581          * is not in the currently open one.
10582          */
10583         if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
10584         {
10585                 /*
10586                  * Request a restartpoint if we've replayed too much xlog since the
10587                  * last one.
10588                  */
10589                 if (StandbyModeRequested && bgwriterLaunched)
10590                 {
10591                         if (XLogCheckpointNeeded(readSegNo))
10592                         {
10593                                 (void) GetRedoRecPtr();
10594                                 if (XLogCheckpointNeeded(readSegNo))
10595                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
10596                         }
10597                 }
10598
10599                 close(readFile);
10600                 readFile = -1;
10601                 readSource = 0;
10602         }
10603
10604         XLByteToSeg(targetPagePtr, readSegNo);
10605
10606 retry:
10607         /* See if we need to retrieve more data */
10608         if (readFile < 0 ||
10609                 (readSource == XLOG_FROM_STREAM &&
10610                  receivedUpto < targetPagePtr + reqLen))
10611         {
10612                 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
10613                                                                                  private->randAccess,
10614                                                                                  private->fetching_ckpt,
10615                                                                                  targetRecPtr))
10616                 {
10617                         if (readFile >= 0)
10618                                 close(readFile);
10619                         readFile = -1;
10620                         readLen = 0;
10621                         readSource = 0;
10622
10623                         return -1;
10624                 }
10625         }
10626
10627         /*
10628          * At this point, we have the right segment open and if we're streaming we
10629          * know the requested record is in it.
10630          */
10631         Assert(readFile != -1);
10632
10633         /*
10634          * If the current segment is being streamed from master, calculate how
10635          * much of the current page we have received already. We know the
10636          * requested record has been received, but this is for the benefit of
10637          * future calls, to allow quick exit at the top of this function.
10638          */
10639         if (readSource == XLOG_FROM_STREAM)
10640         {
10641                 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
10642                         readLen = XLOG_BLCKSZ;
10643                 else
10644                         readLen = receivedUpto % XLogSegSize - targetPageOff;
10645         }
10646         else
10647                 readLen = XLOG_BLCKSZ;
10648
10649         /* Read the requested page */
10650         readOff = targetPageOff;
10651         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
10652         {
10653                 char            fname[MAXFNAMELEN];
10654
10655                 XLogFileName(fname, curFileTLI, readSegNo);
10656                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10657                                 (errcode_for_file_access(),
10658                                  errmsg("could not seek in log segment %s to offset %u: %m",
10659                                                 fname, readOff)));
10660                 goto next_record_is_invalid;
10661         }
10662
10663         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
10664         {
10665                 char            fname[MAXFNAMELEN];
10666
10667                 XLogFileName(fname, curFileTLI, readSegNo);
10668                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10669                                 (errcode_for_file_access(),
10670                                  errmsg("could not read from log segment %s, offset %u: %m",
10671                                                 fname, readOff)));
10672                 goto next_record_is_invalid;
10673         }
10674
10675         Assert(targetSegNo == readSegNo);
10676         Assert(targetPageOff == readOff);
10677         Assert(reqLen <= readLen);
10678
10679         *readTLI = curFileTLI;
10680         return readLen;
10681
10682 next_record_is_invalid:
10683         lastSourceFailed = true;
10684
10685         if (readFile >= 0)
10686                 close(readFile);
10687         readFile = -1;
10688         readLen = 0;
10689         readSource = 0;
10690
10691         /* In standby-mode, keep trying */
10692         if (StandbyMode)
10693                 goto retry;
10694         else
10695                 return -1;
10696 }
10697
10698 /*
10699  * Open the WAL segment containing WAL position 'RecPtr'.
10700  *
10701  * The segment can be fetched via restore_command, or via walreceiver having
10702  * streamed the record, or it can already be present in pg_xlog. Checking
10703  * pg_xlog is mainly for crash recovery, but it will be polled in standby mode
10704  * too, in case someone copies a new segment directly to pg_xlog. That is not
10705  * documented or recommended, though.
10706  *
10707  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
10708  * prepare to read WAL starting from RedoStartLSN after this.
10709  *
10710  * 'RecPtr' might not point to the beginning of the record we're interested
10711  * in, it might also point to the page or segment header. In that case,
10712  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
10713  * used to decide which timeline to stream the requested WAL from.
10714  *
10715  * If the record is not immediately available, the function returns false
10716  * if we're not in standby mode. In standby mode, waits for it to become
10717  * available.
10718  *
10719  * When the requested record becomes available, the function opens the file
10720  * containing it (if not open already), and returns true. When end of standby
10721  * mode is triggered by the user, and there is no more WAL available, returns
10722  * false.
10723  */
10724 static bool
10725 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
10726                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
10727 {
10728         static pg_time_t last_fail_time = 0;
10729         pg_time_t       now;
10730
10731         /*-------
10732          * Standby mode is implemented by a state machine:
10733          *
10734          * 1. Read from archive (XLOG_FROM_ARCHIVE)
10735          * 2. Read from pg_xlog (XLOG_FROM_PG_XLOG)
10736          * 3. Check trigger file
10737          * 4. Read from primary server via walreceiver (XLOG_FROM_STREAM)
10738          * 5. Rescan timelines
10739          * 6. Sleep 5 seconds, and loop back to 1.
10740          *
10741          * Failure to read from the current source advances the state machine to
10742          * the next state. In addition, successfully reading a file from pg_xlog
10743          * moves the state machine from state 2 back to state 1 (we always prefer
10744          * files in the archive over files in pg_xlog).
10745          *
10746          * 'currentSource' indicates the current state. There are no currentSource
10747          * values for "check trigger", "rescan timelines", and "sleep" states,
10748          * those actions are taken when reading from the previous source fails, as
10749          * part of advancing to the next state.
10750          *-------
10751          */
10752         if (!InArchiveRecovery)
10753                 currentSource = XLOG_FROM_PG_XLOG;
10754         else if (currentSource == 0)
10755                 currentSource = XLOG_FROM_ARCHIVE;
10756
10757         for (;;)
10758         {
10759                 int                     oldSource = currentSource;
10760
10761                 /*
10762                  * First check if we failed to read from the current source, and
10763                  * advance the state machine if so. The failure to read might've
10764                  * happened outside this function, e.g when a CRC check fails on a
10765                  * record, or within this loop.
10766                  */
10767                 if (lastSourceFailed)
10768                 {
10769                         switch (currentSource)
10770                         {
10771                                 case XLOG_FROM_ARCHIVE:
10772                                         currentSource = XLOG_FROM_PG_XLOG;
10773                                         break;
10774
10775                                 case XLOG_FROM_PG_XLOG:
10776
10777                                         /*
10778                                          * Check to see if the trigger file exists. Note that we
10779                                          * do this only after failure, so when you create the
10780                                          * trigger file, we still finish replaying as much as we
10781                                          * can from archive and pg_xlog before failover.
10782                                          */
10783                                         if (StandbyMode && CheckForStandbyTrigger())
10784                                         {
10785                                                 ShutdownWalRcv();
10786                                                 return false;
10787                                         }
10788
10789                                         /*
10790                                          * Not in standby mode, and we've now tried the archive
10791                                          * and pg_xlog.
10792                                          */
10793                                         if (!StandbyMode)
10794                                                 return false;
10795
10796                                         /*
10797                                          * If primary_conninfo is set, launch walreceiver to try
10798                                          * to stream the missing WAL.
10799                                          *
10800                                          * If fetching_ckpt is TRUE, RecPtr points to the initial
10801                                          * checkpoint location. In that case, we use RedoStartLSN
10802                                          * as the streaming start position instead of RecPtr, so
10803                                          * that when we later jump backwards to start redo at
10804                                          * RedoStartLSN, we will have the logs streamed already.
10805                                          */
10806                                         if (PrimaryConnInfo)
10807                                         {
10808                                                 XLogRecPtr      ptr;
10809                                                 TimeLineID      tli;
10810
10811                                                 if (fetching_ckpt)
10812                                                 {
10813                                                         ptr = RedoStartLSN;
10814                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
10815                                                 }
10816                                                 else
10817                                                 {
10818                                                         ptr = tliRecPtr;
10819                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
10820
10821                                                         if (curFileTLI > 0 && tli < curFileTLI)
10822                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
10823                                                                          (uint32) (ptr >> 32), (uint32) ptr,
10824                                                                          tli, curFileTLI);
10825                                                 }
10826                                                 curFileTLI = tli;
10827                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo);
10828                                                 receivedUpto = 0;
10829                                         }
10830
10831                                         /*
10832                                          * Move to XLOG_FROM_STREAM state in either case. We'll
10833                                          * get immediate failure if we didn't launch walreceiver,
10834                                          * and move on to the next state.
10835                                          */
10836                                         currentSource = XLOG_FROM_STREAM;
10837                                         break;
10838
10839                                 case XLOG_FROM_STREAM:
10840
10841                                         /*
10842                                          * Failure while streaming. Most likely, we got here
10843                                          * because streaming replication was terminated, or
10844                                          * promotion was triggered. But we also get here if we
10845                                          * find an invalid record in the WAL streamed from master,
10846                                          * in which case something is seriously wrong. There's
10847                                          * little chance that the problem will just go away, but
10848                                          * PANIC is not good for availability either, especially
10849                                          * in hot standby mode. So, we treat that the same as
10850                                          * disconnection, and retry from archive/pg_xlog again.
10851                                          * The WAL in the archive should be identical to what was
10852                                          * streamed, so it's unlikely that it helps, but one can
10853                                          * hope...
10854                                          */
10855
10856                                         /*
10857                                          * Before we leave XLOG_FROM_STREAM state, make sure that
10858                                          * walreceiver is not active, so that it won't overwrite
10859                                          * WAL that we restore from archive.
10860                                          */
10861                                         if (WalRcvStreaming())
10862                                                 ShutdownWalRcv();
10863
10864                                         /*
10865                                          * Before we sleep, re-scan for possible new timelines if
10866                                          * we were requested to recover to the latest timeline.
10867                                          */
10868                                         if (recoveryTargetIsLatest)
10869                                         {
10870                                                 if (rescanLatestTimeLine())
10871                                                 {
10872                                                         currentSource = XLOG_FROM_ARCHIVE;
10873                                                         break;
10874                                                 }
10875                                         }
10876
10877                                         /*
10878                                          * XLOG_FROM_STREAM is the last state in our state
10879                                          * machine, so we've exhausted all the options for
10880                                          * obtaining the requested WAL. We're going to loop back
10881                                          * and retry from the archive, but if it hasn't been long
10882                                          * since last attempt, sleep 5 seconds to avoid
10883                                          * busy-waiting.
10884                                          */
10885                                         now = (pg_time_t) time(NULL);
10886                                         if ((now - last_fail_time) < 5)
10887                                         {
10888                                                 pg_usleep(1000000L * (5 - (now - last_fail_time)));
10889                                                 now = (pg_time_t) time(NULL);
10890                                         }
10891                                         last_fail_time = now;
10892                                         currentSource = XLOG_FROM_ARCHIVE;
10893                                         break;
10894
10895                                 default:
10896                                         elog(ERROR, "unexpected WAL source %d", currentSource);
10897                         }
10898                 }
10899                 else if (currentSource == XLOG_FROM_PG_XLOG)
10900                 {
10901                         /*
10902                          * We just successfully read a file in pg_xlog. We prefer files in
10903                          * the archive over ones in pg_xlog, so try the next file again
10904                          * from the archive first.
10905                          */
10906                         if (InArchiveRecovery)
10907                                 currentSource = XLOG_FROM_ARCHIVE;
10908                 }
10909
10910                 if (currentSource != oldSource)
10911                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
10912                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
10913                                  lastSourceFailed ? "failure" : "success");
10914
10915                 /*
10916                  * We've now handled possible failure. Try to read from the chosen
10917                  * source.
10918                  */
10919                 lastSourceFailed = false;
10920
10921                 switch (currentSource)
10922                 {
10923                         case XLOG_FROM_ARCHIVE:
10924                         case XLOG_FROM_PG_XLOG:
10925                                 /* Close any old file we might have open. */
10926                                 if (readFile >= 0)
10927                                 {
10928                                         close(readFile);
10929                                         readFile = -1;
10930                                 }
10931                                 /* Reset curFileTLI if random fetch. */
10932                                 if (randAccess)
10933                                         curFileTLI = 0;
10934
10935                                 /*
10936                                  * Try to restore the file from archive, or read an existing
10937                                  * file from pg_xlog.
10938                                  */
10939                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, currentSource);
10940                                 if (readFile >= 0)
10941                                         return true;    /* success! */
10942
10943                                 /*
10944                                  * Nope, not found in archive or pg_xlog.
10945                                  */
10946                                 lastSourceFailed = true;
10947                                 break;
10948
10949                         case XLOG_FROM_STREAM:
10950                                 {
10951                                         bool            havedata;
10952
10953                                         /*
10954                                          * Check if WAL receiver is still active.
10955                                          */
10956                                         if (!WalRcvStreaming())
10957                                         {
10958                                                 lastSourceFailed = true;
10959                                                 break;
10960                                         }
10961
10962                                         /*
10963                                          * Walreceiver is active, so see if new data has arrived.
10964                                          *
10965                                          * We only advance XLogReceiptTime when we obtain fresh
10966                                          * WAL from walreceiver and observe that we had already
10967                                          * processed everything before the most recent "chunk"
10968                                          * that it flushed to disk.  In steady state where we are
10969                                          * keeping up with the incoming data, XLogReceiptTime will
10970                                          * be updated on each cycle. When we are behind,
10971                                          * XLogReceiptTime will not advance, so the grace time
10972                                          * allotted to conflicting queries will decrease.
10973                                          */
10974                                         if (RecPtr < receivedUpto)
10975                                                 havedata = true;
10976                                         else
10977                                         {
10978                                                 XLogRecPtr      latestChunkStart;
10979
10980                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
10981                                                 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
10982                                                 {
10983                                                         havedata = true;
10984                                                         if (latestChunkStart <= RecPtr)
10985                                                         {
10986                                                                 XLogReceiptTime = GetCurrentTimestamp();
10987                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
10988                                                         }
10989                                                 }
10990                                                 else
10991                                                         havedata = false;
10992                                         }
10993                                         if (havedata)
10994                                         {
10995                                                 /*
10996                                                  * Great, streamed far enough.  Open the file if it's
10997                                                  * not open already.  Also read the timeline history
10998                                                  * file if we haven't initialized timeline history
10999                                                  * yet; it should be streamed over and present in
11000                                                  * pg_xlog by now.      Use XLOG_FROM_STREAM so that
11001                                                  * source info is set correctly and XLogReceiptTime
11002                                                  * isn't changed.
11003                                                  */
11004                                                 if (readFile < 0)
11005                                                 {
11006                                                         if (!expectedTLEs)
11007                                                                 expectedTLEs = readTimeLineHistory(receiveTLI);
11008                                                         readFile = XLogFileRead(readSegNo, PANIC,
11009                                                                                                         receiveTLI,
11010                                                                                                         XLOG_FROM_STREAM, false);
11011                                                         Assert(readFile >= 0);
11012                                                 }
11013                                                 else
11014                                                 {
11015                                                         /* just make sure source info is correct... */
11016                                                         readSource = XLOG_FROM_STREAM;
11017                                                         XLogReceiptSource = XLOG_FROM_STREAM;
11018                                                         return true;
11019                                                 }
11020                                                 break;
11021                                         }
11022
11023                                         /*
11024                                          * Data not here yet. Check for trigger, then wait for
11025                                          * walreceiver to wake us up when new WAL arrives.
11026                                          */
11027                                         if (CheckForStandbyTrigger())
11028                                         {
11029                                                 /*
11030                                                  * Note that we don't "return false" immediately here.
11031                                                  * After being triggered, we still want to replay all
11032                                                  * the WAL that was already streamed. It's in pg_xlog
11033                                                  * now, so we just treat this as a failure, and the
11034                                                  * state machine will move on to replay the streamed
11035                                                  * WAL from pg_xlog, and then recheck the trigger and
11036                                                  * exit replay.
11037                                                  */
11038                                                 lastSourceFailed = true;
11039                                                 break;
11040                                         }
11041
11042                                         /*
11043                                          * Wait for more WAL to arrive. Time out after 5 seconds,
11044                                          * like when polling the archive, to react to a trigger
11045                                          * file promptly.
11046                                          */
11047                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
11048                                                           WL_LATCH_SET | WL_TIMEOUT,
11049                                                           5000L);
11050                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
11051                                         break;
11052                                 }
11053
11054                         default:
11055                                 elog(ERROR, "unexpected WAL source %d", currentSource);
11056                 }
11057
11058                 /*
11059                  * This possibly-long loop needs to handle interrupts of startup
11060                  * process.
11061                  */
11062                 HandleStartupProcInterrupts();
11063         } while (StandbyMode);
11064
11065         return false;
11066 }
11067
11068 /*
11069  * Determine what log level should be used to report a corrupt WAL record
11070  * in the current WAL page, previously read by XLogPageRead().
11071  *
11072  * 'emode' is the error mode that would be used to report a file-not-found
11073  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
11074  * we're retrying the exact same record that we've tried previously, only
11075  * complain the first time to keep the noise down.      However, we only do when
11076  * reading from pg_xlog, because we don't expect any invalid records in archive
11077  * or in records streamed from master. Files in the archive should be complete,
11078  * and we should never hit the end of WAL because we stop and wait for more WAL
11079  * to arrive before replaying it.
11080  *
11081  * NOTE: This function remembers the RecPtr value it was last called with,
11082  * to suppress repeated messages about the same record. Only call this when
11083  * you are about to ereport(), or you might cause a later message to be
11084  * erroneously suppressed.
11085  */
11086 static int
11087 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
11088 {
11089         static XLogRecPtr lastComplaint = 0;
11090
11091         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
11092         {
11093                 if (RecPtr == lastComplaint)
11094                         emode = DEBUG1;
11095                 else
11096                         lastComplaint = RecPtr;
11097         }
11098         return emode;
11099 }
11100
11101 /*
11102  * Check to see whether the user-specified trigger file exists and whether a
11103  * promote request has arrived.  If either condition holds, return true.
11104  */
11105 static bool
11106 CheckForStandbyTrigger(void)
11107 {
11108         struct stat stat_buf;
11109         static bool triggered = false;
11110
11111         if (triggered)
11112                 return true;
11113
11114         if (IsPromoteTriggered())
11115         {
11116                 /*
11117                  * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
11118                  * signal handler. It now leaves the file in place and lets the
11119                  * Startup process do the unlink. This allows Startup to know whether
11120                  * it should create a full checkpoint before starting up (fallback
11121                  * mode). Fast promotion takes precedence.
11122                  */
11123                 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11124                 {
11125                         unlink(PROMOTE_SIGNAL_FILE);
11126                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11127                         fast_promote = true;
11128                 }
11129                 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11130                 {
11131                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11132                         fast_promote = false;
11133                 }
11134
11135                 ereport(LOG, (errmsg("received promote request")));
11136
11137                 ResetPromoteTriggered();
11138                 triggered = true;
11139                 return true;
11140         }
11141
11142         if (TriggerFile == NULL)
11143                 return false;
11144
11145         if (stat(TriggerFile, &stat_buf) == 0)
11146         {
11147                 ereport(LOG,
11148                                 (errmsg("trigger file found: %s", TriggerFile)));
11149                 unlink(TriggerFile);
11150                 triggered = true;
11151                 fast_promote = true;
11152                 return true;
11153         }
11154         return false;
11155 }
11156
11157 /*
11158  * Check to see if a promote request has arrived. Should be
11159  * called by postmaster after receiving SIGUSR1.
11160  */
11161 bool
11162 CheckPromoteSignal(void)
11163 {
11164         struct stat stat_buf;
11165
11166         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
11167                 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11168                 return true;
11169
11170         return false;
11171 }
11172
11173 /*
11174  * Wake up startup process to replay newly arrived WAL, or to notice that
11175  * failover has been requested.
11176  */
11177 void
11178 WakeupRecovery(void)
11179 {
11180         SetLatch(&XLogCtl->recoveryWakeupLatch);
11181 }
11182
11183 /*
11184  * Update the WalWriterSleeping flag.
11185  */
11186 void
11187 SetWalWriterSleeping(bool sleeping)
11188 {
11189         /* use volatile pointer to prevent code rearrangement */
11190         volatile XLogCtlData *xlogctl = XLogCtl;
11191
11192         SpinLockAcquire(&xlogctl->info_lck);
11193         xlogctl->WalWriterSleeping = sleeping;
11194         SpinLockRelease(&xlogctl->info_lck);
11195 }