granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlog.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <signal.h>
  19 #include <time.h>
  20 #include <fcntl.h>
  21 #include <sys/stat.h>
  22 #include <sys/time.h>
  23 #include <sys/wait.h>
  24 #include <unistd.h>
  25
  26 #include "access/clog.h"
  27 #include "access/multixact.h"
  28 #include "access/subtrans.h"
  29 #include "access/transam.h"
  30 #include "access/tuptoaster.h"
  31 #include "access/twophase.h"
  32 #include "access/xact.h"
  33 #include "access/xlog_internal.h"
  34 #include "access/xlogutils.h"
  35 #include "catalog/catversion.h"
  36 #include "catalog/pg_control.h"
  37 #include "catalog/pg_database.h"
  38 #include "libpq/pqsignal.h"
  39 #include "miscadmin.h"
  40 #include "pgstat.h"
  41 #include "postmaster/bgwriter.h"
  42 #include "postmaster/startup.h"
  43 #include "replication/walreceiver.h"
  44 #include "replication/walsender.h"
  45 #include "storage/bufmgr.h"
  46 #include "storage/fd.h"
  47 #include "storage/ipc.h"
  48 #include "storage/latch.h"
  49 #include "storage/pmsignal.h"
  50 #include "storage/predicate.h"
  51 #include "storage/proc.h"
  52 #include "storage/procarray.h"
  53 #include "storage/reinit.h"
  54 #include "storage/smgr.h"
  55 #include "storage/spin.h"
  56 #include "utils/builtins.h"
  57 #include "utils/guc.h"
  58 #include "utils/ps_status.h"
  59 #include "utils/relmapper.h"
  60 #include "utils/snapmgr.h"
  61 #include "utils/timestamp.h"
  62 #include "pg_trace.h"
  63
  64
  65 /* File path names (all relative to $PGDATA) */
  66 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  67 #define RECOVERY_COMMAND_DONE   "recovery.done"
  68 #define PROMOTE_SIGNAL_FILE "promote"
  69
  70
  71 /* User-settable parameters */
  72 int                     CheckPointSegments = 3;
  73 int                     wal_keep_segments = 0;
  74 int                     XLOGbuffers = -1;
  75 int                     XLogArchiveTimeout = 0;
  76 bool            XLogArchiveMode = false;
  77 char       *XLogArchiveCommand = NULL;
  78 bool            EnableHotStandby = false;
  79 bool            fullPageWrites = true;
  80 bool            log_checkpoints = false;
  81 int                     sync_method = DEFAULT_SYNC_METHOD;
  82 int                     wal_level = WAL_LEVEL_MINIMAL;
  83
  84 #ifdef WAL_DEBUG
  85 bool            XLOG_DEBUG = false;
  86 #endif
  87
  88 /*
  89  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
  90  * When we are done with an old XLOG segment file, we will recycle it as a
  91  * future XLOG segment as long as there aren't already XLOGfileslop future
  92  * segments; else we'll delete it.  This could be made a separate GUC
  93  * variable, but at present I think it's sufficient to hardwire it as
  94  * 2*CheckPointSegments+1.      Under normal conditions, a checkpoint will free
  95  * no more than 2*CheckPointSegments log segments, and we want to recycle all
  96  * of them; the +1 allows boundary cases to happen without wasting a
  97  * delete/create-segment cycle.
  98  */
  99 #define XLOGfileslop    (2*CheckPointSegments + 1)
 100
  101 /*
  102  * GUC support
      *
      * wal_level_options pairs each textual name with its WAL_LEVEL_* constant
      * (presumably the values accepted for the wal_level setting declared
      * above -- confirm against the GUC tables).  The {NULL, 0, false} entry
      * terminates the table.
      *
      * NOTE(review): the third member is false in every entry; its meaning is
      * defined with struct config_enum_entry, not in this file.
  103  */
  104 const struct config_enum_entry wal_level_options[] = {
  105         {"minimal", WAL_LEVEL_MINIMAL, false},
  106         {"archive", WAL_LEVEL_ARCHIVE, false},
  107         {"hot_standby", WAL_LEVEL_HOT_STANDBY, false},
  108         {NULL, 0, false}
  109 };
  110
      /*
       * Name/constant pairs for the WAL sync methods.  Only "fsync" is listed
       * unconditionally; every other entry is compiled in only when the
       * corresponding HAVE_... / OPEN_..._FLAG macro is defined, so the set of
       * entries differs between builds.  The {NULL, 0, false} entry terminates
       * the table.
       */
  111 const struct config_enum_entry sync_method_options[] = {
  112         {"fsync", SYNC_METHOD_FSYNC, false},
  113 #ifdef HAVE_FSYNC_WRITETHROUGH
  114         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
  115 #endif
  116 #ifdef HAVE_FDATASYNC
  117         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
  118 #endif
  119 #ifdef OPEN_SYNC_FLAG
  120         {"open_sync", SYNC_METHOD_OPEN, false},
  121 #endif
  122 #ifdef OPEN_DATASYNC_FLAG
  123         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
  124 #endif
  125         {NULL, 0, false}
  126 };
 127
 128 /*
 129  * Statistics for current checkpoint are collected in this global struct.
  130  * Because only the checkpointer or a stand-alone backend can perform
  131  * checkpoints, this will be unused in normal backends.
 132  */
 133 CheckpointStatsData CheckpointStats;
 134
 135 /*
 136  * ThisTimeLineID will be same in all backends --- it identifies current
 137  * WAL timeline for the database system.
 138  */
 139 TimeLineID      ThisTimeLineID = 0;
 140
 141 /*
 142  * Are we doing recovery from XLOG?
 143  *
 144  * This is only ever true in the startup process; it should be read as meaning
 145  * "this process is replaying WAL records", rather than "the system is in
 146  * recovery mode".  It should be examined primarily by functions that need
 147  * to act differently when called from a WAL redo function (e.g., to skip WAL
 148  * logging).  To check whether the system is in recovery regardless of which
 149  * process you're running in, use RecoveryInProgress() but only after shared
 150  * memory startup and lock initialization.
 151  */
 152 bool            InRecovery = false;
 153
 154 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
 155 HotStandbyState standbyState = STANDBY_DISABLED;
 156
 157 static XLogRecPtr LastRec;
 158
 159 /*
 160  * During recovery, lastFullPageWrites keeps track of full_page_writes that
 161  * the replayed WAL records indicate. It's initialized with full_page_writes
 162  * that the recovery starting checkpoint record indicates, and then updated
 163  * each time XLOG_FPW_CHANGE record is replayed.
 164  */
 165 static bool lastFullPageWrites;
 166
 167 /*
 168  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 169  * known, need to check the shared state".
 170  */
 171 static bool LocalRecoveryInProgress = true;
 172
 173 /*
 174  * Local copy of SharedHotStandbyActive variable. False actually means "not
 175  * known, need to check the shared state".
 176  */
 177 static bool LocalHotStandbyActive = false;
 178
 179 /*
 180  * Local state for XLogInsertAllowed():
 181  *              1: unconditionally allowed to insert XLOG
 182  *              0: unconditionally not allowed to insert XLOG
 183  *              -1: must check RecoveryInProgress(); disallow until it is false
 184  * Most processes start with -1 and transition to 1 after seeing that recovery
 185  * is not in progress.  But we can also force the value for special cases.
 186  * The coding in XLogInsertAllowed() depends on the first two of these states
 187  * being numerically the same as bool true and false.
 188  */
 189 static int      LocalXLogInsertAllowed = -1;
 190
 191 /* Are we recovering using offline XLOG archives? */
 192 static bool InArchiveRecovery = false;
 193
 194 /* Was the last xlog file restored from archive, or local? */
 195 static bool restoredFromArchive = false;
 196
 197 /* options taken from recovery.conf for archive recovery */
 198 static char *recoveryRestoreCommand = NULL;
 199 static char *recoveryEndCommand = NULL;
 200 static char *archiveCleanupCommand = NULL;
 201 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
 202 static bool recoveryTargetInclusive = true;
 203 static bool recoveryPauseAtTarget = true;
 204 static TransactionId recoveryTargetXid;
 205 static TimestampTz recoveryTargetTime;
 206 static char *recoveryTargetName;
 207
 208 /* options taken from recovery.conf for XLOG streaming */
 209 static bool StandbyMode = false;
 210 static char *PrimaryConnInfo = NULL;
 211 static char *TriggerFile = NULL;
 212
 213 /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
 214 static TransactionId recoveryStopXid;
 215 static TimestampTz recoveryStopTime;
 216 static char recoveryStopName[MAXFNAMELEN];
 217 static bool recoveryStopAfter;
 218
 219 /*
 220  * During normal operation, the only timeline we care about is ThisTimeLineID.
 221  * During recovery, however, things are more complicated.  To simplify life
 222  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 223  * scan through the WAL history (that is, it is the line that was active when
 224  * the currently-scanned WAL record was generated).  We also need these
 225  * timeline values:
 226  *
 227  * recoveryTargetTLI: the desired timeline that we want to end in.
 228  *
 229  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 230  *
 231  * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
 232  * its known parents, newest first (so recoveryTargetTLI is always the
 233  * first list member).  Only these TLIs are expected to be seen in the WAL
 234  * segments we read, and indeed only these TLIs will be considered as
 235  * candidate WAL files to open at all.
 236  *
 237  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 238  * (This is not necessarily the same as ThisTimeLineID, because we could
 239  * be scanning data that was copied from an ancestor timeline when the current
 240  * file was created.)  During a sequential scan we do not allow this value
 241  * to decrease.
 242  */
 243 static TimeLineID recoveryTargetTLI;
 244 static bool recoveryTargetIsLatest = false;
 245 static List *expectedTLIs;
 246 static TimeLineID curFileTLI;
 247
 248 /*
 249  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 250  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 251  * end+1 of the last record, and is reset when we end a top-level transaction,
 252  * or start a new one; so it can be used to tell if the current transaction has
 253  * created any XLOG records.
 254  */
 255 static XLogRecPtr ProcLastRecPtr = {0, 0};
 256
 257 XLogRecPtr      XactLastRecEnd = {0, 0};
 258
 259 /*
 260  * RedoRecPtr is this backend's local copy of the REDO record pointer
 261  * (which is almost but not quite the same as a pointer to the most recent
 262  * CHECKPOINT record).  We update this from the shared-memory copy,
 263  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 264  * hold the Insert lock).  See XLogInsert for details.  We are also allowed
 265  * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
 266  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 267  * InitXLOGAccess.
 268  */
 269 static XLogRecPtr RedoRecPtr;
 270
 271 /*
 272  * RedoStartLSN points to the checkpoint's REDO location which is specified
 273  * in a backup label file, backup history file or control file. In standby
 274  * mode, XLOG streaming usually starts from the position where an invalid
 275  * record was found. But if we fail to read even the initial checkpoint
 276  * record, we use the REDO location instead of the checkpoint location as
 277  * the start position of XLOG streaming. Otherwise we would have to jump
 278  * backwards to the REDO location after reading the checkpoint record,
 279  * because the REDO record can precede the checkpoint record.
 280  */
 281 static XLogRecPtr RedoStartLSN = {0, 0};
 282
 283 /*----------
 284  * Shared-memory data structures for XLOG control
 285  *
 286  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 287  * the log up to (all records before that point must be written or fsynced).
 288  * LogwrtResult indicates the byte positions we have already written/fsynced.
 289  * These structs are identical but are declared separately to indicate their
 290  * slightly different functions.
 291  *
 292  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 293  * WALWriteLock.  To update it, you need to hold both locks.  The point of
 294  * this arrangement is that the value can be examined by code that already
 295  * holds WALWriteLock without needing to grab info_lck as well.  In addition
 296  * to the shared variable, each backend has a private copy of LogwrtResult,
 297  * which is updated when convenient.
 298  *
 299  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 300  * (protected by info_lck), but we don't need to cache any copies of it.
 301  *
 302  * info_lck is only held long enough to read/update the protected variables,
 303  * so it's a plain spinlock.  The other locks are held longer (potentially
 304  * over I/O operations), so we use LWLocks for them.  These locks are:
 305  *
 306  * WALInsertLock: must be held to insert a record into the WAL buffers.
 307  *
 308  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 309  * XLogFlush).
 310  *
 311  * ControlFileLock: must be held to read/update control file or create
 312  * new log file.
 313  *
 314  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 315  * only one checkpointer at a time; currently, with all checkpoints done by
 316  * the checkpointer, this is just pro forma).
 317  *
 318  *----------
 319  */
  320
      /*
       * Request side: byte positions up to which the log needs to be written
       * and/or fsynced.  Both members are "last byte + 1" values.
       */
  321 typedef struct XLogwrtRqst
  322 {
  323         XLogRecPtr      Write;                  /* last byte + 1 to write out */
  324         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
  325 } XLogwrtRqst;
  326
      /*
       * Result side: byte positions that have already been written out and
       * fsynced.  Deliberately the same shape as XLogwrtRqst, but declared as
       * its own type to keep the two roles apart.
       */
  327 typedef struct XLogwrtResult
  328 {
  329         XLogRecPtr      Write;                  /* last byte + 1 written out */
  330         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
  331 } XLogwrtResult;
 332
  333 /*
  334  * Shared state data for XLogInsert.
      *
      * One instance lives in XLogCtlData as the Insert member, where it is
      * protected by WALInsertLock.  currpage and currpos together locate the
      * insertion point inside the current buffer page; INSERT_FREESPACE()
      * derives the remaining room in that page from them.
  335  */
  336 typedef struct XLogCtlInsert
  337 {
  338         XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
  339         int                     curridx;                /* current block index in cache */
  340         XLogPageHeader currpage;        /* points to header of block in cache */
  341         char       *currpos;            /* current insertion point in cache */
  342         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
  343         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
  344
  345         /*
  346          * fullPageWrites is the master copy used by all backends to determine
  347          * whether to write full-page images to WAL, instead of using the
  348          * process-local one. This is required because, when full_page_writes is
  349          * changed by SIGHUP, we must WAL-log it before it actually affects
  350          * WAL-logging by backends. The checkpointer sets it at startup or after
      * SIGHUP.
  351          */
  352         bool            fullPageWrites;
  353
  354         /*
  355          * exclusiveBackup is true if a backup started with pg_start_backup() is
  356          * in progress, and nonExclusiveBackups is a counter indicating the number
  357          * of streaming base backups currently in progress. forcePageWrites is set
  358          * to true when either of these is non-zero. lastBackupStart is the latest
  359          * checkpoint redo location used as a starting point for an online backup.
  360          */
  361         bool            exclusiveBackup;
  362         int                     nonExclusiveBackups;
  363         XLogRecPtr      lastBackupStart;
  364 } XLogCtlInsert;
 365
  366 /*
  367  * Shared state data for XLogWrite/XLogFlush.
      *
      * One instance lives in XLogCtlData as the Write member, where it is
      * protected by WALWriteLock.
  368  */
  369 typedef struct XLogCtlWrite
  370 {
  371         int                     curridx;                /* cache index of next block to write */
  372         pg_time_t       lastSegSwitchTime;              /* time of last xlog segment switch */
  373 } XLogCtlWrite;
 374
  375 /*
  376  * Total shared-memory state for XLOG.
      *
      * Members are grouped by the lock that protects them; each group (or
      * individual member) is preceded by a comment naming that lock.
  377  */
  378 typedef struct XLogCtlData
  379 {
  380         /* Protected by WALInsertLock: */
  381         XLogCtlInsert Insert;
  382
  383         /* Protected by info_lck: */
  384         XLogwrtRqst LogwrtRqst;
  385         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
  386         TransactionId ckptXid;
  387         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
  388         uint32          lastRemovedLog; /* latest removed/recycled XLOG segment */
  389         uint32          lastRemovedSeg;
  390
  391         /* Protected by WALWriteLock: */
  392         XLogCtlWrite Write;
  393
  394         /*
  395          * Protected by info_lck and WALWriteLock (you must hold either lock to
  396          * read it, but both to update)
  397          */
  398         XLogwrtResult LogwrtResult;
  399
  400         /*
  401          * These values do not change after startup, although the pointed-to pages
  402          * and xlblocks values certainly do.  Permission to read/write the pages
  403          * and xlblocks values depends on WALInsertLock and WALWriteLock.
  404          */
  405         char       *pages;                      /* buffers for unwritten XLOG pages */
  406         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
  407         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
  408         TimeLineID      ThisTimeLineID;
  409         TimeLineID      RecoveryTargetTLI;
  410
  411         /*
  412          * archiveCleanupCommand is read from recovery.conf but needs to be in
  413          * shared memory so that the checkpointer process can access it.
  414          */
  415         char            archiveCleanupCommand[MAXPGPATH];
  416
  417         /*
  418          * SharedRecoveryInProgress indicates if we're still in crash or archive
  419          * recovery.  Protected by info_lck.
  420          */
  421         bool            SharedRecoveryInProgress;
  422
  423         /*
  424          * SharedHotStandbyActive indicates if hot standby is active (each process
  425          * caches it in LocalHotStandbyActive).  Protected by info_lck.
      *
      * NOTE(review): presumably this is what decides whether read-only
      * queries may run during recovery -- confirm against the readers.
  426          */
  427         bool            SharedHotStandbyActive;
  428
  429         /*
  430          * recoveryWakeupLatch is used to wake up the startup process to continue
  431          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
  432          * to appear.
  433          */
  434         Latch           recoveryWakeupLatch;
  435
  436         /*
  437          * WALWriterLatch is used to wake up the WALWriter to write some WAL.
  438          */
  439         Latch           WALWriterLatch;
  440
  441         /*
  442          * During recovery, we keep a copy of the latest checkpoint record here.
  443          * Used by the checkpointer when it wants to create a restartpoint.
  444          *
  445          * Protected by info_lck.
  446          */
  447         XLogRecPtr      lastCheckPointRecPtr;
  448         CheckPoint      lastCheckPoint;
  449
  450         /* end+1 of the last record replayed (or being replayed) */
  451         XLogRecPtr      replayEndRecPtr;
  452         /* end+1 of the last record replayed */
  453         XLogRecPtr      recoveryLastRecPtr;
  454         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
  455         TimestampTz recoveryLastXTime;
  456         /* timestamp of when we started replaying the current chunk of WAL data,
  457          * only relevant for replication or archive recovery */
  458         TimestampTz currentChunkStartTime;
  459         /* end of the last record restored from the archive */
  460         XLogRecPtr      restoreLastRecPtr;
  461         /* Are we requested to pause recovery? */
  462         bool            recoveryPause;
  463
  464         /*
  465          * lastFpwDisableRecPtr points to the start of the last replayed
  466          * XLOG_FPW_CHANGE record that reports full_page_writes being disabled.
  467          */
  468         XLogRecPtr      lastFpwDisableRecPtr;
  469
  470         slock_t         info_lck;               /* locks shared variables shown above */
  471 } XLogCtlData;
 472
 473 static XLogCtlData *XLogCtl = NULL;
 474
 475 /*
 476  * We maintain an image of pg_control in shared memory.
 477  */
 478 static ControlFileData *ControlFile = NULL;
 479
 480 /*
 481  * Macros for managing XLogInsert state.  In most cases, the calling routine
 482  * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx,
 483  * so these are passed as parameters instead of being fetched via XLogCtl.
 484  */
 485
  486 /*
      * Free space remaining in the current xlog page buffer: XLOG_BLCKSZ minus
      * the byte offset of currpos from the start of currpage.
      *
      * Insert is a pointer to an XLogCtlInsert; it is evaluated twice, so pass
      * an expression without side effects.
      */
  487 #define INSERT_FREESPACE(Insert)  \
  488         (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
 489
  490 /*
      * Construct XLogRecPtr value for current insertion point
      *
      * XLogCtl->xlblocks[curridx] holds the position of the first byte of the
      * page plus XLOG_BLCKSZ, so subtracting the free space left in the page
      * gives the position of the insertion point itself.
      *
      * Expands to a comma expression that assigns both fields of recptr.
      * recptr, Insert and curridx are each evaluated more than once.
      */
  491 #define INSERT_RECPTR(recptr,Insert,curridx)  \
  492         ( \
  493           (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
  494           (recptr).xrecoff = \
  495                 XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
  496         )
  497
      /*
       * Step to the previous / next xlog buffer index, wrapping around at the
       * ends.  Valid indexes run from 0 to XLogCtl->XLogCacheBlck, the highest
       * allocated buffer index.  idx is evaluated twice in each macro.
       */
  498 #define PrevBufIdx(idx)         \
  499                 (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))
  500
  501 #define NextBufIdx(idx)         \
  502                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 503
 504 /*
 505  * Private, possibly out-of-date copy of shared LogwrtResult.
 506  * See discussion above.
 507  */
 508 static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
 509
 510 /*
 511  * Codes indicating where we got a WAL file from during recovery, or where
 512  * to attempt to get one.  These are chosen so that they can be OR'd together
 513  * in a bitmask state variable.
 514  */
 515 #define XLOG_FROM_ARCHIVE               (1<<0)  /* Restored using restore_command */
 516 #define XLOG_FROM_PG_XLOG               (1<<1)  /* Existing file in pg_xlog */
 517 #define XLOG_FROM_STREAM                (1<<2)  /* Streamed from master */
 518
 519 /*
 520  * openLogFile is -1 or a kernel FD for an open log file segment.
 521  * When it's open, openLogOff is the current seek offset in the file.
 522  * openLogId/openLogSeg identify the segment.  These variables are only
 523  * used to write the XLOG, and so will normally refer to the active segment.
 524  */
 525 static int      openLogFile = -1;
 526 static uint32 openLogId = 0;
 527 static uint32 openLogSeg = 0;
 528 static uint32 openLogOff = 0;
 529
 530 /*
 531  * These variables are used similarly to the ones above, but for reading
 532  * the XLOG.  Note, however, that readOff generally represents the offset
 533  * of the page just read, not the seek position of the FD itself, which
 534  * will be just past that page. readLen indicates how much of the current
 535  * page has been read into readBuf, and readSource indicates where we got
 536  * the currently open file from.
 537  */
 538 static int      readFile = -1;
 539 static uint32 readId = 0;
 540 static uint32 readSeg = 0;
 541 static uint32 readOff = 0;
 542 static uint32 readLen = 0;
 543 static int      readSource = 0;         /* XLOG_FROM_* code */
 544
 545 /*
 546  * Keeps track of which sources we've tried to read the current WAL
 547  * record from and failed.
 548  */
 549 static int      failedSources = 0;      /* OR of XLOG_FROM_* codes */
 550
 551 /*
 552  * These variables track when we last obtained some WAL data to process,
 553  * and where we got it from.  (XLogReceiptSource is initially the same as
 554  * readSource, but readSource gets reset to zero when we don't have data
 555  * to process right now.)
 556  */
 557 static TimestampTz XLogReceiptTime = 0;
 558 static int      XLogReceiptSource = 0;          /* XLOG_FROM_* code */
 559
 560 /* Buffer for currently read page (XLOG_BLCKSZ bytes) */
 561 static char *readBuf = NULL;
 562
 563 /* Buffer for current ReadRecord result (expandable) */
 564 static char *readRecordBuf = NULL;
 565 static uint32 readRecordBufSize = 0;
 566
 567 /* State information for XLOG reading */
 568 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 569 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 570 static TimeLineID lastPageTLI = 0;
 571
 572 static XLogRecPtr minRecoveryPoint;             /* local copy of
 573                                                                                  * ControlFile->minRecoveryPoint */
 574 static bool updateMinRecoveryPoint = true;
 575
 576 /*
 577  * Have we reached a consistent database state? In crash recovery, we have
 578  * to replay all the WAL, so reachedConsistency is never set. During archive
 579  * recovery, the database is consistent once minRecoveryPoint is reached.
 580  */
 581 bool reachedConsistency = false;
 582
 583 static bool InRedo = false;
 584
 585 /* Have we launched bgwriter during recovery? */
 586 static bool bgwriterLaunched = false;
 587
  588 /*
  589  * Information logged when we detect a change in one of the parameters
  590  * important for Hot Standby.
      *
      * Carries the values of MaxConnections, max_prepared_xacts,
      * max_locks_per_xact and wal_level.
      *
      * NOTE(review): presumably written by XLogReportParameters() and compared
      * on the standby by CheckRequiredParameterValues() (both declared in this
      * file) -- confirm.  If this struct is the on-disk record payload, its
      * member order must not change.
  591  */
  592 typedef struct xl_parameter_change
  593 {
  594         int                     MaxConnections;
  595         int                     max_prepared_xacts;
  596         int                     max_locks_per_xact;
  597         int                     wal_level;
  598 } xl_parameter_change;
 599
  600 /*
      * Information logged for a restore point: a timestamp plus a name stored
      * in a fixed MAXFNAMELEN-byte array.
      *
      * NOTE(review): presumably rp_name is NUL-terminated, which would limit
      * names to MAXFNAMELEN - 1 characters -- confirm where it is filled in.
      */
  601 typedef struct xl_restore_point
  602 {
  603         TimestampTz rp_time;
  604         char            rp_name[MAXFNAMELEN];
  605 } xl_restore_point;
 606
 607
 608 static void XLogArchiveNotify(const char *xlog);
 609 static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
 610 static bool XLogArchiveCheckDone(const char *xlog);
 611 static bool XLogArchiveIsBusy(const char *xlog);
 612 static void XLogArchiveCleanup(const char *xlog);
 613 static void readRecoveryCommandFile(void);
 614 static void exitArchiveRecovery(TimeLineID endTLI,
 615                                         uint32 endLogId, uint32 endLogSeg);
 616 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 617 static void recoveryPausesHere(void);
 618 static void SetLatestXTime(TimestampTz xtime);
 619 static void SetCurrentChunkStartTime(TimestampTz xtime);
 620 static void CheckRequiredParameterValues(void);
 621 static void XLogReportParameters(void);
 622 static void LocalSetXLogInsertAllowed(void);
 623 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 624 static void KeepLogSeg(XLogRecPtr recptr, uint32 *logId, uint32 *logSeg);
 625
 626 static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
 627                                 XLogRecPtr *lsn, BkpBlock *bkpb);
 628 static bool AdvanceXLInsertBuffer(bool new_segment);
 629 static bool XLogCheckpointNeeded(uint32 logid, uint32 logseg);
 630 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
 631 static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
 632                                            bool find_free, int *max_advance,
 633                                            bool use_lock);
 634 static int XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
 635                          int source, bool notexistOk);
 636 static int XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode,
 637                                    int sources);
 638 static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
 639                          bool randAccess);
 640 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 641 static void XLogFileClose(void);
 642 static bool RestoreArchivedFile(char *path, const char *xlogfname,
 643                                         const char *recovername, off_t expectedSize);
 644 static void ExecuteRecoveryCommand(char *command, char *commandName,
 645                                            bool failOnerror);
 646 static void PreallocXlogFiles(XLogRecPtr endptr);
 647 static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
 648 static void UpdateLastRemovedPtr(char *filename);
 649 static void ValidateXLOGDirectoryStructure(void);
 650 static void CleanupBackupHistory(void);
 651 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 652 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
 653 static void CheckRecoveryConsistency(void);
 654 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 655 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
 656 static List *readTimeLineHistory(TimeLineID targetTLI);
 657 static bool existsTimeLineHistory(TimeLineID probeTLI);
 658 static bool rescanLatestTimeLine(void);
 659 static TimeLineID findNewestTimeLine(TimeLineID startTLI);
 660 static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 661                                          TimeLineID endTLI,
 662                                          uint32 endLogId, uint32 endLogSeg);
 663 static void WriteControlFile(void);
 664 static void ReadControlFile(void);
 665 static char *str_time(pg_time_t tnow);
 666 static bool CheckForStandbyTrigger(void);
 667
 668 #ifdef WAL_DEBUG
 669 static void xlog_outrec(StringInfo buf, XLogRecord *record);
 670 #endif
 671 static void pg_start_backup_callback(int code, Datum arg);
 672 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 673                                   bool *backupEndRequired, bool *backupFromStandby);
 674 static void rm_redo_error_callback(void *arg);
 675 static int      get_sync_bit(int method);
 676
 677
 678 /*
 679  * Insert an XLOG record having the specified RMID and info bytes,
 680  * with the body of the record being the data chunk(s) described by
 681  * the rdata chain (see xlog.h for notes about rdata).
 682  *
 683  * Returns XLOG pointer to end of record (beginning of next record).
 684  * This can be used as LSN for data pages affected by the logged action.
 685  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 686  * before the data page can be written out.  This implements the basic
 687  * WAL rule "write the log before the data".)
 688  *
 689  * NB: this routine feels free to scribble on the XLogRecData structs,
 690  * though not on the data they reference.  This is OK since the XLogRecData
 691  * structs are always just temporaries in the calling code.
 692  */
 693 XLogRecPtr
 694 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 695 {
             /*
              * Overview of the steps below:
              *
              * 1. Scan the rdata chain to total up the record length and to
              *    decide, via XLogCheckBuffer, which referenced buffers get a
              *    full backup block.
              * 2. Append chain entries for those backup blocks and compute the
              *    CRC over all the data; this is done before taking
              *    WALInsertLock.
              * 3. With WALInsertLock held, recheck RedoRecPtr and the
              *    full-page-write flags; if the decisions made in step 1 are
              *    stale, undo step 2 and start over at "begin".
              * 4. Fill in the record header, copy the chain into the WAL
              *    buffers (adding continuation headers at page boundaries), and
              *    deal with the XLOG_SWITCH special cases.
              */
 696         XLogCtlInsert *Insert = &XLogCtl->Insert;
 697         XLogRecord *record;
 698         XLogContRecord *contrecord;
 699         XLogRecPtr      RecPtr;
 700         XLogRecPtr      WriteRqst;
 701         uint32          freespace;
 702         int                     curridx;
 703         XLogRecData *rdt;
 704         XLogRecData *rdt_lastnormal;    /* last caller-supplied chain entry */
 705         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];      /* distinct buffers seen in chain */
 706         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];  /* is slot's buffer backed up? */
 707         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];  /* backup block header per slot */
 708         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];  /* page LSN saved by XLogCheckBuffer */
 709         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];     /* chain entry: BkpBlock header */
 710         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];     /* chain entry: page (up to hole) */
 711         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];     /* chain entry: page after hole */
 712         pg_crc32        rdata_crc;
 713         uint32          len,            /* rdata bytes, not counting backup blocks */
 714                                 write_len;      /* len plus all backup block data */
 715         unsigned        i;
 716         bool            updrqst;        /* must shared LogwrtRqst still be updated? */
 717         bool            doPageWrites;
 718         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 719         uint8           info_orig = info;       /* restored whenever we loop back */
 720
 721         /* cross-check on whether we should be here or not */
 722         if (!XLogInsertAllowed())
 723                 elog(ERROR, "cannot make new WAL entries during recovery");
 724
 725         /* info's high bits are reserved for use by me */
 726         if (info & XLR_INFO_MASK)
 727                 elog(PANIC, "invalid xlog info mask %02X", info);
 728
 729         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
 730
 731         /*
 732          * In bootstrap mode, we don't actually log anything but XLOG resources;
 733          * return a phony record pointer.
 734          */
 735         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 736         {
 737                 RecPtr.xlogid = 0;
 738                 RecPtr.xrecoff = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 739                 return RecPtr;
 740         }
 741
 742         /*
 743          * Here we scan the rdata chain, to determine which buffers must be backed
 744          * up.
 745          *
 746          * We may have to loop back to here if a race condition is detected below.
 747          * We could prevent the race by doing all this work while holding the
 748          * insert lock, but it seems better to avoid doing CRC calculations while
 749          * holding the lock.
 750          *
 751          * We add entries for backup blocks to the chain, so that they don't
 752          * need any special treatment in the critical section where the chunks are
 753          * copied into the WAL buffers. Those entries have to be unlinked from the
 754          * chain if we have to loop back here.
 755          */
 756 begin:;
 757         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 758         {
 759                 dtbuf[i] = InvalidBuffer;
 760                 dtbuf_bkp[i] = false;
 761         }
 762
 763         /*
 764          * Decide if we need to do full-page writes in this XLOG record: true if
 765          * full_page_writes is on or we have a PITR request for it.  Since we
 766          * don't yet have the insert lock, fullPageWrites and forcePageWrites
 767          * could change under us, but we'll recheck them once we have the lock.
 768          */
 769         doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
 770
 771         len = 0;
 772         for (rdt = rdata;;)
 773         {
 774                 if (rdt->buffer == InvalidBuffer)
 775                 {
 776                         /* Simple data, just include it */
 777                         len += rdt->len;
 778                 }
 779                 else
 780                 {
 781                         /* Find info for buffer */
 782                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 783                         {
 784                                 if (rdt->buffer == dtbuf[i])
 785                                 {
 786                                         /* Buffer already referenced by earlier chain item */
 787                                         if (dtbuf_bkp[i])
 788                                         {
                                                     /* whole page is logged, so drop this item's data */
 789                                                 rdt->data = NULL;
 790                                                 rdt->len = 0;
 791                                         }
 792                                         else if (rdt->data)
 793                                                 len += rdt->len;
 794                                         break;
 795                                 }
 796                                 if (dtbuf[i] == InvalidBuffer)
 797                                 {
 798                                         /* OK, put it in this slot */
 799                                         dtbuf[i] = rdt->buffer;
 800                                         if (XLogCheckBuffer(rdt, doPageWrites,
 801                                                                                 &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 802                                         {
                                                     /* whole page is logged, so drop this item's data */
 803                                                 dtbuf_bkp[i] = true;
 804                                                 rdt->data = NULL;
 805                                                 rdt->len = 0;
 806                                         }
 807                                         else if (rdt->data)
 808                                                 len += rdt->len;
 809                                         break;
 810                                 }
 811                         }
                             /* inner loop ran off the end: no matching or free slot */
 812                         if (i >= XLR_MAX_BKP_BLOCKS)
 813                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 814                                          XLR_MAX_BKP_BLOCKS);
 815                 }
 816                 /* Break out of loop when rdt points to last chain item */
 817                 if (rdt->next == NULL)
 818                         break;
 819                 rdt = rdt->next;
 820         }
 821
 822         /*
 823          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 824          * error checking in ReadRecord.  This means that all callers of
 825          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 826          * make an exception for XLOG SWITCH records because we don't want them to
 827          * ever cross a segment boundary.
 828          */
 829         if (len == 0 && !isLogSwitch)
 830                 elog(PANIC, "invalid xlog record length %u", len);
 831
 832         /*
 833          * Make additional rdata chain entries for the backup blocks, so that we
 834          * don't need to special-case them in the write loop.  This modifies the
 835          * original rdata chain, but we keep a pointer to the last regular entry,
 836          * rdt_lastnormal, so that we can undo this if we have to loop back to the
 837          * beginning.
 838          *
 839          * At the exit of this loop, write_len includes the backup block data.
 840          *
 841          * Also set the appropriate info bits to show which buffers were backed
 842          * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
 843          * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
 844          */
 845         rdt_lastnormal = rdt;
 846         write_len = len;
 847         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 848         {
 849                 BkpBlock   *bkpb;
 850                 char       *page;
 851
 852                 if (!dtbuf_bkp[i])
 853                         continue;
 854
 855                 info |= XLR_SET_BKP_BLOCK(i);
 856
 857                 bkpb = &(dtbuf_xlg[i]);
 858                 page = (char *) BufferGetBlock(dtbuf[i]);
 859
                     /* first entry: the BkpBlock header describing this page image */
 860                 rdt->next = &(dtbuf_rdt1[i]);
 861                 rdt = rdt->next;
 862
 863                 rdt->data = (char *) bkpb;
 864                 rdt->len = sizeof(BkpBlock);
 865                 write_len += sizeof(BkpBlock);
 866
                     /* second entry: the whole page, or just the part before the hole */
 867                 rdt->next = &(dtbuf_rdt2[i]);
 868                 rdt = rdt->next;
 869
 870                 if (bkpb->hole_length == 0)
 871                 {
 872                         rdt->data = page;
 873                         rdt->len = BLCKSZ;
 874                         write_len += BLCKSZ;
 875                         rdt->next = NULL;
 876                 }
 877                 else
 878                 {
 879                         /* must skip the hole */
 880                         rdt->data = page;
 881                         rdt->len = bkpb->hole_offset;
 882                         write_len += bkpb->hole_offset;
 883
                             /* third entry: the part of the page following the hole */
 884                         rdt->next = &(dtbuf_rdt3[i]);
 885                         rdt = rdt->next;
 886
 887                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
 888                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
 889                         write_len += rdt->len;
 890                         rdt->next = NULL;
 891                 }
 892         }
 893
 894         /*
 895          * Calculate CRC of the data, including all the backup blocks
 896          *
 897          * Note that the record header isn't added into the CRC initially since
 898          * we don't know the prev-link yet.  Thus, the CRC will represent the CRC
 899          * of the whole record in the order: rdata, then backup blocks, then
 900          * record header.
 901          */
 902         INIT_CRC32(rdata_crc);
 903         for (rdt = rdata; rdt != NULL; rdt = rdt->next)
 904                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 905
 906         START_CRIT_SECTION();
 907
 908         /* Now wait to get insert lock */
 909         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 910
 911         /*
 912          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
 913          * back and recompute everything.  This can only happen just after a
 914          * checkpoint, so it's better to be slow in this case and fast otherwise.
 915          *
 916          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
 917          * affect the contents of the XLOG record, so we'll update our local copy
 918          * but not force a recomputation.
 919          */
 920         if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
 921         {
 922                 Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
 923                 RedoRecPtr = Insert->RedoRecPtr;
 924
 925                 if (doPageWrites)
 926                 {
 927                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 928                         {
 929                                 if (dtbuf[i] == InvalidBuffer)
 930                                         continue;
 931                                 if (dtbuf_bkp[i] == false &&
 932                                         XLByteLE(dtbuf_lsn[i], RedoRecPtr))
 933                                 {
 934                                         /*
 935                                          * Oops, this buffer now needs to be backed up, but we
 936                                          * didn't think so above.  Start over.
 937                                          */
 938                                         LWLockRelease(WALInsertLock);
 939                                         END_CRIT_SECTION();
 940                                         rdt_lastnormal->next = NULL;    /* unlink backup-block entries */
 941                                         info = info_orig;       /* forget XLR_SET_BKP_BLOCK bits */
 942                                         goto begin;
 943                                 }
 944                         }
 945                 }
 946         }
 947
 948         /*
 949          * Also check to see if fullPageWrites or forcePageWrites was just turned on;
 950          * if we weren't already doing full-page writes then go back and recompute.
 951          * (If it was just turned off, we could recompute the record without full pages,
 952          * but we choose not to bother.)
 953          */
 954         if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
 955         {
 956                 /* Oops, must redo it with full-page data. */
 957                 LWLockRelease(WALInsertLock);
 958                 END_CRIT_SECTION();
 959                 rdt_lastnormal->next = NULL;    /* unlink backup-block entries */
 960                 info = info_orig;       /* forget XLR_SET_BKP_BLOCK bits */
 961                 goto begin;
 962         }
 963
 964         /*
 965          * If there isn't enough space on the current XLOG page for a record
 966          * header, advance to the next page (leaving the unused space as zeroes).
 967          */
 968         updrqst = false;
 969         freespace = INSERT_FREESPACE(Insert);
 970         if (freespace < SizeOfXLogRecord)
 971         {
 972                 updrqst = AdvanceXLInsertBuffer(false);
 973                 freespace = INSERT_FREESPACE(Insert);
 974         }
 975
 976         /* Compute record's XLOG location */
 977         curridx = Insert->curridx;
 978         INSERT_RECPTR(RecPtr, Insert, curridx);
 979
 980         /*
 981          * If the record is an XLOG_SWITCH, and we are exactly at the start of a
 982          * segment, we need not insert it (and don't want to because we'd like
 983          * consecutive switch requests to be no-ops).  Instead, make sure
 984          * everything is written and flushed through the end of the prior segment,
 985          * and return the prior segment's end address.
 986          */
 987         if (isLogSwitch &&
 988                 (RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
 989         {
 990                 /* We can release insert lock immediately */
 991                 LWLockRelease(WALInsertLock);
 992
                     /* back up over the long page header to the segment's start */
 993                 RecPtr.xrecoff -= SizeOfXLogLongPHD;
 994                 if (RecPtr.xrecoff == 0)
 995                 {
 996                         /* crossing a logid boundary */
 997                         RecPtr.xlogid -= 1;
 998                         RecPtr.xrecoff = XLogFileSize;
 999                 }
1000
1001                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1002                 LogwrtResult = XLogCtl->LogwrtResult;
1003                 if (!XLByteLE(RecPtr, LogwrtResult.Flush))
1004                 {
1005                         XLogwrtRqst FlushRqst;
1006
1007                         FlushRqst.Write = RecPtr;
1008                         FlushRqst.Flush = RecPtr;
1009                         XLogWrite(FlushRqst, false, false);
1010                 }
1011                 LWLockRelease(WALWriteLock);
1012
1013                 END_CRIT_SECTION();
1014
1015                 return RecPtr;
1016         }
1017
1018         /* Insert record header */
1019
1020         record = (XLogRecord *) Insert->currpos;
1021         record->xl_prev = Insert->PrevRecord;
1022         record->xl_xid = GetCurrentTransactionIdIfAny();
1023         record->xl_tot_len = SizeOfXLogRecord + write_len;
1024         record->xl_len = len;           /* doesn't include backup blocks */
1025         record->xl_info = info;
1026         record->xl_rmid = rmid;
1027
1028         /* Now we can finish computing the record's CRC */
             /* (the header is folded in starting just past a pg_crc32's worth of bytes) */
1029         COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
1030                            SizeOfXLogRecord - sizeof(pg_crc32));
1031         FIN_CRC32(rdata_crc);
1032         record->xl_crc = rdata_crc;
1033
1034 #ifdef WAL_DEBUG
1035         if (XLOG_DEBUG)
1036         {
1037                 StringInfoData buf;
1038
1039                 initStringInfo(&buf);
1040                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1041                                                  RecPtr.xlogid, RecPtr.xrecoff);
1042                 xlog_outrec(&buf, record);
1043                 if (rdata->data != NULL)
1044                 {
1045                         appendStringInfo(&buf, " - ");
1046                         RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
1047                 }
1048                 elog(LOG, "%s", buf.data);
1049                 pfree(buf.data);
1050         }
1051 #endif
1052
1053         /* Record begin of record in appropriate places */
1054         ProcLastRecPtr = RecPtr;
1055         Insert->PrevRecord = RecPtr;
1056
1057         Insert->currpos += SizeOfXLogRecord;
1058         freespace -= SizeOfXLogRecord;
1059
1060         /*
1061          * Append the data, including backup blocks if any
              *
              * Chain entries whose data pointer was cleared in the scan above
              * (because their buffer is being backed up in full) are skipped.  When
              * an entry doesn't fit in the space left on the current page, the part
              * that fits is copied, the next page is started with a continuation
              * header holding the number of bytes still to be written, and the loop
              * comes around again for the rest of the entry.
1062          */
1063         while (write_len)
1064         {
1065                 while (rdata->data == NULL)
1066                         rdata = rdata->next;
1067
1068                 if (freespace > 0)
1069                 {
1070                         if (rdata->len > freespace)
1071                         {
                                     /* fill the rest of this page; remainder goes on the next */
1072                                 memcpy(Insert->currpos, rdata->data, freespace);
1073                                 rdata->data += freespace;
1074                                 rdata->len -= freespace;
1075                                 write_len -= freespace;
1076                         }
1077                         else
1078                         {
                                     /* entire entry fits on this page */
1079                                 memcpy(Insert->currpos, rdata->data, rdata->len);
1080                                 freespace -= rdata->len;
1081                                 write_len -= rdata->len;
1082                                 Insert->currpos += rdata->len;
1083                                 rdata = rdata->next;
1084                                 continue;
1085                         }
1086                 }
1087
1088                 /* Use next buffer */
1089                 updrqst = AdvanceXLInsertBuffer(false);
1090                 curridx = Insert->curridx;
1091                 /* Insert cont-record header */
1092                 Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1093                 contrecord = (XLogContRecord *) Insert->currpos;
1094                 contrecord->xl_rem_len = write_len;
1095                 Insert->currpos += SizeOfXLogContRecord;
1096                 freespace = INSERT_FREESPACE(Insert);
1097         }
1098
1099         /* Ensure next record will be properly aligned */
1100         Insert->currpos = (char *) Insert->currpage +
1101                 MAXALIGN(Insert->currpos - (char *) Insert->currpage);
1102         freespace = INSERT_FREESPACE(Insert);
1103
1104         /*
1105          * The recptr I return is the beginning of the *next* record. This will be
1106          * stored as LSN for changed data pages...
1107          */
1108         INSERT_RECPTR(RecPtr, Insert, curridx);
1109
1110         /*
1111          * If the record is an XLOG_SWITCH, we must now write and flush all the
1112          * existing data, and then forcibly advance to the start of the next
1113          * segment.  It's not good to do this I/O while holding the insert lock,
1114          * but there seems too much risk of confusion if we try to release the
1115          * lock sooner.  Fortunately xlog switch needn't be a high-performance
1116          * operation anyway...
1117          */
1118         if (isLogSwitch)
1119         {
1120                 XLogwrtRqst FlushRqst;
1121                 XLogRecPtr      OldSegEnd;
1122
1123                 TRACE_POSTGRESQL_XLOG_SWITCH();
1124
1125                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1126
1127                 /*
1128                  * Flush through the end of the page containing XLOG_SWITCH, and
1129                  * perform end-of-segment actions (eg, notifying archiver).
1130                  */
1131                 WriteRqst = XLogCtl->xlblocks[curridx];
1132                 FlushRqst.Write = WriteRqst;
1133                 FlushRqst.Flush = WriteRqst;
1134                 XLogWrite(FlushRqst, false, true);
1135
1136                 /* Set up the next buffer as first page of next segment */
1137                 /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
1138                 (void) AdvanceXLInsertBuffer(true);
1139
1140                 /* There should be no unwritten data */
1141                 curridx = Insert->curridx;
1142                 Assert(curridx == XLogCtl->Write.curridx);
1143
1144                 /* Compute end address of old segment */
1145                 OldSegEnd = XLogCtl->xlblocks[curridx];
1146                 OldSegEnd.xrecoff -= XLOG_BLCKSZ;
1147                 if (OldSegEnd.xrecoff == 0)
1148                 {
1149                         /* crossing a logid boundary */
1150                         OldSegEnd.xlogid -= 1;
1151                         OldSegEnd.xrecoff = XLogFileSize;
1152                 }
1153
1154                 /* Make it look like we've written and synced all of old segment */
1155                 LogwrtResult.Write = OldSegEnd;
1156                 LogwrtResult.Flush = OldSegEnd;
1157
1158                 /*
1159                  * Update shared-memory status --- this code should match XLogWrite
1160                  */
1161                 {
1162                         /* use volatile pointer to prevent code rearrangement */
1163                         volatile XLogCtlData *xlogctl = XLogCtl;
1164
1165                         SpinLockAcquire(&xlogctl->info_lck);
1166                         xlogctl->LogwrtResult = LogwrtResult;
1167                         if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1168                                 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1169                         if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1170                                 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1171                         SpinLockRelease(&xlogctl->info_lck);
1172                 }
1173
1174                 LWLockRelease(WALWriteLock);
1175
1176                 updrqst = false;                /* done already */
1177         }
1178         else
1179         {
1180                 /* normal case, ie not xlog switch */
1181
1182                 /* Need to update shared LogwrtRqst if some block was filled up */
1183                 if (freespace < SizeOfXLogRecord)
1184                 {
1185                         /* curridx is filled and available for writing out */
1186                         updrqst = true;
1187                 }
1188                 else
1189                 {
1190                         /* if updrqst already set, write through end of previous buf */
1191                         curridx = PrevBufIdx(curridx);
1192                 }
1193                 WriteRqst = XLogCtl->xlblocks[curridx];
1194         }
1195
1196         LWLockRelease(WALInsertLock);
1197
             /* the shared request pointer is guarded by info_lck, not the insert lock */
1198         if (updrqst)
1199         {
1200                 /* use volatile pointer to prevent code rearrangement */
1201                 volatile XLogCtlData *xlogctl = XLogCtl;
1202
1203                 SpinLockAcquire(&xlogctl->info_lck);
1204                 /* advance global request to include new block(s) */
1205                 if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
1206                         xlogctl->LogwrtRqst.Write = WriteRqst;
1207                 /* update local result copy while I have the chance */
1208                 LogwrtResult = xlogctl->LogwrtResult;
1209                 SpinLockRelease(&xlogctl->info_lck);
1210         }
1211
1212         XactLastRecEnd = RecPtr;        /* end of the record just inserted */
1213
1214         END_CRIT_SECTION();
1215
1216         return RecPtr;
1217 }
1218
1219 /*
1220  * Determine whether the buffer referenced by an XLogRecData item has to
1221  * be backed up, and if so fill a BkpBlock struct for it.  In any case
1222  * save the buffer's LSN at *lsn.
1223  */
1224 static bool
1225 XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
1226                                 XLogRecPtr *lsn, BkpBlock *bkpb)
1227 {
1228         Page            page;
1229
1230         page = BufferGetPage(rdata->buffer);
1231
1232         /*
1233          * XXX We assume page LSN is first data on *every* page that can be passed
1234          * to XLogInsert, whether it otherwise has the standard page layout or
1235          * not.
1236          */
1237         *lsn = PageGetLSN(page);
1238
1239         if (doPageWrites &&
1240                 XLByteLE(PageGetLSN(page), RedoRecPtr))
1241         {
1242                 /*
1243                  * The page needs to be backed up, so set up *bkpb
1244                  */
1245                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1246
1247                 if (rdata->buffer_std)
1248                 {
1249                         /* Assume we can omit data between pd_lower and pd_upper */
1250                         uint16          lower = ((PageHeader) page)->pd_lower;
1251                         uint16          upper = ((PageHeader) page)->pd_upper;
1252
1253                         if (lower >= SizeOfPageHeaderData &&
1254                                 upper > lower &&
1255                                 upper <= BLCKSZ)
1256                         {
1257                                 bkpb->hole_offset = lower;
1258                                 bkpb->hole_length = upper - lower;
1259                         }
1260                         else
1261                         {
1262                                 /* No "hole" to compress out */
1263                                 bkpb->hole_offset = 0;
1264                                 bkpb->hole_length = 0;
1265                         }
1266                 }
1267                 else
1268                 {
1269                         /* Not a standard page header, don't try to eliminate "hole" */
1270                         bkpb->hole_offset = 0;
1271                         bkpb->hole_length = 0;
1272                 }
1273
1274                 return true;                    /* buffer requires backup */
1275         }
1276
1277         return false;                           /* buffer does not need to be backed up */
1278 }
1279
1280 /*
1281  * XLogArchiveNotify
1282  *
1283  * Create an archive notification file
1284  *
1285  * The name of the notification file is the message that will be picked up
1286  * by the archiver, e.g. we write 0000000100000001000000C6.ready
1287  * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
1288  * then when complete, rename it to 0000000100000001000000C6.done
1289  */
1290 static void
1291 XLogArchiveNotify(const char *xlog)
1292 {
1293         char            archiveStatusPath[MAXPGPATH];
1294         FILE       *fd;
1295
1296         /* insert an otherwise empty file called <XLOG>.ready */
1297         StatusFilePath(archiveStatusPath, xlog, ".ready");
1298         fd = AllocateFile(archiveStatusPath, "w");
1299         if (fd == NULL)
1300         {
1301                 ereport(LOG,
1302                                 (errcode_for_file_access(),
1303                                  errmsg("could not create archive status file \"%s\": %m",
1304                                                 archiveStatusPath)));
1305                 return;
1306         }
1307         if (FreeFile(fd))
1308         {
1309                 ereport(LOG,
1310                                 (errcode_for_file_access(),
1311                                  errmsg("could not write archive status file \"%s\": %m",
1312                                                 archiveStatusPath)));
1313                 return;
1314         }
1315
1316         /* Notify archiver that it's got something to do */
1317         if (IsUnderPostmaster)
1318                 SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
1319 }
1320
1321 /*
1322  * Convenience routine to notify using log/seg representation of filename
1323  */
1324 static void
1325 XLogArchiveNotifySeg(uint32 log, uint32 seg)
1326 {
1327         char            xlog[MAXFNAMELEN];
1328
1329         XLogFileName(xlog, ThisTimeLineID, log, seg);
1330         XLogArchiveNotify(xlog);
1331 }
1332
1333 /*
1334  * XLogArchiveCheckDone
1335  *
1336  * This is called when we are ready to delete or recycle an old XLOG segment
1337  * file or backup history file.  If it is okay to delete it then return true.
1338  * If it is not time to delete it, make sure a .ready file exists, and return
1339  * false.
1340  *
1341  * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
1342  * then return false; else create <XLOG>.ready and return false.
1343  *
1344  * The reason we do things this way is so that if the original attempt to
1345  * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
1346  */
1347 static bool
1348 XLogArchiveCheckDone(const char *xlog)
1349 {
1350         char            archiveStatusPath[MAXPGPATH];
1351         struct stat stat_buf;
1352
1353         /* Always deletable if archiving is off */
1354         if (!XLogArchivingActive())
1355                 return true;
1356
1357         /* First check for .done --- this means archiver is done with it */
1358         StatusFilePath(archiveStatusPath, xlog, ".done");
1359         if (stat(archiveStatusPath, &stat_buf) == 0)
1360                 return true;
1361
1362         /* check for .ready --- this means archiver is still busy with it */
1363         StatusFilePath(archiveStatusPath, xlog, ".ready");
1364         if (stat(archiveStatusPath, &stat_buf) == 0)
1365                 return false;
1366
1367         /* Race condition --- maybe archiver just finished, so recheck */
1368         StatusFilePath(archiveStatusPath, xlog, ".done");
1369         if (stat(archiveStatusPath, &stat_buf) == 0)
1370                 return true;
1371
1372         /* Retry creation of the .ready file */
1373         XLogArchiveNotify(xlog);
1374         return false;
1375 }
1376
1377 /*
1378  * XLogArchiveIsBusy
1379  *
1380  * Check to see if an XLOG segment file is still unarchived.
1381  * This is almost but not quite the inverse of XLogArchiveCheckDone: in
1382  * the first place we aren't chartered to recreate the .ready file, and
1383  * in the second place we should consider that if the file is already gone
1384  * then it's not busy.  (This check is needed to handle the race condition
1385  * that a checkpoint already deleted the no-longer-needed file.)
1386  */
1387 static bool
1388 XLogArchiveIsBusy(const char *xlog)
1389 {
1390         char            archiveStatusPath[MAXPGPATH];
1391         struct stat stat_buf;
1392
1393         /* First check for .done --- this means archiver is done with it */
1394         StatusFilePath(archiveStatusPath, xlog, ".done");
1395         if (stat(archiveStatusPath, &stat_buf) == 0)
1396                 return false;
1397
1398         /* check for .ready --- this means archiver is still busy with it */
1399         StatusFilePath(archiveStatusPath, xlog, ".ready");
1400         if (stat(archiveStatusPath, &stat_buf) == 0)
1401                 return true;
1402
1403         /* Race condition --- maybe archiver just finished, so recheck */
1404         StatusFilePath(archiveStatusPath, xlog, ".done");
1405         if (stat(archiveStatusPath, &stat_buf) == 0)
1406                 return false;
1407
1408         /*
1409          * Check to see if the WAL file has been removed by checkpoint, which
1410          * implies it has already been archived, and explains why we can't see a
1411          * status file for it.
1412          */
1413         snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
1414         if (stat(archiveStatusPath, &stat_buf) != 0 &&
1415                 errno == ENOENT)
1416                 return false;
1417
1418         return true;
1419 }
1420
1421 /*
1422  * XLogArchiveCleanup
1423  *
1424  * Cleanup archive notification file(s) for a particular xlog segment
1425  */
1426 static void
1427 XLogArchiveCleanup(const char *xlog)
1428 {
1429         char            archiveStatusPath[MAXPGPATH];
1430
1431         /* Remove the .done file */
1432         StatusFilePath(archiveStatusPath, xlog, ".done");
1433         unlink(archiveStatusPath);
1434         /* should we complain about failure? */
1435
1436         /* Remove the .ready file if present --- normally it shouldn't be */
1437         StatusFilePath(archiveStatusPath, xlog, ".ready");
1438         unlink(archiveStatusPath);
1439         /* should we complain about failure? */
1440 }
1441
1442 /*
1443  * Advance the Insert state to the next buffer page, writing out the next
1444  * buffer if it still contains unwritten data.
1445  *
1446  * If new_segment is TRUE then we set up the next buffer page as the first
1447  * page of the next xlog segment file, possibly but not usually the next
1448  * consecutive file page.
1449  *
1450  * The global LogwrtRqst.Write pointer needs to be advanced to include the
1451  * just-filled page.  If we can do this for free (without an extra lock),
1452  * we do so here.  Otherwise the caller must do it.  We return TRUE if the
1453  * request update still needs to be done, FALSE if we did it internally.
1454  *
 * On return, Insert->curridx, Insert->currpage and Insert->currpos describe
 * the new page, which has been zeroed and given a page header (a long
 * header if it is the first page of a segment), and XLogCtl->xlblocks[] for
 * that slot holds the new page's end pointer.
 *
1455  * Must be called with WALInsertLock held.
1456  */
1457 static bool
1458 AdvanceXLInsertBuffer(bool new_segment)
1459 {
1460         XLogCtlInsert *Insert = &XLogCtl->Insert;
1461         int                     nextidx = NextBufIdx(Insert->curridx);    /* slot that becomes the new current page */
1462         bool            update_needed = true;   /* result; cleared once LogwrtRqst.Write is updated here */
1463         XLogRecPtr      OldPageRqstPtr; /* end pointer of the page now occupying slot nextidx */
1464         XLogwrtRqst WriteRqst;
1465         XLogRecPtr      NewPageEndPtr;  /* end pointer of the page being set up */
1466         XLogPageHeader NewPage;
1467
1468         /*
1469          * Get ending-offset of the buffer page we need to replace (this may be
1470          * zero if the buffer hasn't been used yet).  Fall through if it's already
1471          * written out.
1472          */
1473         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
1474         if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1475         {
1476                 /* nope, got work to do... */
1477                 XLogRecPtr      FinishedPageRqstPtr;
1478
                     /* end pointer of the page we are leaving (the just-filled page) */
1479                 FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1480
1481                 /* Before waiting, get info_lck and update LogwrtResult */
1482                 {
1483                         /* use volatile pointer to prevent code rearrangement */
1484                         volatile XLogCtlData *xlogctl = XLogCtl;
1485
1486                         SpinLockAcquire(&xlogctl->info_lck);
                             /* only ever move the shared write request forward */
1487                         if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
1488                                 xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
1489                         LogwrtResult = xlogctl->LogwrtResult;
1490                         SpinLockRelease(&xlogctl->info_lck);
1491                 }
1492
1493                 update_needed = false;  /* Did the shared-request update */
1494
1495                 /*
1496                  * Now that we have an up-to-date LogwrtResult value, see if we still
1497                  * need to write it or if someone else already did.
1498                  */
1499                 if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1500                 {
1501                         /* Must acquire write lock */
1502                         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
                             /* re-read after getting the lock; the old page may have been written meanwhile */
1503                         LogwrtResult = XLogCtl->LogwrtResult;
1504                         if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1505                         {
1506                                 /* OK, someone wrote it already */
1507                                 LWLockRelease(WALWriteLock);
1508                         }
1509                         else
1510                         {
1511                                 /*
1512                                  * Have to write buffers while holding insert lock. This is
1513                                  * not good, so only write as much as we absolutely must.
                                      *
                                      * The Flush target is zeroed so that the "if asked to flush"
                                      * test in XLogWrite cannot be satisfied by this request; we
                                      * only need the old page written, not fsync'd.
1514                                  */
1515                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
1516                                 WriteRqst.Write = OldPageRqstPtr;
1517                                 WriteRqst.Flush.xlogid = 0;
1518                                 WriteRqst.Flush.xrecoff = 0;
1519                                 XLogWrite(WriteRqst, false, false);
1520                                 LWLockRelease(WALWriteLock);
1521                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1522                         }
1523                 }
1524         }
1525
1526         /*
1527          * Now the next buffer slot is free and we can set it up to be the next
1528          * output page.
              *
              * NewPageEndPtr starts out as the END of the current page, which is
              * the START of the new page; XLOG_BLCKSZ is added further down to turn
              * it into the new page's end pointer.
1529          */
1530         NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
1531
1532         if (new_segment)
1533         {
1534                 /* force it to a segment start point */
                     /* (rounds xrecoff up to the next multiple of XLogSegSize) */
1535                 NewPageEndPtr.xrecoff += XLogSegSize - 1;
1536                 NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
1537         }
1538
1539         if (NewPageEndPtr.xrecoff >= XLogFileSize)
1540         {
1541                 /* crossing a logid boundary */
                     /* new page is the first block of the next logid, so it ends at XLOG_BLCKSZ */
1542                 NewPageEndPtr.xlogid += 1;
1543                 NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
1544         }
1545         else
1546                 NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
1547         XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1548         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
1549
1550         Insert->curridx = nextidx;
1551         Insert->currpage = NewPage;
1552
             /* assume a short page header for now; adjusted below for a long header */
1553         Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
1554
1555         /*
1556          * Be sure to re-zero the buffer so that bytes beyond what we've written
1557          * will look like zeroes and not valid XLOG records...
1558          */
1559         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1560
1561         /*
1562          * Fill the new page's header
1563          */
1564         NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
1565
1566         /* NewPage->xlp_info = 0; */    /* done by memset */
1567         NewPage   ->xlp_tli = ThisTimeLineID;
             /* xlp_pageaddr is the page's START address: end pointer minus one block */
1568         NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
1569         NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;
1570
1571         /*
1572          * If online backup is not in progress, mark the header to indicate that
1573          * WAL records beginning in this page have removable backup blocks.  This
1574          * allows the WAL archiver to know whether it is safe to compress archived
1575          * WAL data by transforming full-block records into the non-full-block
1576          * format.  It is sufficient to record this at the page level because we
1577          * force a page switch (in fact a segment switch) when starting a backup,
1578          * so the flag will be off before any records can be written during the
1579          * backup.  At the end of a backup, the last page will be marked as all
1580          * unsafe when perhaps only part is unsafe, but at worst the archiver
1581          * would miss the opportunity to compress a few records.
1582          */
1583         if (!Insert->forcePageWrites)
1584                 NewPage->xlp_info |= XLP_BKP_REMOVABLE;
1585
1586         /*
1587          * If first page of an XLOG segment file, make it a long header.
1588          */
1589         if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
1590         {
1591                 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1592
1593                 NewLongPage->xlp_sysid = ControlFile->system_identifier;
1594                 NewLongPage->xlp_seg_size = XLogSegSize;
1595                 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
1596                 NewPage   ->xlp_info |= XLP_LONG_HEADER;
1597
                     /* insertion starts after the long header instead of the short one */
1598                 Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1599         }
1600
1601         return update_needed;
1602 }
1603
1604 /*
1605  * Check whether we've consumed enough xlog space that a checkpoint is needed.
1606  *
1607  * logid/logseg indicate a log file that has just been filled up (or read
1608  * during recovery). We measure the distance from RedoRecPtr to logid/logseg
1609  * and see if that exceeds CheckPointSegments.
1610  *
1611  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
1612  */
1613 static bool
1614 XLogCheckpointNeeded(uint32 logid, uint32 logseg)
1615 {
1616         /*
1617          * A straight computation of segment number could overflow 32 bits. Rather
1618          * than assuming we have working 64-bit arithmetic, we compare the
1619          * highest-order bits separately, and force a checkpoint immediately when
1620          * they change.
1621          */
1622         uint32          old_segno,
1623                                 new_segno;
1624         uint32          old_highbits,
1625                                 new_highbits;
1626
1627         old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
1628                 (RedoRecPtr.xrecoff / XLogSegSize);
1629         old_highbits = RedoRecPtr.xlogid / XLogSegSize;
1630         new_segno = (logid % XLogSegSize) * XLogSegsPerFile + logseg;
1631         new_highbits = logid / XLogSegSize;
1632         if (new_highbits != old_highbits ||
1633                 new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
1634                 return true;
1635         return false;
1636 }
1637
1638 /*
1639  * Write and/or fsync the log at least as far as WriteRqst indicates.
1640  *
1641  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
1642  * may stop at any convenient boundary (such as a cache or logfile boundary).
1643  * This option allows us to avoid uselessly issuing multiple writes when a
1644  * single one would do.
1645  *
1646  * If xlog_switch == TRUE, we are intending an xlog segment switch, so
1647  * perform end-of-segment actions after writing the last page, even if
1648  * it's not physically the end of its segment.  (NB: this will work properly
1649  * only if caller specifies WriteRqst == page-end and flexible == false,
1650  * and there is some data to write.)
1651  *
 * Any seek or write failure is reported at PANIC level.  On return the
 * process-local LogwrtResult has been advanced and copied into
 * XLogCtl->LogwrtResult, and the shared LogwrtRqst values have been pushed
 * forward so they are not behind the results.
 *
1652  * Must be called with WALWriteLock held.
1653  */
1654 static void
1655 XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1656 {
1657         XLogCtlWrite *Write = &XLogCtl->Write;
1658         bool            ispartialpage;  /* request ends before the end of the current page */
1659         bool            last_iteration; /* request is satisfied once the current page is written */
1660         bool            finishing_seg;  /* pending pages run up to the end of the segment */
1661         bool            use_existent;   /* passed by reference to XLogFileInit */
1662         int                     curridx;
1663         int                     npages;
1664         int                     startidx;
1665         uint32          startoffset;
1666
1667         /* We should always be inside a critical section here */
1668         Assert(CritSectionCount > 0);
1669
1670         /*
1671          * Update local LogwrtResult (caller probably did this already, but...)
1672          */
1673         LogwrtResult = XLogCtl->LogwrtResult;
1674
1675         /*
1676          * Since successive pages in the xlog cache are consecutively allocated,
1677          * we can usually gather multiple pages together and issue just one
1678          * write() call.  npages is the number of pages we have determined can be
1679          * written together; startidx is the cache block index of the first one,
1680          * and startoffset is the file offset at which it should go. The latter
1681          * two variables are only valid when npages > 0, but we must initialize
1682          * all of them to keep the compiler quiet.
1683          */
1684         npages = 0;
1685         startidx = 0;
1686         startoffset = 0;
1687
1688         /*
1689          * Within the loop, curridx is the cache block index of the page to
1690          * consider writing.  We advance Write->curridx only after successfully
1691          * writing pages.  (Right now, this refinement is useless since we are
1692          * going to PANIC if any error occurs anyway; but someday it may come in
1693          * useful.)
1694          */
1695         curridx = Write->curridx;
1696
1697         while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1698         {
1699                 /*
1700                  * Make sure we're not ahead of the insert process.  This could happen
1701                  * if we're passed a bogus WriteRqst.Write that is past the end of the
1702                  * last page that's been initialized by AdvanceXLInsertBuffer.
                      *
                      * NOTE(review): the first %X/%X pair printed below is
                      * LogwrtResult.Write, not WriteRqst.Write -- confirm that this is
                      * the value intended to be reported as the "request".
1703                  */
1704                 if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
1705                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1706                                  LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1707                                  XLogCtl->xlblocks[curridx].xlogid,
1708                                  XLogCtl->xlblocks[curridx].xrecoff);
1709
1710                 /* Advance LogwrtResult.Write to end of current buffer page */
1711                 LogwrtResult.Write = XLogCtl->xlblocks[curridx];
1712                 ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
1713
                     /*
                      * LogwrtResult.Write is now a page END pointer.  The "Prev"
                      * macros are presumably what maps such an end pointer to the
                      * segment containing the page -- confirm against their
                      * definitions.
                      */
1714                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1715                 {
1716                         /*
1717                          * Switch to new logfile segment.  We cannot have any pending
1718                          * pages here (since we dump what we have at segment end).
1719                          */
1720                         Assert(npages == 0);
1721                         if (openLogFile >= 0)
1722                                 XLogFileClose();
1723                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1724
1725                         /* create/use new log file */
1726                         use_existent = true;
1727                         openLogFile = XLogFileInit(openLogId, openLogSeg,
1728                                                                            &use_existent, true);
1729                         openLogOff = 0;
1730                 }
1731
1732                 /* Make sure we have the current logfile open */
1733                 if (openLogFile < 0)
1734                 {
1735                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1736                         openLogFile = XLogFileOpen(openLogId, openLogSeg);
1737                         openLogOff = 0;
1738                 }
1739
1740                 /* Add current page to the set of pending pages-to-dump */
1741                 if (npages == 0)
1742                 {
1743                         /* first of group */
1744                         startidx = curridx;
                             /* offset of the page START within the segment: page end minus one block */
1745                         startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
1746                 }
1747                 npages++;
1748
1749                 /*
1750                  * Dump the set if this will be the last loop iteration, or if we are
1751                  * at the last page of the cache area (since the next page won't be
1752                  * contiguous in memory), or if we are at the end of the logfile
1753                  * segment.
1754                  */
1755                 last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);
1756
1757                 finishing_seg = !ispartialpage &&
1758                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1759
1760                 if (last_iteration ||
1761                         curridx == XLogCtl->XLogCacheBlck ||
1762                         finishing_seg)
1763                 {
1764                         char       *from;
1765                         Size            nbytes;
1766
1767                         /* Need to seek in the file? */
1768                         if (openLogOff != startoffset)
1769                         {
1770                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
1771                                         ereport(PANIC,
1772                                                         (errcode_for_file_access(),
1773                                                          errmsg("could not seek in log file %u, "
1774                                                                         "segment %u to offset %u: %m",
1775                                                                         openLogId, openLogSeg, startoffset)));
1776                                 openLogOff = startoffset;
1777                         }
1778
1779                         /* OK to write the page(s) */
1780                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
1781                         nbytes = npages * (Size) XLOG_BLCKSZ;
                             /* clear errno so a short write that leaves it unset can be recognized below */
1782                         errno = 0;
1783                         if (write(openLogFile, from, nbytes) != nbytes)
1784                         {
1785                                 /* if write didn't set errno, assume no disk space */
1786                                 if (errno == 0)
1787                                         errno = ENOSPC;
1788                                 ereport(PANIC,
1789                                                 (errcode_for_file_access(),
1790                                                  errmsg("could not write to log file %u, segment %u "
1791                                                                 "at offset %u, length %lu: %m",
1792                                                                 openLogId, openLogSeg,
1793                                                                 openLogOff, (unsigned long) nbytes)));
1794                         }
1795
1796                         /* Update state for write */
1797                         openLogOff += nbytes;
                             /* a partially requested page stays the current write slot; otherwise move on */
1798                         Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
1799                         npages = 0;
1800
1801                         /*
1802                          * If we just wrote the whole last page of a logfile segment,
1803                          * fsync the segment immediately.  This avoids having to go back
1804                          * and re-open prior segments when an fsync request comes along
1805                          * later. Doing it here ensures that one and only one backend will
1806                          * perform this fsync.
1807                          *
1808                          * We also do this if this is the last page written for an xlog
1809                          * switch.
1810                          *
1811                          * This is also the right place to notify the Archiver that the
1812                          * segment is ready to copy to archival storage, and to update the
1813                          * timer for archive_timeout, and to signal for a checkpoint if
1814                          * too many logfile segments have been used since the last
1815                          * checkpoint.
1816                          */
1817                         if (finishing_seg || (xlog_switch && last_iteration))
1818                         {
1819                                 issue_xlog_fsync(openLogFile, openLogId, openLogSeg);
1820                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
1821
1822                                 if (XLogArchivingActive())
1823                                         XLogArchiveNotifySeg(openLogId, openLogSeg);
1824
1825                                 Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1826
1827                                 /*
1828                                  * Request a checkpoint if we've consumed too
1829                                  * much xlog since the last one.  For speed, we first check
1830                                  * using the local copy of RedoRecPtr, which might be out of
1831                                  * date; if it looks like a checkpoint is needed, forcibly
1832                                  * update RedoRecPtr and recheck.
1833                                  */
1834                                 if (IsUnderPostmaster &&
1835                                         XLogCheckpointNeeded(openLogId, openLogSeg))
1836                                 {
1837                                         (void) GetRedoRecPtr();
1838                                         if (XLogCheckpointNeeded(openLogId, openLogSeg))
1839                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1840                                 }
1841                         }
1842                 }
1843
1844                 if (ispartialpage)
1845                 {
1846                         /* Only asked to write a partial page */
                             /* report only what was requested, not the whole page end */
1847                         LogwrtResult.Write = WriteRqst.Write;
1848                         break;
1849                 }
1850                 curridx = NextBufIdx(curridx);
1851
1852                 /* If flexible, break out of loop as soon as we wrote something */
                     /* (npages is reset to 0 above only when the pending set was dumped) */
1853                 if (flexible && npages == 0)
1854                         break;
1855         }
1856
             /* nothing may be left pending, and curridx must agree with the shared write slot */
1857         Assert(npages == 0);
1858         Assert(curridx == Write->curridx);
1859
1860         /*
1861          * If asked to flush, do so
              *
              * A flush is performed only when the caller's flush target lies
              * beyond what is already flushed AND something has been written
              * beyond the current flush point.
1862          */
1863         if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
1864                 XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1865         {
1866                 /*
1867                  * Could get here without iterating above loop, in which case we might
1868                  * have no open file or the wrong one.  However, we do not need to
1869                  * fsync more than one file.
                      *
                      * No explicit fsync is issued for the two "open" sync methods;
                      * presumably the file is then opened with a synchronous-write
                      * flag so the data is already durable -- confirm against the
                      * sync_method handling elsewhere.
1870                  */
1871                 if (sync_method != SYNC_METHOD_OPEN &&
1872                         sync_method != SYNC_METHOD_OPEN_DSYNC)
1873                 {
1874                         if (openLogFile >= 0 &&
1875                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1876                                 XLogFileClose();
1877                         if (openLogFile < 0)
1878                         {
1879                                 XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1880                                 openLogFile = XLogFileOpen(openLogId, openLogSeg);
1881                                 openLogOff = 0;
1882                         }
1883                         issue_xlog_fsync(openLogFile, openLogId, openLogSeg);
1884                 }
1885                 LogwrtResult.Flush = LogwrtResult.Write;
1886         }
1887
1888         /*
1889          * Update shared-memory status
1890          *
1891          * We make sure that the shared 'request' values do not fall behind the
1892          * 'result' values.  This is not absolutely essential, but it saves some
1893          * code in a couple of places.
1894          */
1895         {
1896                 /* use volatile pointer to prevent code rearrangement */
1897                 volatile XLogCtlData *xlogctl = XLogCtl;
1898
1899                 SpinLockAcquire(&xlogctl->info_lck);
1900                 xlogctl->LogwrtResult = LogwrtResult;
1901                 if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1902                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1903                 if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1904                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1905                 SpinLockRelease(&xlogctl->info_lck);
1906         }
1907 }
1908
1909 /*
1910  * Record the LSN for an asynchronous transaction commit/abort
1911  * and nudge the WALWriter if there is a complete page to write.
1912  * (This should not be called for for synchronous commits.)
1913  */
1914 void
1915 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
1916 {
1917         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
1918
1919         /* use volatile pointer to prevent code rearrangement */
1920         volatile XLogCtlData *xlogctl = XLogCtl;
1921
1922         SpinLockAcquire(&xlogctl->info_lck);
1923         LogwrtResult = xlogctl->LogwrtResult;
1924         if (XLByteLT(xlogctl->asyncXactLSN, asyncXactLSN))
1925                 xlogctl->asyncXactLSN = asyncXactLSN;
1926         SpinLockRelease(&xlogctl->info_lck);
1927
1928         /* back off to last completed page boundary */
1929         WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
1930
1931         /* if we have already flushed that far, we're done */
1932         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
1933                 return;
1934
1935         /*
1936          * Nudge the WALWriter if we have a full page of WAL to write.
1937          */
1938         SetLatch(&XLogCtl->WALWriterLatch);
1939 }
1940
1941 /*
1942  * Advance minRecoveryPoint in control file.
1943  *
1944  * If we crash during recovery, we must reach this point again before the
1945  * database is consistent.
1946  *
1947  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
1948  * is only updated if it's not already greater than or equal to 'lsn'.
1949  */
1950 static void
1951 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
1952 {
1953         /* Quick check using our local copy of the variable */
1954         if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
1955                 return;
1956
1957         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
1958
1959         /* update local copy */
1960         minRecoveryPoint = ControlFile->minRecoveryPoint;
1961
1962         /*
1963          * An invalid minRecoveryPoint means that we need to recover all the WAL,
1964          * i.e., we're doing crash recovery.  We never modify the control file's
1965          * value in that case, so we can short-circuit future checks here too.
1966          */
1967         if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
1968                 updateMinRecoveryPoint = false;
1969         else if (force || XLByteLT(minRecoveryPoint, lsn))
1970         {
1971                 /* use volatile pointer to prevent code rearrangement */
1972                 volatile XLogCtlData *xlogctl = XLogCtl;
1973                 XLogRecPtr      newMinRecoveryPoint;
1974
1975                 /*
1976                  * To avoid having to update the control file too often, we update it
1977                  * all the way to the last record being replayed, even though 'lsn'
1978                  * would suffice for correctness.  This also allows the 'force' case
1979                  * to not need a valid 'lsn' value.
1980                  *
1981                  * Another important reason for doing it this way is that the passed
1982                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
1983                  * the caller got it from a corrupted heap page.  Accepting such a
1984                  * value as the min recovery point would prevent us from coming up at
1985                  * all.  Instead, we just log a warning and continue with recovery.
1986                  * (See also the comments about corrupt LSNs in XLogFlush.)
1987                  */
1988                 SpinLockAcquire(&xlogctl->info_lck);
1989                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
1990                 SpinLockRelease(&xlogctl->info_lck);
1991
1992                 if (!force && XLByteLT(newMinRecoveryPoint, lsn))
1993                         elog(WARNING,
1994                            "xlog min recovery request %X/%X is past current point %X/%X",
1995                                  lsn.xlogid, lsn.xrecoff,
1996                                  newMinRecoveryPoint.xlogid, newMinRecoveryPoint.xrecoff);
1997
1998                 /* update control file */
1999                 if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
2000                 {
2001                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2002                         UpdateControlFile();
2003                         minRecoveryPoint = newMinRecoveryPoint;
2004
2005                         ereport(DEBUG2,
2006                                         (errmsg("updated min recovery point to %X/%X",
2007                                                 minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
2008                 }
2009         }
2010         LWLockRelease(ControlFileLock);
2011 }
2012
2013 /*
2014  * Ensure that all XLOG data through the given position is flushed to disk.
2015  *
2016  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2017  * already held, and we try to avoid acquiring it if possible.
2018  */
2019 void
2020 XLogFlush(XLogRecPtr record)
2021 {
2022         XLogRecPtr      WriteRqstPtr;
2023         XLogwrtRqst WriteRqst;
2024
2025         /*
2026          * During REDO, we are reading not writing WAL.  Therefore, instead of
2027          * trying to flush the WAL, we should update minRecoveryPoint instead. We
2028          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2029          * to act this way too, and because when it tries to write the
2030          * end-of-recovery checkpoint, it should indeed flush.
2031          */
2032         if (!XLogInsertAllowed())
2033         {
2034                 UpdateMinRecoveryPoint(record, false);
2035                 return;
2036         }
2037
2038         /* Quick exit if already known flushed */
2039         if (XLByteLE(record, LogwrtResult.Flush))
2040                 return;
2041
2042 #ifdef WAL_DEBUG
2043         if (XLOG_DEBUG)
2044                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2045                          record.xlogid, record.xrecoff,
2046                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
2047                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
2048 #endif
2049
2050         START_CRIT_SECTION();
2051
2052         /*
2053          * Since fsync is usually a horribly expensive operation, we try to
2054          * piggyback as much data as we can on each fsync: if we see any more data
2055          * entered into the xlog buffer, we'll write and fsync that too, so that
2056          * the final value of LogwrtResult.Flush is as large as possible. This
2057          * gives us some chance of avoiding another fsync immediately after.
2058          */
2059
2060         /* initialize to given target; may increase below */
2061         WriteRqstPtr = record;
2062
2063         /*
2064          * Now wait until we get the write lock, or someone else does the
2065          * flush for us.
2066          */
2067         for (;;)
2068         {
2069                 /* use volatile pointer to prevent code rearrangement */
2070                 volatile XLogCtlData *xlogctl = XLogCtl;
2071
2072                 /* read LogwrtResult and update local state */
2073                 SpinLockAcquire(&xlogctl->info_lck);
2074                 if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
2075                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
2076                 LogwrtResult = xlogctl->LogwrtResult;
2077                 SpinLockRelease(&xlogctl->info_lck);
2078
2079                 /* done already? */
2080                 if (XLByteLE(record, LogwrtResult.Flush))
2081                         break;
2082
2083                 /*
2084                  * Try to get the write lock. If we can't get it immediately, wait
2085                  * until it's released, and recheck if we still need to do the flush
2086                  * or if the backend that held the lock did it for us already. This
2087                  * helps to maintain a good rate of group committing when the system
2088                  * is bottlenecked by the speed of fsyncing.
2089                  */
2090                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2091                 {
2092                         /*
2093                          * The lock is now free, but we didn't acquire it yet. Before we
2094                          * do, loop back to check if someone else flushed the record for
2095                          * us already.
2096                          */
2097                         continue;
2098                 }
2099                 /* Got the lock */
2100                 LogwrtResult = XLogCtl->LogwrtResult;
2101                 if (!XLByteLE(record, LogwrtResult.Flush))
2102                 {
2103                         /* try to write/flush later additions to XLOG as well */
2104                         if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
2105                         {
2106                                 XLogCtlInsert *Insert = &XLogCtl->Insert;
2107                                 uint32          freespace = INSERT_FREESPACE(Insert);
2108
2109                                 if (freespace < SizeOfXLogRecord)               /* buffer is full */
2110                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
2111                                 else
2112                                 {
2113                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
2114                                         WriteRqstPtr.xrecoff -= freespace;
2115                                 }
2116                                 LWLockRelease(WALInsertLock);
2117                                 WriteRqst.Write = WriteRqstPtr;
2118                                 WriteRqst.Flush = WriteRqstPtr;
2119                         }
2120                         else
2121                         {
2122                                 WriteRqst.Write = WriteRqstPtr;
2123                                 WriteRqst.Flush = record;
2124                         }
2125                         XLogWrite(WriteRqst, false, false);
2126                 }
2127                 LWLockRelease(WALWriteLock);
2128                 /* done */
2129                 break;
2130         }
2131
2132         END_CRIT_SECTION();
2133
2134         /*
2135          * If we still haven't flushed to the request point then we have a
2136          * problem; most likely, the requested flush point is past end of XLOG.
2137          * This has been seen to occur when a disk page has a corrupted LSN.
2138          *
2139          * Formerly we treated this as a PANIC condition, but that hurts the
2140          * system's robustness rather than helping it: we do not want to take down
2141          * the whole system due to corruption on one data page.  In particular, if
2142          * the bad page is encountered again during recovery then we would be
2143          * unable to restart the database at all!  (This scenario actually
2144          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
2145          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2146          * the only time we can reach here during recovery is while flushing the
2147          * end-of-recovery checkpoint record, and we don't expect that to have a
2148          * bad LSN.
2149          *
2150          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2151          * since xact.c calls this routine inside a critical section.  However,
2152          * calls from bufmgr.c are not within critical sections and so we will not
2153          * force a restart for a bad LSN on a data page.
2154          */
2155         if (XLByteLT(LogwrtResult.Flush, record))
2156                 elog(ERROR,
2157                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2158                          record.xlogid, record.xrecoff,
2159                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
2160 }
2161
2162 /*
2163  * Flush xlog, but without specifying exactly where to flush to.
2164  *
2165  * We normally flush only completed blocks; but if there is nothing to do on
2166  * that basis, we check for unflushed async commits in the current incomplete
2167  * block, and flush through the latest one of those.  Thus, if async commits
2168  * are not being used, we will flush complete blocks only.      We can guarantee
2169  * that async commits reach disk after at most three cycles; normally only
2170  * one or two.  (We allow XLogWrite to write "flexibly", meaning it can stop
2171  * at the end of the buffer ring; this makes a difference only with very high
2172  * load or long wal_writer_delay, but imposes one extra cycle for the worst
2173  * case for async commits.)
2174  *
2175  * This routine is invoked periodically by the background walwriter process.
2176  */
2177 void
2178 XLogBackgroundFlush(void)
2179 {
2180         XLogRecPtr      WriteRqstPtr;
2181         bool            flexible = true;
2182
2183         /* XLOG doesn't need flushing during recovery */
2184         if (RecoveryInProgress())
2185                 return;
2186
2187         /* read LogwrtResult and update local state */
2188         {
2189                 /* use volatile pointer to prevent code rearrangement */
2190                 volatile XLogCtlData *xlogctl = XLogCtl;
2191
2192                 SpinLockAcquire(&xlogctl->info_lck);
2193                 LogwrtResult = xlogctl->LogwrtResult;
2194                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
2195                 SpinLockRelease(&xlogctl->info_lck);
2196         }
2197
2198         /* back off to last completed page boundary */
2199         WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
2200
2201         /* if we have already flushed that far, consider async commit records */
2202         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2203         {
2204                 /* use volatile pointer to prevent code rearrangement */
2205                 volatile XLogCtlData *xlogctl = XLogCtl;
2206
2207                 SpinLockAcquire(&xlogctl->info_lck);
2208                 WriteRqstPtr = xlogctl->asyncXactLSN;
2209                 SpinLockRelease(&xlogctl->info_lck);
2210                 flexible = false;               /* ensure it all gets written */
2211         }
2212
2213         /*
2214          * If already known flushed, we're done. Just need to check if we are
2215          * holding an open file handle to a logfile that's no longer in use,
2216          * preventing the file from being deleted.
2217          */
2218         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2219         {
2220                 if (openLogFile >= 0)
2221                 {
2222                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
2223                         {
2224                                 XLogFileClose();
2225                         }
2226                 }
2227                 return;
2228         }
2229
2230 #ifdef WAL_DEBUG
2231         if (XLOG_DEBUG)
2232                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
2233                          WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
2234                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
2235                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
2236 #endif
2237
2238         START_CRIT_SECTION();
2239
2240         /* now wait for the write lock */
2241         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2242         LogwrtResult = XLogCtl->LogwrtResult;
2243         if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2244         {
2245                 XLogwrtRqst WriteRqst;
2246
2247                 WriteRqst.Write = WriteRqstPtr;
2248                 WriteRqst.Flush = WriteRqstPtr;
2249                 XLogWrite(WriteRqst, flexible, false);
2250         }
2251         LWLockRelease(WALWriteLock);
2252
2253         END_CRIT_SECTION();
2254 }
2255
2256 /*
2257  * Test whether XLOG data has been flushed up to (at least) the given position.
2258  *
2259  * Returns true if a flush is still needed.  (It may be that someone else
2260  * is already in process of flushing that far, however.)
2261  */
2262 bool
2263 XLogNeedsFlush(XLogRecPtr record)
2264 {
2265         /*
2266          * During recovery, we don't flush WAL but update minRecoveryPoint
2267          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
2268          * would need to be updated.
2269          */
2270         if (RecoveryInProgress())
2271         {
2272                 /* Quick exit if already known updated */
2273                 if (XLByteLE(record, minRecoveryPoint) || !updateMinRecoveryPoint)
2274                         return false;
2275
2276                 /*
2277                  * Update local copy of minRecoveryPoint. But if the lock is busy,
2278                  * just return a conservative guess.
2279                  */
2280                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
2281                         return true;
2282                 minRecoveryPoint = ControlFile->minRecoveryPoint;
2283                 LWLockRelease(ControlFileLock);
2284
2285                 /*
2286                  * An invalid minRecoveryPoint means that we need to recover all the
2287                  * WAL, i.e., we're doing crash recovery.  We never modify the control
2288                  * file's value in that case, so we can short-circuit future checks
2289                  * here too.
2290                  */
2291                 if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
2292                         updateMinRecoveryPoint = false;
2293
2294                 /* check again */
2295                 if (XLByteLE(record, minRecoveryPoint) || !updateMinRecoveryPoint)
2296                         return false;
2297                 else
2298                         return true;
2299         }
2300
2301         /* Quick exit if already known flushed */
2302         if (XLByteLE(record, LogwrtResult.Flush))
2303                 return false;
2304
2305         /* read LogwrtResult and update local state */
2306         {
2307                 /* use volatile pointer to prevent code rearrangement */
2308                 volatile XLogCtlData *xlogctl = XLogCtl;
2309
2310                 SpinLockAcquire(&xlogctl->info_lck);
2311                 LogwrtResult = xlogctl->LogwrtResult;
2312                 SpinLockRelease(&xlogctl->info_lck);
2313         }
2314
2315         /* check again */
2316         if (XLByteLE(record, LogwrtResult.Flush))
2317                 return false;
2318
2319         return true;
2320 }
2321
2322 /*
2323  * Create a new XLOG file segment, or open a pre-existing one.
2324  *
2325  * log, seg: identify segment to be created/opened.
2326  *
2327  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
2328  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
2329  * file was used.
2330  *
2331  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2332  * place.  This should be TRUE except during bootstrap log creation.  The
2333  * caller must *not* hold the lock at call.
2334  *
2335  * Returns FD of opened file.
2336  *
2337  * Note: errors here are ERROR not PANIC because we might or might not be
2338  * inside a critical section (eg, during checkpoint there is no reason to
2339  * take down the system on failure).  They will promote to PANIC if we are
2340  * in a critical section.
2341  */
2342 int
2343 XLogFileInit(uint32 log, uint32 seg,
2344                          bool *use_existent, bool use_lock)
2345 {
2346         char            path[MAXPGPATH];
2347         char            tmppath[MAXPGPATH];
2348         char       *zbuffer;
2349         uint32          installed_log;
2350         uint32          installed_seg;
2351         int                     max_advance;
2352         int                     fd;
2353         int                     nbytes;
2354
2355         XLogFilePath(path, ThisTimeLineID, log, seg);
2356
2357         /*
2358          * Try to use existent file (checkpoint maker may have created it already)
2359          */
2360         if (*use_existent)
2361         {
2362                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2363                                                    S_IRUSR | S_IWUSR);
2364                 if (fd < 0)
2365                 {
2366                         if (errno != ENOENT)
2367                                 ereport(ERROR,
2368                                                 (errcode_for_file_access(),
2369                                                  errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2370                                                                 path, log, seg)));
2371                 }
2372                 else
2373                         return fd;
2374         }
2375
2376         /*
2377          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
2378          * another process is doing the same thing.  If so, we will end up
2379          * pre-creating an extra log segment.  That seems OK, and better than
2380          * holding the lock throughout this lengthy process.
2381          */
2382         elog(DEBUG2, "creating and filling new WAL file");
2383
2384         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2385
2386         unlink(tmppath);
2387
2388         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2389         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2390                                            S_IRUSR | S_IWUSR);
2391         if (fd < 0)
2392                 ereport(ERROR,
2393                                 (errcode_for_file_access(),
2394                                  errmsg("could not create file \"%s\": %m", tmppath)));
2395
2396         /*
2397          * Zero-fill the file.  We have to do this the hard way to ensure that all
2398          * the file space has really been allocated --- on platforms that allow
2399          * "holes" in files, just seeking to the end doesn't allocate intermediate
2400          * space.  This way, we know that we have all the space and (after the
2401          * fsync below) that all the indirect blocks are down on disk.  Therefore,
2402          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
2403          * log file.
2404          *
2405          * Note: palloc zbuffer, instead of just using a local char array, to
2406          * ensure it is reasonably well-aligned; this may save a few cycles
2407          * transferring data to the kernel.
2408          */
2409         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
2410         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2411         {
2412                 errno = 0;
2413                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
2414                 {
2415                         int                     save_errno = errno;
2416
2417                         /*
2418                          * If we fail to make the file, delete it to release disk space
2419                          */
2420                         unlink(tmppath);
2421                         /* if write didn't set errno, assume problem is no disk space */
2422                         errno = save_errno ? save_errno : ENOSPC;
2423
2424                         ereport(ERROR,
2425                                         (errcode_for_file_access(),
2426                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2427                 }
2428         }
2429         pfree(zbuffer);
2430
2431         if (pg_fsync(fd) != 0)
2432                 ereport(ERROR,
2433                                 (errcode_for_file_access(),
2434                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2435
2436         if (close(fd))
2437                 ereport(ERROR,
2438                                 (errcode_for_file_access(),
2439                                  errmsg("could not close file \"%s\": %m", tmppath)));
2440
2441         /*
2442          * Now move the segment into place with its final name.
2443          *
2444          * If caller didn't want to use a pre-existing file, get rid of any
2445          * pre-existing file.  Otherwise, cope with possibility that someone else
2446          * has created the file while we were filling ours: if so, use ours to
2447          * pre-create a future log segment.
2448          */
2449         installed_log = log;
2450         installed_seg = seg;
2451         max_advance = XLOGfileslop;
2452         if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
2453                                                                 *use_existent, &max_advance,
2454                                                                 use_lock))
2455         {
2456                 /*
2457                  * No need for any more future segments, or InstallXLogFileSegment()
2458                  * failed to rename the file into place. If the rename failed, opening
2459                  * the file below will fail.
2460                  */
2461                 unlink(tmppath);
2462         }
2463
2464         /* Set flag to tell caller there was no existent file */
2465         *use_existent = false;
2466
2467         /* Now open original target segment (might not be file I just made) */
2468         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2469                                            S_IRUSR | S_IWUSR);
2470         if (fd < 0)
2471                 ereport(ERROR,
2472                                 (errcode_for_file_access(),
2473                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2474                                   path, log, seg)));
2475
2476         elog(DEBUG2, "done creating and filling new WAL file");
2477
2478         return fd;
2479 }
2480
2481 /*
2482  * Create a new XLOG file segment by copying a pre-existing one.
2483  *
2484  * log, seg: identify segment to be created.
2485  *
2486  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
2487  *              a different timeline)
2488  *
2489  * Currently this is only used during recovery, and so there are no locking
2490  * considerations.      But we should be just as tense as XLogFileInit to avoid
2491  * emplacing a bogus file.
2492  */
2493 static void
2494 XLogFileCopy(uint32 log, uint32 seg,
2495                          TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
2496 {
2497         char            path[MAXPGPATH];
2498         char            tmppath[MAXPGPATH];
2499         char            buffer[XLOG_BLCKSZ];
2500         int                     srcfd;
2501         int                     fd;
2502         int                     nbytes;
2503
2504         /*
2505          * Open the source file
2506          */
2507         XLogFilePath(path, srcTLI, srclog, srcseg);
2508         srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2509         if (srcfd < 0)
2510                 ereport(ERROR,
2511                                 (errcode_for_file_access(),
2512                                  errmsg("could not open file \"%s\": %m", path)));
2513
2514         /*
2515          * Copy into a temp file name.
2516          */
2517         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2518
2519         unlink(tmppath);
2520
2521         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2522         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2523                                            S_IRUSR | S_IWUSR);
2524         if (fd < 0)
2525                 ereport(ERROR,
2526                                 (errcode_for_file_access(),
2527                                  errmsg("could not create file \"%s\": %m", tmppath)));
2528
2529         /*
2530          * Do the data copying.
2531          */
2532         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
2533         {
2534                 errno = 0;
2535                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2536                 {
2537                         if (errno != 0)
2538                                 ereport(ERROR,
2539                                                 (errcode_for_file_access(),
2540                                                  errmsg("could not read file \"%s\": %m", path)));
2541                         else
2542                                 ereport(ERROR,
2543                                                 (errmsg("not enough data in file \"%s\"", path)));
2544                 }
2545                 errno = 0;
2546                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2547                 {
2548                         int                     save_errno = errno;
2549
2550                         /*
2551                          * If we fail to make the file, delete it to release disk space
2552                          */
2553                         unlink(tmppath);
2554                         /* if write didn't set errno, assume problem is no disk space */
2555                         errno = save_errno ? save_errno : ENOSPC;
2556
2557                         ereport(ERROR,
2558                                         (errcode_for_file_access(),
2559                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2560                 }
2561         }
2562
2563         if (pg_fsync(fd) != 0)
2564                 ereport(ERROR,
2565                                 (errcode_for_file_access(),
2566                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2567
2568         if (close(fd))
2569                 ereport(ERROR,
2570                                 (errcode_for_file_access(),
2571                                  errmsg("could not close file \"%s\": %m", tmppath)));
2572
2573         close(srcfd);
2574
2575         /*
2576          * Now move the segment into place with its final name.
2577          */
2578         if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false))
2579                 elog(ERROR, "InstallXLogFileSegment should not have failed");
2580 }
2581
2582 /*
2583  * Install a new XLOG segment file as a current or future log segment.
2584  *
2585  * This is used both to install a newly-created segment (which has a temp
2586  * filename while it's being created) and to recycle an old segment.
2587  *
2588  * *log, *seg: identify segment to install as (or first possible target).
2589  * When find_free is TRUE, these are modified on return to indicate the
2590  * actual installation location or last segment searched.
2591  *
2592  * tmppath: initial name of file to install.  It will be renamed into place.
2593  *
2594  * find_free: if TRUE, install the new segment at the first empty log/seg
2595  * number at or after the passed numbers.  If FALSE, install the new segment
2596  * exactly where specified, deleting any existing segment file there.
2597  *
2598  * *max_advance: maximum number of log/seg slots to advance past the starting
2599  * point.  Fail if no free slot is found in this range.  On return, reduced
2600  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
2601  * when find_free is FALSE.)
2602  *
2603  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2604  * place.  This should be TRUE except during bootstrap log creation.  The
2605  * caller must *not* hold the lock at call.
2606  *
2607  * Returns TRUE if the file was installed successfully.  FALSE indicates that
2608  * max_advance limit was exceeded, or an error occurred while renaming the
2609  * file into place.
2610  */
2611 static bool
2612 InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
2613                                            bool find_free, int *max_advance,
2614                                            bool use_lock)
2615 {
2616         char            path[MAXPGPATH];
2617         struct stat stat_buf;
2618
2619         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2620
2621         /*
2622          * We want to be sure that only one process does this at a time.
2623          */
2624         if (use_lock)
2625                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2626
2627         if (!find_free)
2628         {
2629                 /* Force installation: get rid of any pre-existing segment file */
2630                 unlink(path);
2631         }
2632         else
2633         {
2634                 /* Find a free slot to put it in */
2635                 while (stat(path, &stat_buf) == 0)
2636                 {
2637                         if (*max_advance <= 0)
2638                         {
2639                                 /* Failed to find a free slot within specified range */
2640                                 if (use_lock)
2641                                         LWLockRelease(ControlFileLock);
2642                                 return false;
2643                         }
2644                         NextLogSeg(*log, *seg);
2645                         (*max_advance)--;
2646                         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2647                 }
2648         }
2649
2650         /*
2651          * Prefer link() to rename() here just to be really sure that we don't
2652          * overwrite an existing logfile.  However, there shouldn't be one, so
2653          * rename() is an acceptable substitute except for the truly paranoid.
2654          */
2655 #if HAVE_WORKING_LINK
2656         if (link(tmppath, path) < 0)
2657         {
2658                 if (use_lock)
2659                         LWLockRelease(ControlFileLock);
2660                 ereport(LOG,
2661                                 (errcode_for_file_access(),
2662                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2663                                                 tmppath, path, *log, *seg)));
2664                 return false;
2665         }
2666         unlink(tmppath);
2667 #else
2668         if (rename(tmppath, path) < 0)
2669         {
2670                 if (use_lock)
2671                         LWLockRelease(ControlFileLock);
2672                 ereport(LOG,
2673                                 (errcode_for_file_access(),
2674                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2675                                                 tmppath, path, *log, *seg)));
2676                 return false;
2677         }
2678 #endif
2679
2680         if (use_lock)
2681                 LWLockRelease(ControlFileLock);
2682
2683         return true;
2684 }
2685
2686 /*
2687  * Open a pre-existing logfile segment for writing.
2688  */
2689 int
2690 XLogFileOpen(uint32 log, uint32 seg)
2691 {
2692         char            path[MAXPGPATH];
2693         int                     fd;
2694
2695         XLogFilePath(path, ThisTimeLineID, log, seg);
2696
2697         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2698                                            S_IRUSR | S_IWUSR);
2699         if (fd < 0)
2700                 ereport(PANIC,
2701                                 (errcode_for_file_access(),
2702                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2703                                   path, log, seg)));
2704
2705         return fd;
2706 }
2707
2708 /*
2709  * Open a logfile segment for reading (during recovery).
2710  *
2711  * If source = XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
2712  * Otherwise, it's assumed to be already available in pg_xlog.
2713  */
2714 static int
2715 XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
2716                          int source, bool notfoundOk)
2717 {
2718         char            xlogfname[MAXFNAMELEN];
2719         char            activitymsg[MAXFNAMELEN + 16];
2720         char            path[MAXPGPATH];
2721         int                     fd;
2722
2723         XLogFileName(xlogfname, tli, log, seg);
2724
2725         switch (source)
2726         {
2727                 case XLOG_FROM_ARCHIVE:
2728                         /* Report recovery progress in PS display */
2729                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
2730                                          xlogfname);
2731                         set_ps_display(activitymsg, false);
2732
2733                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
2734                                                                                                           "RECOVERYXLOG",
2735                                                                                                           XLogSegSize);
2736                         if (!restoredFromArchive)
2737                                 return -1;
2738                         break;
2739
2740                 case XLOG_FROM_PG_XLOG:
2741                 case XLOG_FROM_STREAM:
2742                         XLogFilePath(path, tli, log, seg);
2743                         restoredFromArchive = false;
2744                         break;
2745
2746                 default:
2747                         elog(ERROR, "invalid XLogFileRead source %d", source);
2748         }
2749
2750         /*
2751          * If the segment was fetched from archival storage, replace
2752          * the existing xlog segment (if any) with the archival version.
2753          */
2754         if (source == XLOG_FROM_ARCHIVE)
2755         {
2756                 /* use volatile pointer to prevent code rearrangement */
2757                 volatile XLogCtlData *xlogctl = XLogCtl;
2758                 XLogRecPtr              endptr;
2759                 char                    xlogfpath[MAXPGPATH];
2760                 bool                    reload = false;
2761                 struct stat             statbuf;
2762
2763                 XLogFilePath(xlogfpath, tli, log, seg);
2764                 if (stat(xlogfpath, &statbuf) == 0)
2765                 {
2766                         if (unlink(xlogfpath) != 0)
2767                                 ereport(FATAL,
2768                                                 (errcode_for_file_access(),
2769                                                  errmsg("could not remove file \"%s\": %m",
2770                                                                 xlogfpath)));
2771                         reload = true;
2772                 }
2773
2774                 if (rename(path, xlogfpath) < 0)
2775                         ereport(ERROR,
2776                                 (errcode_for_file_access(),
2777                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
2778                                                 path, xlogfpath)));
2779
2780                 /*
2781                  * If the existing segment was replaced, since walsenders might have
2782                  * it open, request them to reload a currently-open segment.
2783                  */
2784                 if (reload)
2785                         WalSndRqstFileReload();
2786
2787                 /*
2788                  * Calculate the end location of the restored WAL file and save it in
2789                  * shmem. It's used as current standby flush position, and cascading
2790                  * walsenders try to send WAL records up to this location.
2791                  */
2792                 endptr.xlogid = log;
2793                 endptr.xrecoff = seg * XLogSegSize;
2794                 XLByteAdvance(endptr, XLogSegSize);
2795
2796                 SpinLockAcquire(&xlogctl->info_lck);
2797                 xlogctl->restoreLastRecPtr = endptr;
2798                 SpinLockRelease(&xlogctl->info_lck);
2799
2800                 /* Signal walsender that new WAL has arrived */
2801                 if (AllowCascadeReplication())
2802                         WalSndWakeup();
2803         }
2804
2805         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2806         if (fd >= 0)
2807         {
2808                 /* Success! */
2809                 curFileTLI = tli;
2810
2811                 /* Report recovery progress in PS display */
2812                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
2813                                  xlogfname);
2814                 set_ps_display(activitymsg, false);
2815
2816                 /* Track source of data in assorted state variables */
2817                 readSource = source;
2818                 XLogReceiptSource = source;
2819                 /* In FROM_STREAM case, caller tracks receipt time, not me */
2820                 if (source != XLOG_FROM_STREAM)
2821                         XLogReceiptTime = GetCurrentTimestamp();
2822
2823                 return fd;
2824         }
2825         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
2826                 ereport(PANIC,
2827                                 (errcode_for_file_access(),
2828                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2829                                   path, log, seg)));
2830         return -1;
2831 }
2832
2833 /*
2834  * Open a logfile segment for reading (during recovery).
2835  *
2836  * This version searches for the segment with any TLI listed in expectedTLIs.
2837  */
2838 static int
2839 XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode, int sources)
2840 {
2841         char            path[MAXPGPATH];
2842         ListCell   *cell;
2843         int                     fd;
2844
2845         /*
2846          * Loop looking for a suitable timeline ID: we might need to read any of
2847          * the timelines listed in expectedTLIs.
2848          *
2849          * We expect curFileTLI on entry to be the TLI of the preceding file in
2850          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
2851          * to go backwards; this prevents us from picking up the wrong file when a
2852          * parent timeline extends to higher segment numbers than the child we
2853          * want to read.
2854          */
2855         foreach(cell, expectedTLIs)
2856         {
2857                 TimeLineID      tli = (TimeLineID) lfirst_int(cell);
2858
2859                 if (tli < curFileTLI)
2860                         break;                          /* don't bother looking at too-old TLIs */
2861
2862                 if (sources & XLOG_FROM_ARCHIVE)
2863                 {
2864                         fd = XLogFileRead(log, seg, emode, tli, XLOG_FROM_ARCHIVE, true);
2865                         if (fd != -1)
2866                         {
2867                                 elog(DEBUG1, "got WAL segment from archive");
2868                                 return fd;
2869                         }
2870                 }
2871
2872                 if (sources & XLOG_FROM_PG_XLOG)
2873                 {
2874                         fd = XLogFileRead(log, seg, emode, tli, XLOG_FROM_PG_XLOG, true);
2875                         if (fd != -1)
2876                                 return fd;
2877                 }
2878         }
2879
2880         /* Couldn't find it.  For simplicity, complain about front timeline */
2881         XLogFilePath(path, recoveryTargetTLI, log, seg);
2882         errno = ENOENT;
2883         ereport(emode,
2884                         (errcode_for_file_access(),
2885                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2886                                   path, log, seg)));
2887         return -1;
2888 }
2889
2890 /*
2891  * Close the current logfile segment for writing.
2892  */
2893 static void
2894 XLogFileClose(void)
2895 {
2896         Assert(openLogFile >= 0);
2897
2898         /*
2899          * WAL segment files will not be re-read in normal operation, so we advise
2900          * the OS to release any cached pages.  But do not do so if WAL archiving
2901          * or streaming is active, because archiver and walsender process could
2902          * use the cache to read the WAL segment.
2903          */
2904 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2905         if (!XLogIsNeeded())
2906                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2907 #endif
2908
2909         if (close(openLogFile))
2910                 ereport(PANIC,
2911                                 (errcode_for_file_access(),
2912                                  errmsg("could not close log file %u, segment %u: %m",
2913                                                 openLogId, openLogSeg)));
2914         openLogFile = -1;
2915 }
2916
2917 /*
2918  * Attempt to retrieve the specified file from off-line archival storage.
2919  * If successful, fill "path" with its complete path (note that this will be
2920  * a temp file name that doesn't follow the normal naming convention), and
2921  * return TRUE.
2922  *
2923  * If not successful, fill "path" with the name of the normal on-line file
2924  * (which may or may not actually exist, but we'll try to use it), and return
2925  * FALSE.
2926  *
2927  * For fixed-size files, the caller may pass the expected size as an
2928  * additional crosscheck on successful recovery.  If the file size is not
2929  * known, set expectedSize = 0.
2930  */
2931 static bool
2932 RestoreArchivedFile(char *path, const char *xlogfname,
2933                                         const char *recovername, off_t expectedSize)
2934 {
2935         char            xlogpath[MAXPGPATH];
2936         char            xlogRestoreCmd[MAXPGPATH];
2937         char            lastRestartPointFname[MAXPGPATH];
2938         char       *dp;
2939         char       *endp;
2940         const char *sp;
2941         int                     rc;
2942         bool            signaled;
2943         struct stat stat_buf;
2944         uint32          restartLog;
2945         uint32          restartSeg;
2946
2947         /* In standby mode, restore_command might not be supplied */
2948         if (recoveryRestoreCommand == NULL)
2949                 goto not_available;
2950
2951         /*
2952          * When doing archive recovery, we always prefer an archived log file even
2953          * if a file of the same name exists in XLOGDIR.  The reason is that the
2954          * file in XLOGDIR could be an old, un-filled or partly-filled version
2955          * that was copied and restored as part of backing up $PGDATA.
2956          *
2957          * We could try to optimize this slightly by checking the local copy
2958          * lastchange timestamp against the archived copy, but we have no API to
2959          * do this, nor can we guarantee that the lastchange timestamp was
2960          * preserved correctly when we copied to archive. Our aim is robustness,
2961          * so we elect not to do this.
2962          *
2963          * If we cannot obtain the log file from the archive, however, we will try
2964          * to use the XLOGDIR file if it exists.  This is so that we can make use
2965          * of log segments that weren't yet transferred to the archive.
2966          *
2967          * Notice that we don't actually overwrite any files when we copy back
2968          * from archive because the recoveryRestoreCommand may inadvertently
2969          * restore inappropriate xlogs, or they may be corrupt, so we may wish to
2970          * fallback to the segments remaining in current XLOGDIR later. The
2971          * copy-from-archive filename is always the same, ensuring that we don't
2972          * run out of disk space on long recoveries.
2973          */
2974         snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
2975
2976         /*
2977          * Make sure there is no existing file named recovername.
2978          */
2979         if (stat(xlogpath, &stat_buf) != 0)
2980         {
2981                 if (errno != ENOENT)
2982                         ereport(FATAL,
2983                                         (errcode_for_file_access(),
2984                                          errmsg("could not stat file \"%s\": %m",
2985                                                         xlogpath)));
2986         }
2987         else
2988         {
2989                 if (unlink(xlogpath) != 0)
2990                         ereport(FATAL,
2991                                         (errcode_for_file_access(),
2992                                          errmsg("could not remove file \"%s\": %m",
2993                                                         xlogpath)));
2994         }
2995
2996         /*
2997          * Calculate the archive file cutoff point for use during log shipping
2998          * replication. All files earlier than this point can be deleted from the
2999          * archive, though there is no requirement to do so.
3000          *
3001          * We initialise this with the filename of an InvalidXLogRecPtr, which
3002          * will prevent the deletion of any WAL files from the archive because of
3003          * the alphabetic sorting property of WAL filenames.
3004          *
3005          * Once we have successfully located the redo pointer of the checkpoint
3006          * from which we start recovery we never request a file prior to the redo
3007          * pointer of the last restartpoint. When redo begins we know that we have
3008          * successfully located it, so there is no need for additional status
3009          * flags to signify the point when we can begin deleting WAL files from
3010          * the archive.
3011          */
3012         if (InRedo)
3013         {
3014                 XLByteToSeg(ControlFile->checkPointCopy.redo,
3015                                         restartLog, restartSeg);
3016                 XLogFileName(lastRestartPointFname,
3017                                          ControlFile->checkPointCopy.ThisTimeLineID,
3018                                          restartLog, restartSeg);
3019                 /* we shouldn't need anything earlier than last restart point */
3020                 Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
3021         }
3022         else
3023                 XLogFileName(lastRestartPointFname, 0, 0, 0);
3024
3025         /*
3026          * construct the command to be executed
3027          */
3028         dp = xlogRestoreCmd;
3029         endp = xlogRestoreCmd + MAXPGPATH - 1;
3030         *endp = '\0';
3031
3032         for (sp = recoveryRestoreCommand; *sp; sp++)
3033         {
3034                 if (*sp == '%')
3035                 {
3036                         switch (sp[1])
3037                         {
3038                                 case 'p':
3039                                         /* %p: relative path of target file */
3040                                         sp++;
3041                                         StrNCpy(dp, xlogpath, endp - dp);
3042                                         make_native_path(dp);
3043                                         dp += strlen(dp);
3044                                         break;
3045                                 case 'f':
3046                                         /* %f: filename of desired file */
3047                                         sp++;
3048                                         StrNCpy(dp, xlogfname, endp - dp);
3049                                         dp += strlen(dp);
3050                                         break;
3051                                 case 'r':
3052                                         /* %r: filename of last restartpoint */
3053                                         sp++;
3054                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
3055                                         dp += strlen(dp);
3056                                         break;
3057                                 case '%':
3058                                         /* convert %% to a single % */
3059                                         sp++;
3060                                         if (dp < endp)
3061                                                 *dp++ = *sp;
3062                                         break;
3063                                 default:
3064                                         /* otherwise treat the % as not special */
3065                                         if (dp < endp)
3066                                                 *dp++ = *sp;
3067                                         break;
3068                         }
3069                 }
3070                 else
3071                 {
3072                         if (dp < endp)
3073                                 *dp++ = *sp;
3074                 }
3075         }
3076         *dp = '\0';
3077
3078         ereport(DEBUG3,
3079                         (errmsg_internal("executing restore command \"%s\"",
3080                                                          xlogRestoreCmd)));
3081
3082         /*
3083          * Check signals before restore command and reset afterwards.
3084          */
3085         PreRestoreCommand();
3086
3087         /*
3088          * Copy xlog from archival storage to XLOGDIR
3089          */
3090         rc = system(xlogRestoreCmd);
3091
3092         PostRestoreCommand();
3093
3094         if (rc == 0)
3095         {
3096                 /*
3097                  * command apparently succeeded, but let's make sure the file is
3098                  * really there now and has the correct size.
3099                  */
3100                 if (stat(xlogpath, &stat_buf) == 0)
3101                 {
3102                         if (expectedSize > 0 && stat_buf.st_size != expectedSize)
3103                         {
3104                                 int                     elevel;
3105
3106                                 /*
3107                                  * If we find a partial file in standby mode, we assume it's
3108                                  * because it's just being copied to the archive, and keep
3109                                  * trying.
3110                                  *
3111                                  * Otherwise treat a wrong-sized file as FATAL to ensure the
3112                                  * DBA would notice it, but is that too strong? We could try
3113                                  * to plow ahead with a local copy of the file ... but the
3114                                  * problem is that there probably isn't one, and we'd
3115                                  * incorrectly conclude we've reached the end of WAL and we're
3116                                  * done recovering ...
3117                                  */
3118                                 if (StandbyMode && stat_buf.st_size < expectedSize)
3119                                         elevel = DEBUG1;
3120                                 else
3121                                         elevel = FATAL;
3122                                 ereport(elevel,
3123                                                 (errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
3124                                                                 xlogfname,
3125                                                                 (unsigned long) stat_buf.st_size,
3126                                                                 (unsigned long) expectedSize)));
3127                                 return false;
3128                         }
3129                         else
3130                         {
3131                                 ereport(LOG,
3132                                                 (errmsg("restored log file \"%s\" from archive",
3133                                                                 xlogfname)));
3134                                 strcpy(path, xlogpath);
3135                                 return true;
3136                         }
3137                 }
3138                 else
3139                 {
3140                         /* stat failed */
3141                         if (errno != ENOENT)
3142                                 ereport(FATAL,
3143                                                 (errcode_for_file_access(),
3144                                                  errmsg("could not stat file \"%s\": %m",
3145                                                                 xlogpath)));
3146                 }
3147         }
3148
3149         /*
3150          * Remember, we rollforward UNTIL the restore fails so failure here is
3151          * just part of the process... that makes it difficult to determine
3152          * whether the restore failed because there isn't an archive to restore,
3153          * or because the administrator has specified the restore program
3154          * incorrectly.  We have to assume the former.
3155          *
3156          * However, if the failure was due to any sort of signal, it's best to
3157          * punt and abort recovery.  (If we "return false" here, upper levels will
3158          * assume that recovery is complete and start up the database!) It's
3159          * essential to abort on child SIGINT and SIGQUIT, because per spec
3160          * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
3161          * those it's a good bet we should have gotten it too.
3162          *
3163          * On SIGTERM, assume we have received a fast shutdown request, and exit
3164          * cleanly. It's pure chance whether we receive the SIGTERM first, or the
3165          * child process. If we receive it first, the signal handler will call
3166          * proc_exit, otherwise we do it here. If we or the child process received
3167          * SIGTERM for any other reason than a fast shutdown request, postmaster
3168          * will perform an immediate shutdown when it sees us exiting
3169          * unexpectedly.
3170          *
3171          * Per the Single Unix Spec, shells report exit status > 128 when a called
3172          * command died on a signal.  Also, 126 and 127 are used to report
3173          * problems such as an unfindable command; treat those as fatal errors
3174          * too.
3175          */
3176         if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
3177                 proc_exit(1);
3178
3179         signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
3180
3181         ereport(signaled ? FATAL : DEBUG2,
3182                 (errmsg("could not restore file \"%s\" from archive: return code %d",
3183                                 xlogfname, rc)));
3184
3185 not_available:
3186
3187         /*
3188          * if an archived file is not available, there might still be a version of
3189          * this file in XLOGDIR, so return that as the filename to open.
3190          *
3191          * In many recovery scenarios we expect this to fail also, but if so that
3192          * just means we've reached the end of WAL.
3193          */
3194         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3195         return false;
3196 }
3197
3198 /*
3199  * Attempt to execute an external shell command during recovery.
3200  *
3201  * 'command' is the shell command to be executed, 'commandName' is a
3202  * human-readable name describing the command emitted in the logs. If
3203  * 'failOnSignal' is true and the command is killed by a signal, a FATAL
3204  * error is thrown. Otherwise a WARNING is emitted.
3205  *
3206  * This is currently used for recovery_end_command and archive_cleanup_command.
3207  */
3208 static void
3209 ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal)
3210 {
3211         char            xlogRecoveryCmd[MAXPGPATH];
3212         char            lastRestartPointFname[MAXPGPATH];
3213         char       *dp;
3214         char       *endp;
3215         const char *sp;
3216         int                     rc;
3217         bool            signaled;
3218         uint32          restartLog;
3219         uint32          restartSeg;
3220
3221         Assert(command && commandName);
3222
3223         /*
3224          * Calculate the archive file cutoff point for use during log shipping
3225          * replication. All files earlier than this point can be deleted from the
3226          * archive, though there is no requirement to do so.
3227          */
3228         LWLockAcquire(ControlFileLock, LW_SHARED);
3229         XLByteToSeg(ControlFile->checkPointCopy.redo,
3230                                 restartLog, restartSeg);
3231         XLogFileName(lastRestartPointFname,
3232                                  ControlFile->checkPointCopy.ThisTimeLineID,
3233                                  restartLog, restartSeg);
3234         LWLockRelease(ControlFileLock);
3235
3236         /*
3237          * construct the command to be executed
3238          */
3239         dp = xlogRecoveryCmd;
3240         endp = xlogRecoveryCmd + MAXPGPATH - 1;
3241         *endp = '\0';
3242
3243         for (sp = command; *sp; sp++)
3244         {
3245                 if (*sp == '%')
3246                 {
3247                         switch (sp[1])
3248                         {
3249                                 case 'r':
3250                                         /* %r: filename of last restartpoint */
3251                                         sp++;
3252                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
3253                                         dp += strlen(dp);
3254                                         break;
3255                                 case '%':
3256                                         /* convert %% to a single % */
3257                                         sp++;
3258                                         if (dp < endp)
3259                                                 *dp++ = *sp;
3260                                         break;
3261                                 default:
3262                                         /* otherwise treat the % as not special */
3263                                         if (dp < endp)
3264                                                 *dp++ = *sp;
3265                                         break;
3266                         }
3267                 }
3268                 else
3269                 {
3270                         if (dp < endp)
3271                                 *dp++ = *sp;
3272                 }
3273         }
3274         *dp = '\0';
3275
3276         ereport(DEBUG3,
3277                         (errmsg_internal("executing %s \"%s\"", commandName, command)));
3278
3279         /*
3280          * execute the constructed command
3281          */
3282         rc = system(xlogRecoveryCmd);
3283         if (rc != 0)
3284         {
3285                 /*
3286                  * If the failure was due to any sort of signal, it's best to punt and
3287                  * abort recovery. See also detailed comments on signals in
3288                  * RestoreArchivedFile().
3289                  */
3290                 signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
3291
3292                 ereport((signaled && failOnSignal) ? FATAL : WARNING,
3293                 /*------
3294                    translator: First %s represents a recovery.conf parameter name like
3295                   "recovery_end_command", and the 2nd is the value of that parameter. */
3296                                 (errmsg("%s \"%s\": return code %d", commandName,
3297                                                 command, rc)));
3298         }
3299 }
3300
3301 /*
3302  * Preallocate log files beyond the specified log endpoint.
3303  *
3304  * XXX this is currently extremely conservative, since it forces only one
3305  * future log segment to exist, and even that only if we are 75% done with
3306  * the current one.  This is only appropriate for very low-WAL-volume systems.
3307  * High-volume systems will be OK once they've built up a sufficient set of
3308  * recycled log segments, but the startup transient is likely to include
3309  * a lot of segment creations by foreground processes, which is not so good.
3310  */
3311 static void
3312 PreallocXlogFiles(XLogRecPtr endptr)
3313 {
3314         uint32          _logId;
3315         uint32          _logSeg;
3316         int                     lf;
3317         bool            use_existent;
3318
3319         XLByteToPrevSeg(endptr, _logId, _logSeg);
3320         if ((endptr.xrecoff - 1) % XLogSegSize >=
3321                 (uint32) (0.75 * XLogSegSize))
3322         {
3323                 NextLogSeg(_logId, _logSeg);
3324                 use_existent = true;
3325                 lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
3326                 close(lf);
3327                 if (!use_existent)
3328                         CheckpointStats.ckpt_segs_added++;
3329         }
3330 }
3331
3332 /*
3333  * Get the log/seg of the latest removed or recycled WAL segment.
3334  * Returns 0/0 if no WAL segments have been removed since startup.
3335  */
3336 void
3337 XLogGetLastRemoved(uint32 *log, uint32 *seg)
3338 {
3339         /* use volatile pointer to prevent code rearrangement */
3340         volatile XLogCtlData *xlogctl = XLogCtl;
3341
3342         SpinLockAcquire(&xlogctl->info_lck);
3343         *log = xlogctl->lastRemovedLog;
3344         *seg = xlogctl->lastRemovedSeg;
3345         SpinLockRelease(&xlogctl->info_lck);
3346 }
3347
3348 /*
3349  * Update the last removed log/seg pointer in shared memory, to reflect
3350  * that the given XLOG file has been removed.
3351  */
3352 static void
3353 UpdateLastRemovedPtr(char *filename)
3354 {
3355         /* use volatile pointer to prevent code rearrangement */
3356         volatile XLogCtlData *xlogctl = XLogCtl;
3357         uint32          tli,
3358                                 log,
3359                                 seg;
3360
3361         XLogFromFileName(filename, &tli, &log, &seg);
3362
3363         SpinLockAcquire(&xlogctl->info_lck);
3364         if (log > xlogctl->lastRemovedLog ||
3365                 (log == xlogctl->lastRemovedLog && seg > xlogctl->lastRemovedSeg))
3366         {
3367                 xlogctl->lastRemovedLog = log;
3368                 xlogctl->lastRemovedSeg = seg;
3369         }
3370         SpinLockRelease(&xlogctl->info_lck);
3371 }
3372
3373 /*
3374  * Recycle or remove all log files older or equal to passed log/seg#
3375  *
3376  * endptr is current (or recent) end of xlog; this is used to determine
3377  * whether we want to recycle rather than delete no-longer-wanted log files.
3378  */
3379 static void
3380 RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
3381 {
3382         uint32          endlogId;
3383         uint32          endlogSeg;
3384         int                     max_advance;
3385         DIR                *xldir;
3386         struct dirent *xlde;
3387         char            lastoff[MAXFNAMELEN];
3388         char            path[MAXPGPATH];
3389
3390 #ifdef WIN32
3391         char            newpath[MAXPGPATH];
3392 #endif
3393         struct stat statbuf;
3394
3395         /*
3396          * Initialize info about where to try to recycle to.  We allow recycling
3397          * segments up to XLOGfileslop segments beyond the current XLOG location.
3398          */
3399         XLByteToPrevSeg(endptr, endlogId, endlogSeg);
3400         max_advance = XLOGfileslop;
3401
3402         xldir = AllocateDir(XLOGDIR);
3403         if (xldir == NULL)
3404                 ereport(ERROR,
3405                                 (errcode_for_file_access(),
3406                                  errmsg("could not open transaction log directory \"%s\": %m",
3407                                                 XLOGDIR)));
3408
3409         XLogFileName(lastoff, ThisTimeLineID, log, seg);
3410
3411         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3412                  lastoff);
3413
3414         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3415         {
3416                 /*
3417                  * We ignore the timeline part of the XLOG segment identifiers in
3418                  * deciding whether a segment is still needed.  This ensures that we
3419                  * won't prematurely remove a segment from a parent timeline. We could
3420                  * probably be a little more proactive about removing segments of
3421                  * non-parent timelines, but that would be a whole lot more
3422                  * complicated.
3423                  *
3424                  * We use the alphanumeric sorting property of the filenames to decide
3425                  * which ones are earlier than the lastoff segment.
3426                  */
3427                 if (strlen(xlde->d_name) == 24 &&
3428                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3429                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3430                 {
3431                         if (RecoveryInProgress() || XLogArchiveCheckDone(xlde->d_name))
3432                         {
3433                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3434
3435                                 /* Update the last removed location in shared memory first */
3436                                 UpdateLastRemovedPtr(xlde->d_name);
3437
3438                                 /*
3439                                  * Before deleting the file, see if it can be recycled as a
3440                                  * future log segment. Only recycle normal files, pg_standby
3441                                  * for example can create symbolic links pointing to a
3442                                  * separate archive directory.
3443                                  */
3444                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
3445                                         InstallXLogFileSegment(&endlogId, &endlogSeg, path,
3446                                                                                    true, &max_advance, true))
3447                                 {
3448                                         ereport(DEBUG2,
3449                                                         (errmsg("recycled transaction log file \"%s\"",
3450                                                                         xlde->d_name)));
3451                                         CheckpointStats.ckpt_segs_recycled++;
3452                                         /* Needn't recheck that slot on future iterations */
3453                                         if (max_advance > 0)
3454                                         {
3455                                                 NextLogSeg(endlogId, endlogSeg);
3456                                                 max_advance--;
3457                                         }
3458                                 }
3459                                 else
3460                                 {
3461                                         /* No need for any more future segments... */
3462                                         int                     rc;
3463
3464                                         ereport(DEBUG2,
3465                                                         (errmsg("removing transaction log file \"%s\"",
3466                                                                         xlde->d_name)));
3467
3468 #ifdef WIN32
3469
3470                                         /*
3471                                          * On Windows, if another process (e.g another backend)
3472                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
3473                                          * will succeed, but the file will still show up in
3474                                          * directory listing until the last handle is closed. To
3475                                          * avoid confusing the lingering deleted file for a live
3476                                          * WAL file that needs to be archived, rename it before
3477                                          * deleting it.
3478                                          *
3479                                          * If another process holds the file open without
3480                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
3481                                          * again at the next checkpoint.
3482                                          */
3483                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
3484                                         if (rename(path, newpath) != 0)
3485                                         {
3486                                                 ereport(LOG,
3487                                                                 (errcode_for_file_access(),
3488                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
3489                                                                                 path)));
3490                                                 continue;
3491                                         }
3492                                         rc = unlink(newpath);
3493 #else
3494                                         rc = unlink(path);
3495 #endif
3496                                         if (rc != 0)
3497                                         {
3498                                                 ereport(LOG,
3499                                                                 (errcode_for_file_access(),
3500                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
3501                                                                                 path)));
3502                                                 continue;
3503                                         }
3504                                         CheckpointStats.ckpt_segs_removed++;
3505                                 }
3506
3507                                 XLogArchiveCleanup(xlde->d_name);
3508                         }
3509                 }
3510         }
3511
3512         FreeDir(xldir);
3513 }
3514
3515 /*
3516  * Verify whether pg_xlog and pg_xlog/archive_status exist.
3517  * If the latter does not exist, recreate it.
3518  *
3519  * It is not the goal of this function to verify the contents of these
3520  * directories, but to help in cases where someone has performed a cluster
3521  * copy for PITR purposes but omitted pg_xlog from the copy.
3522  *
3523  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
3524  * policy decision was made not to.  It is fairly common for pg_xlog to be
3525  * a symlink, and if that was the DBA's intent then automatically making a
3526  * plain directory would result in degraded performance with no notice.
3527  */
3528 static void
3529 ValidateXLOGDirectoryStructure(void)
3530 {
3531         char            path[MAXPGPATH];
3532         struct stat stat_buf;
3533
3534         /* Check for pg_xlog; if it doesn't exist, error out */
3535         if (stat(XLOGDIR, &stat_buf) != 0 ||
3536                 !S_ISDIR(stat_buf.st_mode))
3537                 ereport(FATAL,
3538                                 (errmsg("required WAL directory \"%s\" does not exist",
3539                                                 XLOGDIR)));
3540
3541         /* Check for archive_status */
3542         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
3543         if (stat(path, &stat_buf) == 0)
3544         {
3545                 /* Check for weird cases where it exists but isn't a directory */
3546                 if (!S_ISDIR(stat_buf.st_mode))
3547                         ereport(FATAL,
3548                                         (errmsg("required WAL directory \"%s\" does not exist",
3549                                                         path)));
3550         }
3551         else
3552         {
3553                 ereport(LOG,
3554                                 (errmsg("creating missing WAL directory \"%s\"", path)));
3555                 if (mkdir(path, S_IRWXU) < 0)
3556                         ereport(FATAL,
3557                                         (errmsg("could not create missing directory \"%s\": %m",
3558                                                         path)));
3559         }
3560 }
3561
3562 /*
3563  * Remove previous backup history files.  This also retries creation of
3564  * .ready files for any backup history files for which XLogArchiveNotify
3565  * failed earlier.
3566  */
3567 static void
3568 CleanupBackupHistory(void)
3569 {
3570         DIR                *xldir;
3571         struct dirent *xlde;
3572         char            path[MAXPGPATH];
3573
3574         xldir = AllocateDir(XLOGDIR);
3575         if (xldir == NULL)
3576                 ereport(ERROR,
3577                                 (errcode_for_file_access(),
3578                                  errmsg("could not open transaction log directory \"%s\": %m",
3579                                                 XLOGDIR)));
3580
3581         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3582         {
3583                 if (strlen(xlde->d_name) > 24 &&
3584                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3585                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
3586                                    ".backup") == 0)
3587                 {
3588                         if (XLogArchiveCheckDone(xlde->d_name))
3589                         {
3590                                 ereport(DEBUG2,
3591                                 (errmsg("removing transaction log backup history file \"%s\"",
3592                                                 xlde->d_name)));
3593                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3594                                 unlink(path);
3595                                 XLogArchiveCleanup(xlde->d_name);
3596                         }
3597                 }
3598         }
3599
3600         FreeDir(xldir);
3601 }
3602
3603 /*
3604  * Restore the backup blocks present in an XLOG record, if any.
3605  *
3606  * We assume all of the record has been read into memory at *record.
3607  *
3608  * Note: when a backup block is available in XLOG, we restore it
3609  * unconditionally, even if the page in the database appears newer.
3610  * This is to protect ourselves against database pages that were partially
3611  * or incorrectly written during a crash.  We assume that the XLOG data
3612  * must be good because it has passed a CRC check, while the database
3613  * page might not be.  This will force us to replay all subsequent
3614  * modifications of the page that appear in XLOG, rather than possibly
3615  * ignoring them as already applied, but that's not a huge drawback.
3616  *
3617  * If 'cleanup' is true, a cleanup lock is used when restoring blocks.
3618  * Otherwise, a normal exclusive lock is used.  During crash recovery, that's
3619  * just pro forma because there can't be any regular backends in the system,
3620  * but in hot standby mode the distinction is important. The 'cleanup'
3621  * argument applies to all backup blocks in the WAL record, that suffices for
3622  * now.
3623  */
3624 void
3625 RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
3626 {
3627         Buffer          buffer;
3628         Page            page;
3629         BkpBlock        bkpb;
3630         char       *blk;
3631         int                     i;
3632
3633         if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
3634                 return;
3635
3636         blk = (char *) XLogRecGetData(record) + record->xl_len;
3637         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3638         {
3639                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3640                         continue;
3641
3642                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3643                 blk += sizeof(BkpBlock);
3644
3645                 buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
3646                                                                                 RBM_ZERO);
3647                 Assert(BufferIsValid(buffer));
3648                 if (cleanup)
3649                         LockBufferForCleanup(buffer);
3650                 else
3651                         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3652
3653                 page = (Page) BufferGetPage(buffer);
3654
3655                 if (bkpb.hole_length == 0)
3656                 {
3657                         memcpy((char *) page, blk, BLCKSZ);
3658                 }
3659                 else
3660                 {
3661                         memcpy((char *) page, blk, bkpb.hole_offset);
3662                         /* must zero-fill the hole */
3663                         MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
3664                         memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
3665                                    blk + bkpb.hole_offset,
3666                                    BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
3667                 }
3668
3669                 PageSetLSN(page, lsn);
3670                 PageSetTLI(page, ThisTimeLineID);
3671                 MarkBufferDirty(buffer);
3672                 UnlockReleaseBuffer(buffer);
3673
3674                 blk += BLCKSZ - bkpb.hole_length;
3675         }
3676 }
3677
3678 /*
3679  * CRC-check an XLOG record.  We do not believe the contents of an XLOG
3680  * record (other than to the minimal extent of computing the amount of
3681  * data to read in) until we've checked the CRCs.
3682  *
3683  * We assume all of the record has been read into memory at *record.
3684  */
3685 static bool
3686 RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
3687 {
3688         pg_crc32        crc;
3689         int                     i;
3690         uint32          len = record->xl_len;
3691         BkpBlock        bkpb;
3692         char       *blk;
3693
3694         /* First the rmgr data */
3695         INIT_CRC32(crc);
3696         COMP_CRC32(crc, XLogRecGetData(record), len);
3697
3698         /* Add in the backup blocks, if any */
3699         blk = (char *) XLogRecGetData(record) + len;
3700         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3701         {
3702                 uint32          blen;
3703
3704                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3705                         continue;
3706
3707                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3708                 if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
3709                 {
3710                         ereport(emode_for_corrupt_record(emode, recptr),
3711                                         (errmsg("incorrect hole size in record at %X/%X",
3712                                                         recptr.xlogid, recptr.xrecoff)));
3713                         return false;
3714                 }
3715                 blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
3716                 COMP_CRC32(crc, blk, blen);
3717                 blk += blen;
3718         }
3719
3720         /* Check that xl_tot_len agrees with our calculation */
3721         if (blk != (char *) record + record->xl_tot_len)
3722         {
3723                 ereport(emode_for_corrupt_record(emode, recptr),
3724                                 (errmsg("incorrect total length in record at %X/%X",
3725                                                 recptr.xlogid, recptr.xrecoff)));
3726                 return false;
3727         }
3728
3729         /* Finally include the record header */
3730         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
3731                            SizeOfXLogRecord - sizeof(pg_crc32));
3732         FIN_CRC32(crc);
3733
3734         if (!EQ_CRC32(record->xl_crc, crc))
3735         {
3736                 ereport(emode_for_corrupt_record(emode, recptr),
3737                 (errmsg("incorrect resource manager data checksum in record at %X/%X",
3738                                 recptr.xlogid, recptr.xrecoff)));
3739                 return false;
3740         }
3741
3742         return true;
3743 }
3744
3745 /*
3746  * Attempt to read an XLOG record.
3747  *
3748  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
3749  * try to read a record just after the last one previously read.
3750  *
3751  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3752  * (emode must be either PANIC, LOG)
3753  *
3754  * The record is copied into readRecordBuf, so that on successful return,
3755  * the returned record pointer always points there.
3756  */
3757 static XLogRecord *
3758 ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt)
3759 {
3760         XLogRecord *record;
3761         char       *buffer;
3762         XLogRecPtr      tmpRecPtr = EndRecPtr;
3763         bool            randAccess = false;
3764         uint32          len,
3765                                 total_len;
3766         uint32          targetRecOff;
3767         uint32          pageHeaderSize;
3768
3769         if (readBuf == NULL)
3770         {
3771                 /*
3772                  * First time through, permanently allocate readBuf.  We do it this
3773                  * way, rather than just making a static array, for two reasons: (1)
3774                  * no need to waste the storage in most instantiations of the backend;
3775                  * (2) a static char array isn't guaranteed to have any particular
3776                  * alignment, whereas malloc() will provide MAXALIGN'd storage.
3777                  */
3778                 readBuf = (char *) malloc(XLOG_BLCKSZ);
3779                 Assert(readBuf != NULL);
3780         }
3781
3782         if (RecPtr == NULL)
3783         {
3784                 RecPtr = &tmpRecPtr;
3785
3786                 /*
3787                  * RecPtr is pointing to end+1 of the previous WAL record.  We must
3788                  * advance it if necessary to where the next record starts.  First,
3789                  * align to next page if no more records can fit on the current page.
3790                  */
3791                 if (XLOG_BLCKSZ - (RecPtr->xrecoff % XLOG_BLCKSZ) < SizeOfXLogRecord)
3792                         NextLogPage(*RecPtr);
3793
3794                 /* Check for crossing of xlog segment boundary */
3795                 if (RecPtr->xrecoff >= XLogFileSize)
3796                 {
3797                         (RecPtr->xlogid)++;
3798                         RecPtr->xrecoff = 0;
3799                 }
3800
3801                 /*
3802                  * If at page start, we must skip over the page header.  But we can't
3803                  * do that until we've read in the page, since the header size is
3804                  * variable.
3805                  */
3806         }
3807         else
3808         {
3809                 /*
3810                  * In this case, the passed-in record pointer should already be
3811                  * pointing to a valid record starting position.
3812                  */
3813                 if (!XRecOffIsValid(RecPtr->xrecoff))
3814                         ereport(PANIC,
3815                                         (errmsg("invalid record offset at %X/%X",
3816                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3817
3818                 /*
3819                  * Since we are going to a random position in WAL, forget any prior
3820                  * state about what timeline we were in, and allow it to be any
3821                  * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
3822                  * to go backwards (but we can't reset that variable right here, since
3823                  * we might not change files at all).
3824                  */
3825                 lastPageTLI = 0;                /* see comment in ValidXLOGHeader */
3826                 randAccess = true;              /* allow curFileTLI to go backwards too */
3827         }
3828
3829         /* This is the first try to read this page. */
3830         failedSources = 0;
3831 retry:
3832         /* Read the page containing the record */
3833         if (!XLogPageRead(RecPtr, emode, fetching_ckpt, randAccess))
3834                 return NULL;
3835
3836         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3837         targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
3838         if (targetRecOff == 0)
3839         {
3840                 /*
3841                  * At page start, so skip over page header.  The Assert checks that
3842                  * we're not scribbling on caller's record pointer; it's OK because we
3843                  * can only get here in the continuing-from-prev-record case, since
3844                  * XRecOffIsValid rejected the zero-page-offset case otherwise.
3845                  */
3846                 Assert(RecPtr == &tmpRecPtr);
3847                 RecPtr->xrecoff += pageHeaderSize;
3848                 targetRecOff = pageHeaderSize;
3849         }
3850         else if (targetRecOff < pageHeaderSize)
3851         {
3852                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3853                                 (errmsg("invalid record offset at %X/%X",
3854                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3855                 goto next_record_is_invalid;
3856         }
3857         if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
3858                 targetRecOff == pageHeaderSize)
3859         {
3860                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3861                                 (errmsg("contrecord is requested by %X/%X",
3862                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3863                 goto next_record_is_invalid;
3864         }
3865         record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);
3866
3867         /*
3868          * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
3869          * required.
3870          */
3871         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3872         {
3873                 if (record->xl_len != 0)
3874                 {
3875                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3876                                         (errmsg("invalid xlog switch record at %X/%X",
3877                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3878                         goto next_record_is_invalid;
3879                 }
3880         }
3881         else if (record->xl_len == 0)
3882         {
3883                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3884                                 (errmsg("record with zero length at %X/%X",
3885                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3886                 goto next_record_is_invalid;
3887         }
3888         if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
3889                 record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
3890                 XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
3891         {
3892                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3893                                 (errmsg("invalid record length at %X/%X",
3894                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3895                 goto next_record_is_invalid;
3896         }
3897         if (record->xl_rmid > RM_MAX_ID)
3898         {
3899                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3900                                 (errmsg("invalid resource manager ID %u at %X/%X",
3901                                                 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
3902                 goto next_record_is_invalid;
3903         }
3904         if (randAccess)
3905         {
3906                 /*
3907                  * We can't exactly verify the prev-link, but surely it should be less
3908                  * than the record's own address.
3909                  */
3910                 if (!XLByteLT(record->xl_prev, *RecPtr))
3911                 {
3912                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3913                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3914                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3915                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3916                         goto next_record_is_invalid;
3917                 }
3918         }
3919         else
3920         {
3921                 /*
3922                  * Record's prev-link should exactly match our previous location. This
3923                  * check guards against torn WAL pages where a stale but valid-looking
3924                  * WAL record starts on a sector boundary.
3925                  */
3926                 if (!XLByteEQ(record->xl_prev, ReadRecPtr))
3927                 {
3928                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3929                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3930                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3931                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3932                         goto next_record_is_invalid;
3933                 }
3934         }
3935
3936         /*
3937          * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
3938          * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
3939          * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
3940          * enough for all "normal" records, but very large commit or abort records
3941          * might need more space.)
3942          */
3943         total_len = record->xl_tot_len;
3944         if (total_len > readRecordBufSize)
3945         {
3946                 uint32          newSize = total_len;
3947
3948                 newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
3949                 newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
3950                 if (readRecordBuf)
3951                         free(readRecordBuf);
3952                 readRecordBuf = (char *) malloc(newSize);
3953                 if (!readRecordBuf)
3954                 {
3955                         readRecordBufSize = 0;
3956                         /* We treat this as a "bogus data" condition */
3957                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3958                                         (errmsg("record length %u at %X/%X too long",
3959                                                         total_len, RecPtr->xlogid, RecPtr->xrecoff)));
3960                         goto next_record_is_invalid;
3961                 }
3962                 readRecordBufSize = newSize;
3963         }
3964
3965         buffer = readRecordBuf;
3966         len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
3967         if (total_len > len)
3968         {
3969                 /* Need to reassemble record */
3970                 XLogContRecord *contrecord;
3971                 XLogRecPtr      pagelsn;
3972                 uint32          gotlen = len;
3973
3974                 /* Initialize pagelsn to the beginning of the page this record is on */
3975                 pagelsn = *RecPtr;
3976                 pagelsn.xrecoff = (pagelsn.xrecoff / XLOG_BLCKSZ) * XLOG_BLCKSZ;
3977
3978                 memcpy(buffer, record, len);
3979                 record = (XLogRecord *) buffer;
3980                 buffer += len;
3981                 for (;;)
3982                 {
3983                         /* Calculate pointer to beginning of next page */
3984                         pagelsn.xrecoff += XLOG_BLCKSZ;
3985                         if (pagelsn.xrecoff >= XLogFileSize)
3986                         {
3987                                 (pagelsn.xlogid)++;
3988                                 pagelsn.xrecoff = 0;
3989                         }
3990                         /* Wait for the next page to become available */
3991                         if (!XLogPageRead(&pagelsn, emode, false, false))
3992                                 return NULL;
3993
3994                         /* Check that the continuation record looks valid */
3995                         if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
3996                         {
3997                                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3998                                                 (errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
3999                                                                 readId, readSeg, readOff)));
4000                                 goto next_record_is_invalid;
4001                         }
4002                         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
4003                         contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
4004                         if (contrecord->xl_rem_len == 0 ||
4005                                 total_len != (contrecord->xl_rem_len + gotlen))
4006                         {
4007                                 ereport(emode_for_corrupt_record(emode, *RecPtr),
4008                                                 (errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
4009                                                                 contrecord->xl_rem_len,
4010                                                                 readId, readSeg, readOff)));
4011                                 goto next_record_is_invalid;
4012                         }
4013                         len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
4014                         if (contrecord->xl_rem_len > len)
4015                         {
4016                                 memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
4017                                 gotlen += len;
4018                                 buffer += len;
4019                                 continue;
4020                         }
4021                         memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
4022                                    contrecord->xl_rem_len);
4023                         break;
4024                 }
4025                 if (!RecordIsValid(record, *RecPtr, emode))
4026                         goto next_record_is_invalid;
4027                 pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
4028                 EndRecPtr.xlogid = readId;
4029                 EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
4030                         pageHeaderSize +
4031                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
4032
4033                 ReadRecPtr = *RecPtr;
4034                 /* needn't worry about XLOG SWITCH, it can't cross page boundaries */
4035                 return record;
4036         }
4037
4038         /* Record does not cross a page boundary */
4039         if (!RecordIsValid(record, *RecPtr, emode))
4040                 goto next_record_is_invalid;
4041         EndRecPtr.xlogid = RecPtr->xlogid;
4042         EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
4043
4044         ReadRecPtr = *RecPtr;
4045         memcpy(buffer, record, total_len);
4046
4047         /*
4048          * Special processing if it's an XLOG SWITCH record
4049          */
4050         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
4051         {
4052                 /* Pretend it extends to end of segment */
4053                 EndRecPtr.xrecoff += XLogSegSize - 1;
4054                 EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
4055
4056                 /*
4057                  * Pretend that readBuf contains the last page of the segment. This is
4058                  * just to avoid Assert failure in StartupXLOG if XLOG ends with this
4059                  * segment.
4060                  */
4061                 readOff = XLogSegSize - XLOG_BLCKSZ;
4062         }
4063         return (XLogRecord *) buffer;
4064
4065 next_record_is_invalid:
4066         failedSources |= readSource;
4067
4068         if (readFile >= 0)
4069         {
4070                 close(readFile);
4071                 readFile = -1;
4072         }
4073
4074         /* In standby-mode, keep trying */
4075         if (StandbyMode)
4076                 goto retry;
4077         else
4078                 return NULL;
4079 }
4080
4081 /*
4082  * Check whether the xlog header of a page just read in looks valid.
4083  *
4084  * This is just a convenience subroutine to avoid duplicated code in
4085  * ReadRecord.  It's not intended for use from anywhere else.
4086  */
4087 static bool
4088 ValidXLOGHeader(XLogPageHeader hdr, int emode)
4089 {
4090         XLogRecPtr      recaddr;
4091
4092         recaddr.xlogid = readId;
4093         recaddr.xrecoff = readSeg * XLogSegSize + readOff;
4094
4095         if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
4096         {
4097                 ereport(emode_for_corrupt_record(emode, recaddr),
4098                                 (errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
4099                                                 hdr->xlp_magic, readId, readSeg, readOff)));
4100                 return false;
4101         }
4102         if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
4103         {
4104                 ereport(emode_for_corrupt_record(emode, recaddr),
4105                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
4106                                                 hdr->xlp_info, readId, readSeg, readOff)));
4107                 return false;
4108         }
4109         if (hdr->xlp_info & XLP_LONG_HEADER)
4110         {
4111                 XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
4112
4113                 if (longhdr->xlp_sysid != ControlFile->system_identifier)
4114                 {
4115                         char            fhdrident_str[32];
4116                         char            sysident_str[32];
4117
4118                         /*
4119                          * Format sysids separately to keep platform-dependent format code
4120                          * out of the translatable message string.
4121                          */
4122                         snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
4123                                          longhdr->xlp_sysid);
4124                         snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
4125                                          ControlFile->system_identifier);
4126                         ereport(emode_for_corrupt_record(emode, recaddr),
4127                                         (errmsg("WAL file is from different database system"),
4128                                          errdetail("WAL file database system identifier is %s, pg_control database system identifier is %s.",
4129                                                            fhdrident_str, sysident_str)));
4130                         return false;
4131                 }
4132                 if (longhdr->xlp_seg_size != XLogSegSize)
4133                 {
4134                         ereport(emode_for_corrupt_record(emode, recaddr),
4135                                         (errmsg("WAL file is from different database system"),
4136                                          errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
4137                         return false;
4138                 }
4139                 if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
4140                 {
4141                         ereport(emode_for_corrupt_record(emode, recaddr),
4142                                         (errmsg("WAL file is from different database system"),
4143                                          errdetail("Incorrect XLOG_BLCKSZ in page header.")));
4144                         return false;
4145                 }
4146         }
4147         else if (readOff == 0)
4148         {
4149                 /* hmm, first page of file doesn't have a long header? */
4150                 ereport(emode_for_corrupt_record(emode, recaddr),
4151                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
4152                                                 hdr->xlp_info, readId, readSeg, readOff)));
4153                 return false;
4154         }
4155
4156         if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
4157         {
4158                 ereport(emode_for_corrupt_record(emode, recaddr),
4159                                 (errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
4160                                                 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
4161                                                 readId, readSeg, readOff)));
4162                 return false;
4163         }
4164
4165         /*
4166          * Check page TLI is one of the expected values.
4167          */
4168         if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
4169         {
4170                 ereport(emode_for_corrupt_record(emode, recaddr),
4171                                 (errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
4172                                                 hdr->xlp_tli,
4173                                                 readId, readSeg, readOff)));
4174                 return false;
4175         }
4176
4177         /*
4178          * Since child timelines are always assigned a TLI greater than their
4179          * immediate parent's TLI, we should never see TLI go backwards across
4180          * successive pages of a consistent WAL sequence.
4181          *
4182          * Of course this check should only be applied when advancing sequentially
4183          * across pages; therefore ReadRecord resets lastPageTLI to zero when
4184          * going to a random page.
4185          */
4186         if (hdr->xlp_tli < lastPageTLI)
4187         {
4188                 ereport(emode_for_corrupt_record(emode, recaddr),
4189                                 (errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
4190                                                 hdr->xlp_tli, lastPageTLI,
4191                                                 readId, readSeg, readOff)));
4192                 return false;
4193         }
4194         lastPageTLI = hdr->xlp_tli;
4195         return true;
4196 }
4197
4198 /*
4199  * Try to read a timeline's history file.
4200  *
4201  * If successful, return the list of component TLIs (the given TLI followed by
4202  * its ancestor TLIs).  If we can't find the history file, assume that the
4203  * timeline has no parents, and return a list of just the specified timeline
4204  * ID.
4205  */
4206 static List *
4207 readTimeLineHistory(TimeLineID targetTLI)
4208 {
4209         List       *result;
4210         char            path[MAXPGPATH];
4211         char            histfname[MAXFNAMELEN];
4212         char            fline[MAXPGPATH];
4213         FILE       *fd;
4214
4215         /* Timeline 1 does not have a history file, so no need to check */
4216         if (targetTLI == 1)
4217                 return list_make1_int((int) targetTLI);
4218
4219         if (InArchiveRecovery)
4220         {
4221                 TLHistoryFileName(histfname, targetTLI);
4222                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
4223         }
4224         else
4225                 TLHistoryFilePath(path, targetTLI);
4226
4227         fd = AllocateFile(path, "r");
4228         if (fd == NULL)
4229         {
4230                 if (errno != ENOENT)
4231                         ereport(FATAL,
4232                                         (errcode_for_file_access(),
4233                                          errmsg("could not open file \"%s\": %m", path)));
4234                 /* Not there, so assume no parents */
4235                 return list_make1_int((int) targetTLI);
4236         }
4237
4238         result = NIL;
4239
4240         /*
4241          * Parse the file...
4242          */
4243         while (fgets(fline, sizeof(fline), fd) != NULL)
4244         {
4245                 /* skip leading whitespace and check for # comment */
4246                 char       *ptr;
4247                 char       *endptr;
4248                 TimeLineID      tli;
4249
4250                 for (ptr = fline; *ptr; ptr++)
4251                 {
4252                         if (!isspace((unsigned char) *ptr))
4253                                 break;
4254                 }
4255                 if (*ptr == '\0' || *ptr == '#')
4256                         continue;
4257
4258                 /* expect a numeric timeline ID as first field of line */
4259                 tli = (TimeLineID) strtoul(ptr, &endptr, 0);
4260                 if (endptr == ptr)
4261                         ereport(FATAL,
4262                                         (errmsg("syntax error in history file: %s", fline),
4263                                          errhint("Expected a numeric timeline ID.")));
4264
4265                 if (result &&
4266                         tli <= (TimeLineID) linitial_int(result))
4267                         ereport(FATAL,
4268                                         (errmsg("invalid data in history file: %s", fline),
4269                                    errhint("Timeline IDs must be in increasing sequence.")));
4270
4271                 /* Build list with newest item first */
4272                 result = lcons_int((int) tli, result);
4273
4274                 /* we ignore the remainder of each line */
4275         }
4276
4277         FreeFile(fd);
4278
4279         if (result &&
4280                 targetTLI <= (TimeLineID) linitial_int(result))
4281                 ereport(FATAL,
4282                                 (errmsg("invalid data in history file \"%s\"", path),
4283                         errhint("Timeline IDs must be less than child timeline's ID.")));
4284
4285         result = lcons_int((int) targetTLI, result);
4286
4287         ereport(DEBUG3,
4288                         (errmsg_internal("history of timeline %u is %s",
4289                                                          targetTLI, nodeToString(result))));
4290
4291         return result;
4292 }
4293
4294 /*
4295  * Probe whether a timeline history file exists for the given timeline ID
4296  */
4297 static bool
4298 existsTimeLineHistory(TimeLineID probeTLI)
4299 {
4300         char            path[MAXPGPATH];
4301         char            histfname[MAXFNAMELEN];
4302         FILE       *fd;
4303
4304         /* Timeline 1 does not have a history file, so no need to check */
4305         if (probeTLI == 1)
4306                 return false;
4307
4308         if (InArchiveRecovery)
4309         {
4310                 TLHistoryFileName(histfname, probeTLI);
4311                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
4312         }
4313         else
4314                 TLHistoryFilePath(path, probeTLI);
4315
4316         fd = AllocateFile(path, "r");
4317         if (fd != NULL)
4318         {
4319                 FreeFile(fd);
4320                 return true;
4321         }
4322         else
4323         {
4324                 if (errno != ENOENT)
4325                         ereport(FATAL,
4326                                         (errcode_for_file_access(),
4327                                          errmsg("could not open file \"%s\": %m", path)));
4328                 return false;
4329         }
4330 }
4331
4332 /*
4333  * Scan for new timelines that might have appeared in the archive since we
4334  * started recovery.
4335  *
4336  * If there are any, the function changes recovery target TLI to the latest
4337  * one and returns 'true'.
4338  */
4339 static bool
4340 rescanLatestTimeLine(void)
4341 {
4342         TimeLineID      newtarget;
4343
4344         newtarget = findNewestTimeLine(recoveryTargetTLI);
4345         if (newtarget != recoveryTargetTLI)
4346         {
4347                 /*
4348                  * Determine the list of expected TLIs for the new TLI
4349                  */
4350                 List       *newExpectedTLIs;
4351
4352                 newExpectedTLIs = readTimeLineHistory(newtarget);
4353
4354                 /*
4355                  * If the current timeline is not part of the history of the new
4356                  * timeline, we cannot proceed to it.
4357                  *
4358                  * XXX This isn't foolproof: The new timeline might have forked from
4359                  * the current one, but before the current recovery location. In that
4360                  * case we will still switch to the new timeline and proceed replaying
4361                  * from it even though the history doesn't match what we already
4362                  * replayed. That's not good. We will likely notice at the next online
4363                  * checkpoint, as the TLI won't match what we expected, but it's not
4364                  * guaranteed. The admin needs to make sure that doesn't happen.
4365                  */
4366                 if (!list_member_int(newExpectedTLIs,
4367                                                          (int) recoveryTargetTLI))
4368                         ereport(LOG,
4369                                         (errmsg("new timeline %u is not a child of database system timeline %u",
4370                                                         newtarget,
4371                                                         ThisTimeLineID)));
4372                 else
4373                 {
4374                         /* Switch target */
4375                         recoveryTargetTLI = newtarget;
4376                         list_free(expectedTLIs);
4377                         expectedTLIs = newExpectedTLIs;
4378
4379                         XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;
4380
4381                         ereport(LOG,
4382                                         (errmsg("new target timeline is %u",
4383                                                         recoveryTargetTLI)));
4384                         return true;
4385                 }
4386         }
4387         return false;
4388 }
4389
4390 /*
4391  * Find the newest existing timeline, assuming that startTLI exists.
4392  *
4393  * Note: while this is somewhat heuristic, it does positively guarantee
4394  * that (result + 1) is not a known timeline, and therefore it should
4395  * be safe to assign that ID to a new timeline.
4396  */
4397 static TimeLineID
4398 findNewestTimeLine(TimeLineID startTLI)
4399 {
4400         TimeLineID      newestTLI;
4401         TimeLineID      probeTLI;
4402
4403         /*
4404          * The algorithm is just to probe for the existence of timeline history
4405          * files.  XXX is it useful to allow gaps in the sequence?
4406          */
4407         newestTLI = startTLI;
4408
4409         for (probeTLI = startTLI + 1;; probeTLI++)
4410         {
4411                 if (existsTimeLineHistory(probeTLI))
4412                 {
4413                         newestTLI = probeTLI;           /* probeTLI exists */
4414                 }
4415                 else
4416                 {
4417                         /* doesn't exist, assume we're done */
4418                         break;
4419                 }
4420         }
4421
4422         return newestTLI;
4423 }
4424
4425 /*
4426  * Create a new timeline history file.
4427  *
4428  *      newTLI: ID of the new timeline
4429  *      parentTLI: ID of its immediate parent
4430  *      endTLI et al: ID of the last used WAL file, for annotation purposes
4431  *
4432  * Currently this is only used during recovery, and so there are no locking
4433  * considerations.      But we should be just as tense as XLogFileInit to avoid
4434  * emplacing a bogus file.
4435  */
4436 static void
4437 writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
4438                                          TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
4439 {
4440         char            path[MAXPGPATH];
4441         char            tmppath[MAXPGPATH];
4442         char            histfname[MAXFNAMELEN];
4443         char            xlogfname[MAXFNAMELEN];
4444         char            buffer[BLCKSZ];
4445         int                     srcfd;
4446         int                     fd;
4447         int                     nbytes;
4448
4449         Assert(newTLI > parentTLI); /* else bad selection of newTLI */
4450
4451         /*
4452          * Write into a temp file name.
4453          */
4454         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
4455
4456         unlink(tmppath);
4457
4458         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
4459         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
4460                                            S_IRUSR | S_IWUSR);
4461         if (fd < 0)
4462                 ereport(ERROR,
4463                                 (errcode_for_file_access(),
4464                                  errmsg("could not create file \"%s\": %m", tmppath)));
4465
4466         /*
4467          * If a history file exists for the parent, copy it verbatim
4468          */
4469         if (InArchiveRecovery)
4470         {
4471                 TLHistoryFileName(histfname, parentTLI);
4472                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
4473         }
4474         else
4475                 TLHistoryFilePath(path, parentTLI);
4476
4477         srcfd = BasicOpenFile(path, O_RDONLY, 0);
4478         if (srcfd < 0)
4479         {
4480                 if (errno != ENOENT)
4481                         ereport(ERROR,
4482                                         (errcode_for_file_access(),
4483                                          errmsg("could not open file \"%s\": %m", path)));
4484                 /* Not there, so assume parent has no parents */
4485         }
4486         else
4487         {
4488                 for (;;)
4489                 {
4490                         errno = 0;
4491                         nbytes = (int) read(srcfd, buffer, sizeof(buffer));
4492                         if (nbytes < 0 || errno != 0)
4493                                 ereport(ERROR,
4494                                                 (errcode_for_file_access(),
4495                                                  errmsg("could not read file \"%s\": %m", path)));
4496                         if (nbytes == 0)
4497                                 break;
4498                         errno = 0;
4499                         if ((int) write(fd, buffer, nbytes) != nbytes)
4500                         {
4501                                 int                     save_errno = errno;
4502
4503                                 /*
4504                                  * If we fail to make the file, delete it to release disk
4505                                  * space
4506                                  */
4507                                 unlink(tmppath);
4508
4509                                 /*
4510                                  * if write didn't set errno, assume problem is no disk space
4511                                  */
4512                                 errno = save_errno ? save_errno : ENOSPC;
4513
4514                                 ereport(ERROR,
4515                                                 (errcode_for_file_access(),
4516                                          errmsg("could not write to file \"%s\": %m", tmppath)));
4517                         }
4518                 }
4519                 close(srcfd);
4520         }
4521
4522         /*
4523          * Append one line with the details of this timeline split.
4524          *
4525          * If we did have a parent file, insert an extra newline just in case the
4526          * parent file failed to end with one.
4527          */
4528         XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);
4529
4530         /*
4531          * Write comment to history file to explain why and where timeline
4532          * changed. Comment varies according to the recovery target used.
4533          */
4534         if (recoveryTarget == RECOVERY_TARGET_XID)
4535                 snprintf(buffer, sizeof(buffer),
4536                                  "%s%u\t%s\t%s transaction %u\n",
4537                                  (srcfd < 0) ? "" : "\n",
4538                                  parentTLI,
4539                                  xlogfname,
4540                                  recoveryStopAfter ? "after" : "before",
4541                                  recoveryStopXid);
4542         else if (recoveryTarget == RECOVERY_TARGET_TIME)
4543                 snprintf(buffer, sizeof(buffer),
4544                                  "%s%u\t%s\t%s %s\n",
4545                                  (srcfd < 0) ? "" : "\n",
4546                                  parentTLI,
4547                                  xlogfname,
4548                                  recoveryStopAfter ? "after" : "before",
4549                                  timestamptz_to_str(recoveryStopTime));
4550         else if (recoveryTarget == RECOVERY_TARGET_NAME)
4551                 snprintf(buffer, sizeof(buffer),
4552                                  "%s%u\t%s\tat restore point \"%s\"\n",
4553                                  (srcfd < 0) ? "" : "\n",
4554                                  parentTLI,
4555                                  xlogfname,
4556                                  recoveryStopName);
4557         else
4558                 snprintf(buffer, sizeof(buffer),
4559                                  "%s%u\t%s\tno recovery target specified\n",
4560                                  (srcfd < 0) ? "" : "\n",
4561                                  parentTLI,
4562                                  xlogfname);
4563
4564         nbytes = strlen(buffer);
4565         errno = 0;
4566         if ((int) write(fd, buffer, nbytes) != nbytes)
4567         {
4568                 int                     save_errno = errno;
4569
4570                 /*
4571                  * If we fail to make the file, delete it to release disk space
4572                  */
4573                 unlink(tmppath);
4574                 /* if write didn't set errno, assume problem is no disk space */
4575                 errno = save_errno ? save_errno : ENOSPC;
4576
4577                 ereport(ERROR,
4578                                 (errcode_for_file_access(),
4579                                  errmsg("could not write to file \"%s\": %m", tmppath)));
4580         }
4581
4582         if (pg_fsync(fd) != 0)
4583                 ereport(ERROR,
4584                                 (errcode_for_file_access(),
4585                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
4586
4587         if (close(fd))
4588                 ereport(ERROR,
4589                                 (errcode_for_file_access(),
4590                                  errmsg("could not close file \"%s\": %m", tmppath)));
4591
4592
4593         /*
4594          * Now move the completed history file into place with its final name.
4595          */
4596         TLHistoryFilePath(path, newTLI);
4597
4598         /*
4599          * Prefer link() to rename() here just to be really sure that we don't
4600          * overwrite an existing logfile.  However, there shouldn't be one, so
4601          * rename() is an acceptable substitute except for the truly paranoid.
4602          */
4603 #if HAVE_WORKING_LINK
4604         if (link(tmppath, path) < 0)
4605                 ereport(ERROR,
4606                                 (errcode_for_file_access(),
4607                                  errmsg("could not link file \"%s\" to \"%s\": %m",
4608                                                 tmppath, path)));
4609         unlink(tmppath);
4610 #else
4611         if (rename(tmppath, path) < 0)
4612                 ereport(ERROR,
4613                                 (errcode_for_file_access(),
4614                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
4615                                                 tmppath, path)));
4616 #endif
4617
4618         /* The history file can be archived immediately. */
4619         TLHistoryFileName(histfname, newTLI);
4620         XLogArchiveNotify(histfname);
4621 }
4622
4623 /*
4624  * I/O routines for pg_control
4625  *
4626  * *ControlFile is a buffer in shared memory that holds an image of the
4627  * contents of pg_control.      WriteControlFile() initializes pg_control
4628  * given a preloaded buffer, ReadControlFile() loads the buffer from
4629  * the pg_control file (during postmaster or standalone-backend startup),
4630  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4631  *
4632  * For simplicity, WriteControlFile() initializes the fields of pg_control
4633  * that are related to checking backend/database compatibility, and
4634  * ReadControlFile() verifies they are correct.  We could split out the
4635  * I/O and compatibility-check functions, but there seems no need currently.
4636  */
4637 static void
4638 WriteControlFile(void)
4639 {
4640         int                     fd;
4641         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4642
4643         /*
4644          * Initialize version and compatibility-check fields
4645          */
4646         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4647         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4648
4649         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4650         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4651
4652         ControlFile->blcksz = BLCKSZ;
4653         ControlFile->relseg_size = RELSEG_SIZE;
4654         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4655         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4656
4657         ControlFile->nameDataLen = NAMEDATALEN;
4658         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4659
4660         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4661
4662 #ifdef HAVE_INT64_TIMESTAMP
4663         ControlFile->enableIntTimes = true;
4664 #else
4665         ControlFile->enableIntTimes = false;
4666 #endif
4667         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4668         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4669
4670         /* Contents are protected with a CRC */
4671         INIT_CRC32(ControlFile->crc);
4672         COMP_CRC32(ControlFile->crc,
4673                            (char *) ControlFile,
4674                            offsetof(ControlFileData, crc));
4675         FIN_CRC32(ControlFile->crc);
4676
4677         /*
4678          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4679          * excess over sizeof(ControlFileData).  This reduces the odds of
4680          * premature-EOF errors when reading pg_control.  We'll still fail when we
4681          * check the contents of the file, but hopefully with a more specific
4682          * error than "couldn't read pg_control".
4683          */
4684         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4685                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4686
4687         memset(buffer, 0, PG_CONTROL_SIZE);
4688         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4689
4690         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4691                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4692                                            S_IRUSR | S_IWUSR);
4693         if (fd < 0)
4694                 ereport(PANIC,
4695                                 (errcode_for_file_access(),
4696                                  errmsg("could not create control file \"%s\": %m",
4697                                                 XLOG_CONTROL_FILE)));
4698
4699         errno = 0;
4700         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4701         {
4702                 /* if write didn't set errno, assume problem is no disk space */
4703                 if (errno == 0)
4704                         errno = ENOSPC;
4705                 ereport(PANIC,
4706                                 (errcode_for_file_access(),
4707                                  errmsg("could not write to control file: %m")));
4708         }
4709
4710         if (pg_fsync(fd) != 0)
4711                 ereport(PANIC,
4712                                 (errcode_for_file_access(),
4713                                  errmsg("could not fsync control file: %m")));
4714
4715         if (close(fd))
4716                 ereport(PANIC,
4717                                 (errcode_for_file_access(),
4718                                  errmsg("could not close control file: %m")));
4719 }
4720
4721 static void
4722 ReadControlFile(void)
4723 {
4724         pg_crc32        crc;
4725         int                     fd;
4726
4727         /*
4728          * Read data...
4729          */
4730         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4731                                            O_RDWR | PG_BINARY,
4732                                            S_IRUSR | S_IWUSR);
4733         if (fd < 0)
4734                 ereport(PANIC,
4735                                 (errcode_for_file_access(),
4736                                  errmsg("could not open control file \"%s\": %m",
4737                                                 XLOG_CONTROL_FILE)));
4738
4739         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4740                 ereport(PANIC,
4741                                 (errcode_for_file_access(),
4742                                  errmsg("could not read from control file: %m")));
4743
4744         close(fd);
4745
4746         /*
4747          * Check for expected pg_control format version.  If this is wrong, the
4748          * CRC check will likely fail because we'll be checking the wrong number
4749          * of bytes.  Complaining about wrong version will probably be more
4750          * enlightening than complaining about wrong CRC.
4751          */
4752
4753         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4754                 ereport(FATAL,
4755                                 (errmsg("database files are incompatible with server"),
4756                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4757                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4758                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4759                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4760                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4761
4762         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4763                 ereport(FATAL,
4764                                 (errmsg("database files are incompatible with server"),
4765                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4766                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4767                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4768                                  errhint("It looks like you need to initdb.")));
4769
4770         /* Now check the CRC. */
4771         INIT_CRC32(crc);
4772         COMP_CRC32(crc,
4773                            (char *) ControlFile,
4774                            offsetof(ControlFileData, crc));
4775         FIN_CRC32(crc);
4776
4777         if (!EQ_CRC32(crc, ControlFile->crc))
4778                 ereport(FATAL,
4779                                 (errmsg("incorrect checksum in control file")));
4780
4781         /*
4782          * Do compatibility checking immediately.  If the database isn't
4783          * compatible with the backend executable, we want to abort before we can
4784          * possibly do any damage.
4785          */
4786         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4787                 ereport(FATAL,
4788                                 (errmsg("database files are incompatible with server"),
4789                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4790                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4791                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4792                                  errhint("It looks like you need to initdb.")));
4793         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4794                 ereport(FATAL,
4795                                 (errmsg("database files are incompatible with server"),
4796                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4797                                          " but the server was compiled with MAXALIGN %d.",
4798                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4799                                  errhint("It looks like you need to initdb.")));
4800         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4801                 ereport(FATAL,
4802                                 (errmsg("database files are incompatible with server"),
4803                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4804                                  errhint("It looks like you need to initdb.")));
4805         if (ControlFile->blcksz != BLCKSZ)
4806                 ereport(FATAL,
4807                                 (errmsg("database files are incompatible with server"),
4808                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4809                                            " but the server was compiled with BLCKSZ %d.",
4810                                            ControlFile->blcksz, BLCKSZ),
4811                                  errhint("It looks like you need to recompile or initdb.")));
4812         if (ControlFile->relseg_size != RELSEG_SIZE)
4813                 ereport(FATAL,
4814                                 (errmsg("database files are incompatible with server"),
4815                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4816                                   " but the server was compiled with RELSEG_SIZE %d.",
4817                                   ControlFile->relseg_size, RELSEG_SIZE),
4818                                  errhint("It looks like you need to recompile or initdb.")));
4819         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4820                 ereport(FATAL,
4821                                 (errmsg("database files are incompatible with server"),
4822                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4823                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4824                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4825                                  errhint("It looks like you need to recompile or initdb.")));
4826         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4827                 ereport(FATAL,
4828                                 (errmsg("database files are incompatible with server"),
4829                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4830                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4831                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4832                                  errhint("It looks like you need to recompile or initdb.")));
4833         if (ControlFile->nameDataLen != NAMEDATALEN)
4834                 ereport(FATAL,
4835                                 (errmsg("database files are incompatible with server"),
4836                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4837                                   " but the server was compiled with NAMEDATALEN %d.",
4838                                   ControlFile->nameDataLen, NAMEDATALEN),
4839                                  errhint("It looks like you need to recompile or initdb.")));
4840         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4841                 ereport(FATAL,
4842                                 (errmsg("database files are incompatible with server"),
4843                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4844                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4845                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4846                                  errhint("It looks like you need to recompile or initdb.")));
4847         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4848                 ereport(FATAL,
4849                                 (errmsg("database files are incompatible with server"),
4850                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4851                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4852                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4853                                  errhint("It looks like you need to recompile or initdb.")));
4854
4855 #ifdef HAVE_INT64_TIMESTAMP
4856         if (ControlFile->enableIntTimes != true)
4857                 ereport(FATAL,
4858                                 (errmsg("database files are incompatible with server"),
4859                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4860                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4861                                  errhint("It looks like you need to recompile or initdb.")));
4862 #else
4863         if (ControlFile->enableIntTimes != false)
4864                 ereport(FATAL,
4865                                 (errmsg("database files are incompatible with server"),
4866                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4867                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4868                                  errhint("It looks like you need to recompile or initdb.")));
4869 #endif
4870
4871 #ifdef USE_FLOAT4_BYVAL
4872         if (ControlFile->float4ByVal != true)
4873                 ereport(FATAL,
4874                                 (errmsg("database files are incompatible with server"),
4875                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4876                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4877                                  errhint("It looks like you need to recompile or initdb.")));
4878 #else
4879         if (ControlFile->float4ByVal != false)
4880                 ereport(FATAL,
4881                                 (errmsg("database files are incompatible with server"),
4882                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4883                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4884                                  errhint("It looks like you need to recompile or initdb.")));
4885 #endif
4886
4887 #ifdef USE_FLOAT8_BYVAL
4888         if (ControlFile->float8ByVal != true)
4889                 ereport(FATAL,
4890                                 (errmsg("database files are incompatible with server"),
4891                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4892                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4893                                  errhint("It looks like you need to recompile or initdb.")));
4894 #else
4895         if (ControlFile->float8ByVal != false)
4896                 ereport(FATAL,
4897                                 (errmsg("database files are incompatible with server"),
4898                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4899                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4900                                  errhint("It looks like you need to recompile or initdb.")));
4901 #endif
4902 }
4903
4904 void
4905 UpdateControlFile(void)
4906 {
4907         int                     fd;
4908
4909         INIT_CRC32(ControlFile->crc);
4910         COMP_CRC32(ControlFile->crc,
4911                            (char *) ControlFile,
4912                            offsetof(ControlFileData, crc));
4913         FIN_CRC32(ControlFile->crc);
4914
4915         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4916                                            O_RDWR | PG_BINARY,
4917                                            S_IRUSR | S_IWUSR);
4918         if (fd < 0)
4919                 ereport(PANIC,
4920                                 (errcode_for_file_access(),
4921                                  errmsg("could not open control file \"%s\": %m",
4922                                                 XLOG_CONTROL_FILE)));
4923
4924         errno = 0;
4925         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4926         {
4927                 /* if write didn't set errno, assume problem is no disk space */
4928                 if (errno == 0)
4929                         errno = ENOSPC;
4930                 ereport(PANIC,
4931                                 (errcode_for_file_access(),
4932                                  errmsg("could not write to control file: %m")));
4933         }
4934
4935         if (pg_fsync(fd) != 0)
4936                 ereport(PANIC,
4937                                 (errcode_for_file_access(),
4938                                  errmsg("could not fsync control file: %m")));
4939
4940         if (close(fd))
4941                 ereport(PANIC,
4942                                 (errcode_for_file_access(),
4943                                  errmsg("could not close control file: %m")));
4944 }
4945
4946 /*
4947  * Returns the unique system identifier from control file.
4948  */
4949 uint64
4950 GetSystemIdentifier(void)
4951 {
4952         Assert(ControlFile != NULL);
4953         return ControlFile->system_identifier;
4954 }
4955
4956 /*
4957  * Auto-tune the number of XLOG buffers.
4958  *
4959  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4960  * a maximum of one XLOG segment (there is little reason to think that more
4961  * is helpful, at least so long as we force an fsync when switching log files)
4962  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4963  * 9.1, when auto-tuning was added).
4964  *
4965  * This should not be called until NBuffers has received its final value.
4966  */
4967 static int
4968 XLOGChooseNumBuffers(void)
4969 {
4970         int                     xbuffers;
4971
4972         xbuffers = NBuffers / 32;
4973         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
4974                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
4975         if (xbuffers < 8)
4976                 xbuffers = 8;
4977         return xbuffers;
4978 }
4979
4980 /*
4981  * GUC check_hook for wal_buffers
4982  */
4983 bool
4984 check_wal_buffers(int *newval, void **extra, GucSource source)
4985 {
4986         /*
4987          * -1 indicates a request for auto-tune.
4988          */
4989         if (*newval == -1)
4990         {
4991                 /*
4992                  * If we haven't yet changed the boot_val default of -1, just let it
4993                  * be.  We'll fix it when XLOGShmemSize is called.
4994                  */
4995                 if (XLOGbuffers == -1)
4996                         return true;
4997
4998                 /* Otherwise, substitute the auto-tune value */
4999                 *newval = XLOGChooseNumBuffers();
5000         }
5001
5002         /*
5003          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
5004          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
5005          * the case, we just silently treat such values as a request for the
5006          * minimum.  (We could throw an error instead, but that doesn't seem very
5007          * helpful.)
5008          */
5009         if (*newval < 4)
5010                 *newval = 4;
5011
5012         return true;
5013 }
5014
5015 /*
5016  * Initialization of shared memory for XLOG
5017  */
5018 Size
5019 XLOGShmemSize(void)
5020 {
5021         Size            size;
5022
5023         /*
5024          * If the value of wal_buffers is -1, use the preferred auto-tune value.
5025          * This isn't an amazingly clean place to do this, but we must wait till
5026          * NBuffers has received its final value, and must do it before using the
5027          * value of XLOGbuffers to do anything important.
5028          */
5029         if (XLOGbuffers == -1)
5030         {
5031                 char            buf[32];
5032
5033                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
5034                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
5035         }
5036         Assert(XLOGbuffers > 0);
5037
5038         /* XLogCtl */
5039         size = sizeof(XLogCtlData);
5040         /* xlblocks array */
5041         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
5042         /* extra alignment padding for XLOG I/O buffers */
5043         size = add_size(size, ALIGNOF_XLOG_BUFFER);
5044         /* and the buffers themselves */
5045         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
5046
5047         /*
5048          * Note: we don't count ControlFileData, it comes out of the "slop factor"
5049          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
5050          * routine again below to compute the actual allocation size.
5051          */
5052
5053         return size;
5054 }
5055
5056 void
5057 XLOGShmemInit(void)
5058 {
5059         bool            foundCFile,
5060                                 foundXLog;
5061         char       *allocptr;
5062
5063         ControlFile = (ControlFileData *)
5064                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5065         XLogCtl = (XLogCtlData *)
5066                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
5067
5068         if (foundCFile || foundXLog)
5069         {
5070                 /* both should be present or neither */
5071                 Assert(foundCFile && foundXLog);
5072                 return;
5073         }
5074
5075         memset(XLogCtl, 0, sizeof(XLogCtlData));
5076
5077         /*
5078          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5079          * multiple of the alignment for same, so no extra alignment padding is
5080          * needed here.
5081          */
5082         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5083         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
5084         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
5085         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
5086
5087         /*
5088          * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
5089          */
5090         allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
5091         XLogCtl->pages = allocptr;
5092         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5093
5094         /*
5095          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5096          * in additional info.)
5097          */
5098         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5099         XLogCtl->SharedRecoveryInProgress = true;
5100         XLogCtl->SharedHotStandbyActive = false;
5101         XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
5102         SpinLockInit(&XLogCtl->info_lck);
5103         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
5104         InitSharedLatch(&XLogCtl->WALWriterLatch);
5105
5106         /*
5107          * If we are not in bootstrap mode, pg_control should already exist. Read
5108          * and validate it immediately (see comments in ReadControlFile() for the
5109          * reasons why).
5110          */
5111         if (!IsBootstrapProcessingMode())
5112                 ReadControlFile();
5113 }
5114
5115 /*
5116  * This func must be called ONCE on system install.  It creates pg_control
5117  * and the initial XLOG segment.
5118  */
5119 void
5120 BootStrapXLOG(void)
5121 {
5122         CheckPoint      checkPoint;
5123         char       *buffer;
5124         XLogPageHeader page;
5125         XLogLongPageHeader longpage;
5126         XLogRecord *record;
5127         bool            use_existent;
5128         uint64          sysidentifier;
5129         struct timeval tv;
5130         pg_crc32        crc;
5131
5132         /*
5133          * Select a hopefully-unique system identifier code for this installation.
5134          * We use the result of gettimeofday(), including the fractional seconds
5135          * field, as being about as unique as we can easily get.  (Think not to
5136          * use random(), since it hasn't been seeded and there's no portable way
5137          * to seed it other than the system clock value...)  The upper half of the
5138          * uint64 value is just the tv_sec part, while the lower half is the XOR
5139          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
5140          * unnecessarily if "uint64" is really only 32 bits wide.  A person
5141          * knowing this encoding can determine the initialization time of the
5142          * installation, which could perhaps be useful sometimes.
5143          */
5144         gettimeofday(&tv, NULL);
5145         sysidentifier = ((uint64) tv.tv_sec) << 32;
5146         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
5147
5148         /* First timeline ID is always 1 */
5149         ThisTimeLineID = 1;
5150
5151         /* page buffer must be aligned suitably for O_DIRECT */
5152         buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
5153         page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
5154         memset(page, 0, XLOG_BLCKSZ);
5155
5156         /*
5157          * Set up information for the initial checkpoint record
5158          *
5159          * The initial checkpoint record is written to the beginning of the WAL
5160          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5161          * used, so that we can use 0/0 to mean "before any valid WAL segment".
5162          */
5163         checkPoint.redo.xlogid = 0;
5164         checkPoint.redo.xrecoff = XLogSegSize + SizeOfXLogLongPHD;
5165         checkPoint.ThisTimeLineID = ThisTimeLineID;
5166         checkPoint.fullPageWrites = fullPageWrites;
5167         checkPoint.nextXidEpoch = 0;
5168         checkPoint.nextXid = FirstNormalTransactionId;
5169         checkPoint.nextOid = FirstBootstrapObjectId;
5170         checkPoint.nextMulti = FirstMultiXactId;
5171         checkPoint.nextMultiOffset = 0;
5172         checkPoint.oldestXid = FirstNormalTransactionId;
5173         checkPoint.oldestXidDB = TemplateDbOid;
5174         checkPoint.time = (pg_time_t) time(NULL);
5175         checkPoint.oldestActiveXid = InvalidTransactionId;
5176
5177         ShmemVariableCache->nextXid = checkPoint.nextXid;
5178         ShmemVariableCache->nextOid = checkPoint.nextOid;
5179         ShmemVariableCache->oidCount = 0;
5180         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5181         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5182
5183         /* Set up the XLOG page header */
5184         page->xlp_magic = XLOG_PAGE_MAGIC;
5185         page->xlp_info = XLP_LONG_HEADER;
5186         page->xlp_tli = ThisTimeLineID;
5187         page->xlp_pageaddr.xlogid = 0;
5188         page->xlp_pageaddr.xrecoff = XLogSegSize;
5189         longpage = (XLogLongPageHeader) page;
5190         longpage->xlp_sysid = sysidentifier;
5191         longpage->xlp_seg_size = XLogSegSize;
5192         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5193
5194         /* Insert the initial checkpoint record */
5195         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
5196         record->xl_prev.xlogid = 0;
5197         record->xl_prev.xrecoff = 0;
5198         record->xl_xid = InvalidTransactionId;
5199         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
5200         record->xl_len = sizeof(checkPoint);
5201         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5202         record->xl_rmid = RM_XLOG_ID;
5203         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
5204
5205         INIT_CRC32(crc);
5206         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
5207         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
5208                            SizeOfXLogRecord - sizeof(pg_crc32));
5209         FIN_CRC32(crc);
5210         record->xl_crc = crc;
5211
5212         /* Create first XLOG segment file */
5213         use_existent = false;
5214         openLogFile = XLogFileInit(0, 1, &use_existent, false);
5215
5216         /* Write the first page with the initial record */
5217         errno = 0;
5218         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5219         {
5220                 /* if write didn't set errno, assume problem is no disk space */
5221                 if (errno == 0)
5222                         errno = ENOSPC;
5223                 ereport(PANIC,
5224                                 (errcode_for_file_access(),
5225                           errmsg("could not write bootstrap transaction log file: %m")));
5226         }
5227
5228         if (pg_fsync(openLogFile) != 0)
5229                 ereport(PANIC,
5230                                 (errcode_for_file_access(),
5231                           errmsg("could not fsync bootstrap transaction log file: %m")));
5232
5233         if (close(openLogFile))
5234                 ereport(PANIC,
5235                                 (errcode_for_file_access(),
5236                           errmsg("could not close bootstrap transaction log file: %m")));
5237
5238         openLogFile = -1;
5239
5240         /* Now create pg_control */
5241
5242         memset(ControlFile, 0, sizeof(ControlFileData));
5243         /* Initialize pg_control status fields */
5244         ControlFile->system_identifier = sysidentifier;
5245         ControlFile->state = DB_SHUTDOWNED;
5246         ControlFile->time = checkPoint.time;
5247         ControlFile->checkPoint = checkPoint.redo;
5248         ControlFile->checkPointCopy = checkPoint;
5249
5250         /* Set important parameter values for use when replaying WAL */
5251         ControlFile->MaxConnections = MaxConnections;
5252         ControlFile->max_prepared_xacts = max_prepared_xacts;
5253         ControlFile->max_locks_per_xact = max_locks_per_xact;
5254         ControlFile->wal_level = wal_level;
5255
5256         /* some additional ControlFile fields are set in WriteControlFile() */
5257
5258         WriteControlFile();
5259
5260         /* Bootstrap the commit log, too */
5261         BootStrapCLOG();
5262         BootStrapSUBTRANS();
5263         BootStrapMultiXact();
5264
5265         pfree(buffer);
5266 }
5267
5268 static char *
5269 str_time(pg_time_t tnow)
5270 {
5271         static char buf[128];
5272
5273         pg_strftime(buf, sizeof(buf),
5274                                 "%Y-%m-%d %H:%M:%S %Z",
5275                                 pg_localtime(&tnow, log_timezone));
5276
5277         return buf;
5278 }
5279
5280 /*
5281  * See if there is a recovery command file (recovery.conf), and if so
5282  * read in parameters for archive recovery and XLOG streaming.
5283  *
5284  * The file is parsed using the main configuration parser.
5285  */
5286 static void
5287 readRecoveryCommandFile(void)
5288 {
5289         FILE       *fd;
5290         TimeLineID      rtli = 0;
5291         bool            rtliGiven = false;
5292         ConfigVariable *item,
5293                            *head = NULL,
5294                            *tail = NULL;
5295
5296         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5297         if (fd == NULL)
5298         {
5299                 if (errno == ENOENT)
5300                         return;                         /* not there, so no archive recovery */
5301                 ereport(FATAL,
5302                                 (errcode_for_file_access(),
5303                                  errmsg("could not open recovery command file \"%s\": %m",
5304                                                 RECOVERY_COMMAND_FILE)));
5305         }
5306
5307         /*
5308          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5309          * no need to check the return value.
5310          */
5311         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5312
5313         FreeFile(fd);
5314
5315         for (item = head; item; item = item->next)
5316         {
5317                 if (strcmp(item->name, "restore_command") == 0)
5318                 {
5319                         recoveryRestoreCommand = pstrdup(item->value);
5320                         ereport(DEBUG2,
5321                                         (errmsg_internal("restore_command = '%s'",
5322                                                                          recoveryRestoreCommand)));
5323                 }
5324                 else if (strcmp(item->name, "recovery_end_command") == 0)
5325                 {
5326                         recoveryEndCommand = pstrdup(item->value);
5327                         ereport(DEBUG2,
5328                                         (errmsg_internal("recovery_end_command = '%s'",
5329                                                                          recoveryEndCommand)));
5330                 }
5331                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5332                 {
5333                         archiveCleanupCommand = pstrdup(item->value);
5334                         ereport(DEBUG2,
5335                                         (errmsg_internal("archive_cleanup_command = '%s'",
5336                                                                          archiveCleanupCommand)));
5337                 }
5338                 else if (strcmp(item->name, "pause_at_recovery_target") == 0)
5339                 {
5340                         if (!parse_bool(item->value, &recoveryPauseAtTarget))
5341                                 ereport(ERROR,
5342                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5343                                                  errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
5344                         ereport(DEBUG2,
5345                                         (errmsg_internal("pause_at_recovery_target = '%s'",
5346                                                                          item->value)));
5347                 }
5348                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5349                 {
5350                         rtliGiven = true;
5351                         if (strcmp(item->value, "latest") == 0)
5352                                 rtli = 0;
5353                         else
5354                         {
5355                                 errno = 0;
5356                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5357                                 if (errno == EINVAL || errno == ERANGE)
5358                                         ereport(FATAL,
5359                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5360                                                                         item->value)));
5361                         }
5362                         if (rtli)
5363                                 ereport(DEBUG2,
5364                                                 (errmsg_internal("recovery_target_timeline = %u", rtli)));
5365                         else
5366                                 ereport(DEBUG2,
5367                                                 (errmsg_internal("recovery_target_timeline = latest")));
5368                 }
5369                 else if (strcmp(item->name, "recovery_target_xid") == 0)
5370                 {
5371                         errno = 0;
5372                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5373                         if (errno == EINVAL || errno == ERANGE)
5374                                 ereport(FATAL,
5375                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
5376                                                  item->value)));
5377                         ereport(DEBUG2,
5378                                         (errmsg_internal("recovery_target_xid = %u",
5379                                                                          recoveryTargetXid)));
5380                         recoveryTarget = RECOVERY_TARGET_XID;
5381                 }
5382                 else if (strcmp(item->name, "recovery_target_time") == 0)
5383                 {
5384                         /*
5385                          * if recovery_target_xid or recovery_target_name specified, then
5386                          * this overrides recovery_target_time
5387                          */
5388                         if (recoveryTarget == RECOVERY_TARGET_XID ||
5389                                 recoveryTarget == RECOVERY_TARGET_NAME)
5390                                 continue;
5391                         recoveryTarget = RECOVERY_TARGET_TIME;
5392
5393                         /*
5394                          * Convert the time string given by the user to TimestampTz form.
5395                          */
5396                         recoveryTargetTime =
5397                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5398                                                                                                 CStringGetDatum(item->value),
5399                                                                                                 ObjectIdGetDatum(InvalidOid),
5400                                                                                                                 Int32GetDatum(-1)));
5401                         ereport(DEBUG2,
5402                                         (errmsg_internal("recovery_target_time = '%s'",
5403                                                                          timestamptz_to_str(recoveryTargetTime))));
5404                 }
5405                 else if (strcmp(item->name, "recovery_target_name") == 0)
5406                 {
5407                         /*
5408                          * if recovery_target_xid specified, then this overrides
5409                          * recovery_target_name
5410                          */
5411                         if (recoveryTarget == RECOVERY_TARGET_XID)
5412                                 continue;
5413                         recoveryTarget = RECOVERY_TARGET_NAME;
5414
5415                         recoveryTargetName = pstrdup(item->value);
5416                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5417                                 ereport(FATAL,
5418                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5419                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
5420                                                                 MAXFNAMELEN - 1)));
5421
5422                         ereport(DEBUG2,
5423                                         (errmsg_internal("recovery_target_name = '%s'",
5424                                                                          recoveryTargetName)));
5425                 }
5426                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5427                 {
5428                         /*
5429                          * does nothing if a recovery_target is not also set
5430                          */
5431                         if (!parse_bool(item->value, &recoveryTargetInclusive))
5432                                 ereport(ERROR,
5433                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5434                                                  errmsg("parameter \"%s\" requires a Boolean value",
5435                                                                 "recovery_target_inclusive")));
5436                         ereport(DEBUG2,
5437                                         (errmsg_internal("recovery_target_inclusive = %s",
5438                                                                          item->value)));
5439                 }
5440                 else if (strcmp(item->name, "standby_mode") == 0)
5441                 {
5442                         if (!parse_bool(item->value, &StandbyMode))
5443                                 ereport(ERROR,
5444                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5445                                                  errmsg("parameter \"%s\" requires a Boolean value",
5446                                                                 "standby_mode")));
5447                         ereport(DEBUG2,
5448                                         (errmsg_internal("standby_mode = '%s'", item->value)));
5449                 }
5450                 else if (strcmp(item->name, "primary_conninfo") == 0)
5451                 {
5452                         PrimaryConnInfo = pstrdup(item->value);
5453                         ereport(DEBUG2,
5454                                         (errmsg_internal("primary_conninfo = '%s'",
5455                                                                          PrimaryConnInfo)));
5456                 }
5457                 else if (strcmp(item->name, "trigger_file") == 0)
5458                 {
5459                         TriggerFile = pstrdup(item->value);
5460                         ereport(DEBUG2,
5461                                         (errmsg_internal("trigger_file = '%s'",
5462                                                                          TriggerFile)));
5463                 }
5464                 else
5465                         ereport(FATAL,
5466                                         (errmsg("unrecognized recovery parameter \"%s\"",
5467                                                         item->name)));
5468         }
5469
5470         /*
5471          * Check for compulsory parameters
5472          */
5473         if (StandbyMode)
5474         {
5475                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5476                         ereport(WARNING,
5477                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5478                                                         RECOVERY_COMMAND_FILE),
5479                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
5480         }
5481         else
5482         {
5483                 if (recoveryRestoreCommand == NULL)
5484                         ereport(FATAL,
5485                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5486                                                         RECOVERY_COMMAND_FILE)));
5487         }
5488
5489         /* Enable fetching from archive recovery area */
5490         InArchiveRecovery = true;
5491
5492         /*
5493          * If user specified recovery_target_timeline, validate it or compute the
5494          * "latest" value.      We can't do this until after we've gotten the restore
5495          * command and set InArchiveRecovery, because we need to fetch timeline
5496          * history files from the archive.
5497          */
5498         if (rtliGiven)
5499         {
5500                 if (rtli)
5501                 {
5502                         /* Timeline 1 does not have a history file, all else should */
5503                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5504                                 ereport(FATAL,
5505                                                 (errmsg("recovery target timeline %u does not exist",
5506                                                                 rtli)));
5507                         recoveryTargetTLI = rtli;
5508                         recoveryTargetIsLatest = false;
5509                 }
5510                 else
5511                 {
5512                         /* We start the "latest" search from pg_control's timeline */
5513                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5514                         recoveryTargetIsLatest = true;
5515                 }
5516         }
5517
5518         FreeConfigVariables(head);
5519 }
5520
5521 /*
5522  * Exit archive-recovery state
5523  */
5524 static void
5525 exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
5526 {
5527         char            recoveryPath[MAXPGPATH];
5528         char            xlogpath[MAXPGPATH];
5529
5530         /*
5531          * We are no longer in archive recovery state.
5532          */
5533         InArchiveRecovery = false;
5534
5535         /*
5536          * Update min recovery point one last time.
5537          */
5538         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5539
5540         /*
5541          * If the ending log segment is still open, close it (to avoid problems on
5542          * Windows with trying to rename or delete an open file).
5543          */
5544         if (readFile >= 0)
5545         {
5546                 close(readFile);
5547                 readFile = -1;
5548         }
5549
5550         /*
5551          * If we are establishing a new timeline, we have to copy data from
5552          * the last WAL segment of the old timeline to create a starting WAL
5553          * segment for the new timeline.
5554          *
5555          * Notify the archiver that the last WAL segment of the old timeline
5556          * is ready to copy to archival storage. Otherwise, it is not archived
5557          * for a while.
5558          */
5559         if (endTLI != ThisTimeLineID)
5560         {
5561                 XLogFileCopy(endLogId, endLogSeg,
5562                                          endTLI, endLogId, endLogSeg);
5563
5564                 if (XLogArchivingActive())
5565                 {
5566                         XLogFileName(xlogpath, endTLI, endLogId, endLogSeg);
5567                         XLogArchiveNotify(xlogpath);
5568                 }
5569         }
5570
5571         /*
5572          * Let's just make real sure there are not .ready or .done flags posted
5573          * for the new segment.
5574          */
5575         XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
5576         XLogArchiveCleanup(xlogpath);
5577
5578         /*
5579          * Since there might be a partial WAL segment named RECOVERYXLOG,
5580          * get rid of it.
5581          */
5582         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5583         unlink(recoveryPath);           /* ignore any error */
5584
5585         /* Get rid of any remaining recovered timeline-history file, too */
5586         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5587         unlink(recoveryPath);           /* ignore any error */
5588
5589         /*
5590          * Rename the config file out of the way, so that we don't accidentally
5591          * re-enter archive recovery mode in a subsequent crash.
5592          */
5593         unlink(RECOVERY_COMMAND_DONE);
5594         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5595                 ereport(FATAL,
5596                                 (errcode_for_file_access(),
5597                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5598                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5599
5600         ereport(LOG,
5601                         (errmsg("archive recovery complete")));
5602 }
5603
5604 /*
5605  * For point-in-time recovery, this function decides whether we want to
5606  * stop applying the XLOG at or after the current record.
5607  *
5608  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
5609  * *includeThis is set TRUE if we should apply this record before stopping.
5610  *
5611  * We also track the timestamp of the latest applied COMMIT/ABORT
5612  * record in XLogCtl->recoveryLastXTime, for logging purposes.
5613  * Also, some information is saved in recoveryStopXid et al for use in
5614  * annotating the new timeline's history file.
5615  */
5616 static bool
5617 recoveryStopsHere(XLogRecord *record, bool *includeThis)
5618 {
5619         bool            stopsHere;
5620         uint8           record_info;
5621         TimestampTz recordXtime;
5622         char            recordRPName[MAXFNAMELEN];
5623
5624         /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
5625         if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
5626                 return false;
5627         record_info = record->xl_info & ~XLR_INFO_MASK;
5628         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
5629         {
5630                 xl_xact_commit_compact *recordXactCommitData;
5631
5632                 recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
5633                 recordXtime = recordXactCommitData->xact_time;
5634         }
5635         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
5636         {
5637                 xl_xact_commit *recordXactCommitData;
5638
5639                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
5640                 recordXtime = recordXactCommitData->xact_time;
5641         }
5642         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
5643         {
5644                 xl_xact_abort *recordXactAbortData;
5645
5646                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
5647                 recordXtime = recordXactAbortData->xact_time;
5648         }
5649         else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5650         {
5651                 xl_restore_point *recordRestorePointData;
5652
5653                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5654                 recordXtime = recordRestorePointData->rp_time;
5655                 strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
5656         }
5657         else
5658                 return false;
5659
5660         /* Do we have a PITR target at all? */
5661         if (recoveryTarget == RECOVERY_TARGET_UNSET)
5662         {
5663                 /*
5664                  * Save timestamp of latest transaction commit/abort if this is a
5665                  * transaction record
5666                  */
5667                 if (record->xl_rmid == RM_XACT_ID)
5668                         SetLatestXTime(recordXtime);
5669                 return false;
5670         }
5671
5672         if (recoveryTarget == RECOVERY_TARGET_XID)
5673         {
5674                 /*
5675                  * There can be only one transaction end record with this exact
5676                  * transactionid
5677                  *
5678                  * when testing for an xid, we MUST test for equality only, since
5679                  * transactions are numbered in the order they start, not the order
5680                  * they complete. A higher numbered xid will complete before you about
5681                  * 50% of the time...
5682                  */
5683                 stopsHere = (record->xl_xid == recoveryTargetXid);
5684                 if (stopsHere)
5685                         *includeThis = recoveryTargetInclusive;
5686         }
5687         else if (recoveryTarget == RECOVERY_TARGET_NAME)
5688         {
5689                 /*
5690                  * There can be many restore points that share the same name, so we
5691                  * stop at the first one
5692                  */
5693                 stopsHere = (strcmp(recordRPName, recoveryTargetName) == 0);
5694
5695                 /*
5696                  * Ignore recoveryTargetInclusive because this is not a transaction
5697                  * record
5698                  */
5699                 *includeThis = false;
5700         }
5701         else
5702         {
5703                 /*
5704                  * There can be many transactions that share the same commit time, so
5705                  * we stop after the last one, if we are inclusive, or stop at the
5706                  * first one if we are exclusive
5707                  */
5708                 if (recoveryTargetInclusive)
5709                         stopsHere = (recordXtime > recoveryTargetTime);
5710                 else
5711                         stopsHere = (recordXtime >= recoveryTargetTime);
5712                 if (stopsHere)
5713                         *includeThis = false;
5714         }
5715
5716         if (stopsHere)
5717         {
5718                 recoveryStopXid = record->xl_xid;
5719                 recoveryStopTime = recordXtime;
5720                 recoveryStopAfter = *includeThis;
5721
5722                 if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5723                 {
5724                         if (recoveryStopAfter)
5725                                 ereport(LOG,
5726                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5727                                                                 recoveryStopXid,
5728                                                                 timestamptz_to_str(recoveryStopTime))));
5729                         else
5730                                 ereport(LOG,
5731                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
5732                                                                 recoveryStopXid,
5733                                                                 timestamptz_to_str(recoveryStopTime))));
5734                 }
5735                 else if (record_info == XLOG_XACT_ABORT)
5736                 {
5737                         if (recoveryStopAfter)
5738                                 ereport(LOG,
5739                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5740                                                                 recoveryStopXid,
5741                                                                 timestamptz_to_str(recoveryStopTime))));
5742                         else
5743                                 ereport(LOG,
5744                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
5745                                                                 recoveryStopXid,
5746                                                                 timestamptz_to_str(recoveryStopTime))));
5747                 }
5748                 else
5749                 {
5750                         strncpy(recoveryStopName, recordRPName, MAXFNAMELEN);
5751
5752                         ereport(LOG,
5753                                 (errmsg("recovery stopping at restore point \"%s\", time %s",
5754                                                 recoveryStopName,
5755                                                 timestamptz_to_str(recoveryStopTime))));
5756                 }
5757
5758                 /*
5759                  * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
5760                  * restore point since they are timestamped, though the latest
5761                  * transaction time is not updated.
5762                  */
5763                 if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
5764                         SetLatestXTime(recordXtime);
5765         }
5766         else if (record->xl_rmid == RM_XACT_ID)
5767                 SetLatestXTime(recordXtime);
5768
5769         return stopsHere;
5770 }
5771
5772 /*
5773  * Recheck shared recoveryPause by polling.
5774  *
5775  * XXX Can also be done with shared latch.
5776  */
5777 static void
5778 recoveryPausesHere(void)
5779 {
5780         ereport(LOG,
5781                         (errmsg("recovery has paused"),
5782                          errhint("Execute pg_xlog_replay_resume() to continue.")));
5783
5784         while (RecoveryIsPaused())
5785         {
5786                 pg_usleep(1000000L);    /* 1000 ms */
5787                 HandleStartupProcInterrupts();
5788         }
5789 }
5790
5791 bool
5792 RecoveryIsPaused(void)
5793 {
5794         /* use volatile pointer to prevent code rearrangement */
5795         volatile XLogCtlData *xlogctl = XLogCtl;
5796         bool            recoveryPause;
5797
5798         SpinLockAcquire(&xlogctl->info_lck);
5799         recoveryPause = xlogctl->recoveryPause;
5800         SpinLockRelease(&xlogctl->info_lck);
5801
5802         return recoveryPause;
5803 }
5804
5805 void
5806 SetRecoveryPause(bool recoveryPause)
5807 {
5808         /* use volatile pointer to prevent code rearrangement */
5809         volatile XLogCtlData *xlogctl = XLogCtl;
5810
5811         SpinLockAcquire(&xlogctl->info_lck);
5812         xlogctl->recoveryPause = recoveryPause;
5813         SpinLockRelease(&xlogctl->info_lck);
5814 }
5815
5816 /*
5817  * Save timestamp of latest processed commit/abort record.
5818  *
5819  * We keep this in XLogCtl, not a simple static variable, so that it can be
5820  * seen by processes other than the startup process.  Note in particular
5821  * that CreateRestartPoint is executed in the checkpointer.
5822  */
5823 static void
5824 SetLatestXTime(TimestampTz xtime)
5825 {
5826         /* use volatile pointer to prevent code rearrangement */
5827         volatile XLogCtlData *xlogctl = XLogCtl;
5828
5829         SpinLockAcquire(&xlogctl->info_lck);
5830         xlogctl->recoveryLastXTime = xtime;
5831         SpinLockRelease(&xlogctl->info_lck);
5832 }
5833
5834 /*
5835  * Fetch timestamp of latest processed commit/abort record.
5836  */
5837 TimestampTz
5838 GetLatestXTime(void)
5839 {
5840         /* use volatile pointer to prevent code rearrangement */
5841         volatile XLogCtlData *xlogctl = XLogCtl;
5842         TimestampTz xtime;
5843
5844         SpinLockAcquire(&xlogctl->info_lck);
5845         xtime = xlogctl->recoveryLastXTime;
5846         SpinLockRelease(&xlogctl->info_lck);
5847
5848         return xtime;
5849 }
5850
5851 /*
5852  * Save timestamp of the next chunk of WAL records to apply.
5853  *
5854  * We keep this in XLogCtl, not a simple static variable, so that it can be
5855  * seen by all backends.
5856  */
5857 static void
5858 SetCurrentChunkStartTime(TimestampTz xtime)
5859 {
5860         /* use volatile pointer to prevent code rearrangement */
5861         volatile XLogCtlData *xlogctl = XLogCtl;
5862
5863         SpinLockAcquire(&xlogctl->info_lck);
5864         xlogctl->currentChunkStartTime = xtime;
5865         SpinLockRelease(&xlogctl->info_lck);
5866 }
5867
5868 /*
5869  * Fetch timestamp of latest processed commit/abort record.
5870  * Startup process maintains an accurate local copy in XLogReceiptTime
5871  */
5872 TimestampTz
5873 GetCurrentChunkReplayStartTime(void)
5874 {
5875         /* use volatile pointer to prevent code rearrangement */
5876         volatile XLogCtlData *xlogctl = XLogCtl;
5877         TimestampTz xtime;
5878
5879         SpinLockAcquire(&xlogctl->info_lck);
5880         xtime = xlogctl->currentChunkStartTime;
5881         SpinLockRelease(&xlogctl->info_lck);
5882
5883         return xtime;
5884 }
5885
5886 /*
5887  * Returns time of receipt of current chunk of XLOG data, as well as
5888  * whether it was received from streaming replication or from archives.
5889  */
5890 void
5891 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
5892 {
5893         /*
5894          * This must be executed in the startup process, since we don't export the
5895          * relevant state to shared memory.
5896          */
5897         Assert(InRecovery);
5898
5899         *rtime = XLogReceiptTime;
5900         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
5901 }
5902
/*
 * Raise an ERROR if currValue is lower than minValue, naming the offending
 * parameter in the message.
 *
 * Note that text field supplied is a parameter name and does not require
 * translation
 *
 * The value arguments are parenthesized so that expression arguments are
 * compared as a whole.  They are evaluated a second time when the error is
 * reported, so they should be free of side effects.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((currValue) < (minValue)) \
		ereport(ERROR, \
				(errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						param_name, \
						(currValue), \
						(minValue)))); \
} while (0)
5918
5919 /*
5920  * Check to see if required parameters are set high enough on this server
5921  * for various aspects of recovery operation.
5922  */
5923 static void
5924 CheckRequiredParameterValues(void)
5925 {
5926         /*
5927          * For archive recovery, the WAL must be generated with at least 'archive'
5928          * wal_level.
5929          */
5930         if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
5931         {
5932                 ereport(WARNING,
5933                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
5934                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
5935         }
5936
5937         /*
5938          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
5939          * we must have at least as many backend slots as the primary.
5940          */
5941         if (InArchiveRecovery && EnableHotStandby)
5942         {
5943                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
5944                         ereport(ERROR,
5945                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
5946                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
5947
5948                 /* We ignore autovacuum_max_workers when we make this test. */
5949                 RecoveryRequiresIntParameter("max_connections",
5950                                                                          MaxConnections,
5951                                                                          ControlFile->MaxConnections);
5952                 RecoveryRequiresIntParameter("max_prepared_xacts",
5953                                                                          max_prepared_xacts,
5954                                                                          ControlFile->max_prepared_xacts);
5955                 RecoveryRequiresIntParameter("max_locks_per_xact",
5956                                                                          max_locks_per_xact,
5957                                                                          ControlFile->max_locks_per_xact);
5958         }
5959 }
5960
5961 /*
5962  * This must be called ONCE during postmaster or standalone-backend startup
5963  */
5964 void
5965 StartupXLOG(void)
5966 {
5967         XLogCtlInsert *Insert;
5968         CheckPoint      checkPoint;
5969         bool            wasShutdown;
5970         bool            reachedStopPoint = false;
5971         bool            haveBackupLabel = false;
5972         XLogRecPtr      RecPtr,
5973                                 checkPointLoc,
5974                                 EndOfLog;
5975         uint32          endLogId;
5976         uint32          endLogSeg;
5977         XLogRecord *record;
5978         uint32          freespace;
5979         TransactionId oldestActiveXID;
5980         bool            backupEndRequired = false;
5981         bool            backupFromStandby = false;
5982         DBState         dbstate_at_startup;
5983
5984         /*
5985          * Read control file and check XLOG status looks valid.
5986          *
5987          * Note: in most control paths, *ControlFile is already valid and we need
5988          * not do ReadControlFile() here, but might as well do it to be sure.
5989          */
5990         ReadControlFile();
5991
5992         if (ControlFile->state < DB_SHUTDOWNED ||
5993                 ControlFile->state > DB_IN_PRODUCTION ||
5994                 !XRecOffIsValid(ControlFile->checkPoint.xrecoff))
5995                 ereport(FATAL,
5996                                 (errmsg("control file contains invalid data")));
5997
5998         if (ControlFile->state == DB_SHUTDOWNED)
5999                 ereport(LOG,
6000                                 (errmsg("database system was shut down at %s",
6001                                                 str_time(ControlFile->time))));
6002         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6003                 ereport(LOG,
6004                                 (errmsg("database system was shut down in recovery at %s",
6005                                                 str_time(ControlFile->time))));
6006         else if (ControlFile->state == DB_SHUTDOWNING)
6007                 ereport(LOG,
6008                                 (errmsg("database system shutdown was interrupted; last known up at %s",
6009                                                 str_time(ControlFile->time))));
6010         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6011                 ereport(LOG,
6012                    (errmsg("database system was interrupted while in recovery at %s",
6013                                    str_time(ControlFile->time)),
6014                         errhint("This probably means that some data is corrupted and"
6015                                         " you will have to use the last backup for recovery.")));
6016         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6017                 ereport(LOG,
6018                                 (errmsg("database system was interrupted while in recovery at log time %s",
6019                                                 str_time(ControlFile->checkPointCopy.time)),
6020                                  errhint("If this has occurred more than once some data might be corrupted"
6021                           " and you might need to choose an earlier recovery target.")));
6022         else if (ControlFile->state == DB_IN_PRODUCTION)
6023                 ereport(LOG,
6024                           (errmsg("database system was interrupted; last known up at %s",
6025                                           str_time(ControlFile->time))));
6026
6027         /* This is just to allow attaching to startup process with a debugger */
6028 #ifdef XLOG_REPLAY_DELAY
6029         if (ControlFile->state != DB_SHUTDOWNED)
6030                 pg_usleep(60000000L);
6031 #endif
6032
6033         /*
6034          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
6035          * someone has performed a copy for PITR, these directories may have been
6036          * excluded and need to be re-created.
6037          */
6038         ValidateXLOGDirectoryStructure();
6039
6040         /*
6041          * Clear out any old relcache cache files.      This is *necessary* if we do
6042          * any WAL replay, since that would probably result in the cache files
6043          * being out of sync with database reality.  In theory we could leave them
6044          * in place if the database had been cleanly shut down, but it seems
6045          * safest to just remove them always and let them be rebuilt during the
6046          * first backend startup.
6047          */
6048         RelationCacheInitFileRemove();
6049
6050         /*
6051          * Initialize on the assumption we want to recover to the same timeline
6052          * that's active according to pg_control.
6053          */
6054         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6055
6056         /*
6057          * Check for recovery control file, and if so set up state for offline
6058          * recovery
6059          */
6060         readRecoveryCommandFile();
6061
6062         /* Now we can determine the list of expected TLIs */
6063         expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
6064
6065         /*
6066          * If pg_control's timeline is not in expectedTLIs, then we cannot
6067          * proceed: the backup is not part of the history of the requested
6068          * timeline.
6069          */
6070         if (!list_member_int(expectedTLIs,
6071                                                  (int) ControlFile->checkPointCopy.ThisTimeLineID))
6072                 ereport(FATAL,
6073                                 (errmsg("requested timeline %u is not a child of database system timeline %u",
6074                                                 recoveryTargetTLI,
6075                                                 ControlFile->checkPointCopy.ThisTimeLineID)));
6076
6077         /*
6078          * Save the selected recovery target timeline ID and
6079          * archive_cleanup_command in shared memory so that other processes can
6080          * see them
6081          */
6082         XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;
6083         strncpy(XLogCtl->archiveCleanupCommand,
6084                         archiveCleanupCommand ? archiveCleanupCommand : "",
6085                         sizeof(XLogCtl->archiveCleanupCommand));
6086
6087         if (InArchiveRecovery)
6088         {
6089                 if (StandbyMode)
6090                         ereport(LOG,
6091                                         (errmsg("entering standby mode")));
6092                 else if (recoveryTarget == RECOVERY_TARGET_XID)
6093                         ereport(LOG,
6094                                         (errmsg("starting point-in-time recovery to XID %u",
6095                                                         recoveryTargetXid)));
6096                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6097                         ereport(LOG,
6098                                         (errmsg("starting point-in-time recovery to %s",
6099                                                         timestamptz_to_str(recoveryTargetTime))));
6100                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6101                         ereport(LOG,
6102                                         (errmsg("starting point-in-time recovery to \"%s\"",
6103                                                         recoveryTargetName)));
6104                 else
6105                         ereport(LOG,
6106                                         (errmsg("starting archive recovery")));
6107         }
6108
6109         /*
6110          * Take ownership of the wakeup latch if we're going to sleep during
6111          * recovery.
6112          */
6113         if (StandbyMode)
6114                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6115
6116         if (read_backup_label(&checkPointLoc, &backupEndRequired,
6117                                                   &backupFromStandby))
6118         {
6119                 /*
6120                  * When a backup_label file is present, we want to roll forward from
6121                  * the checkpoint it identifies, rather than using pg_control.
6122                  */
6123                 record = ReadCheckpointRecord(checkPointLoc, 0);
6124                 if (record != NULL)
6125                 {
6126                         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6127                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6128                         ereport(DEBUG1,
6129                                         (errmsg("checkpoint record is at %X/%X",
6130                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
6131                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
6132
6133                         /*
6134                          * Make sure that REDO location exists. This may not be the case
6135                          * if there was a crash during an online backup, which left a
6136                          * backup_label around that references a WAL segment that's
6137                          * already been archived.
6138                          */
6139                         if (XLByteLT(checkPoint.redo, checkPointLoc))
6140                         {
6141                                 if (!ReadRecord(&(checkPoint.redo), LOG, false))
6142                                         ereport(FATAL,
6143                                                         (errmsg("could not find redo location referenced by checkpoint record"),
6144                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6145                         }
6146                 }
6147                 else
6148                 {
6149                         ereport(FATAL,
6150                                         (errmsg("could not locate required checkpoint record"),
6151                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6152                         wasShutdown = false;    /* keep compiler quiet */
6153                 }
6154                 /* set flag to delete it later */
6155                 haveBackupLabel = true;
6156         }
6157         else
6158         {
6159                 /*
6160                  * Get the last valid checkpoint record.  If the latest one according
6161                  * to pg_control is broken, try the next-to-last one.
6162                  */
6163                 checkPointLoc = ControlFile->checkPoint;
6164                 RedoStartLSN = ControlFile->checkPointCopy.redo;
6165                 record = ReadCheckpointRecord(checkPointLoc, 1);
6166                 if (record != NULL)
6167                 {
6168                         ereport(DEBUG1,
6169                                         (errmsg("checkpoint record is at %X/%X",
6170                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
6171                 }
6172                 else if (StandbyMode)
6173                 {
6174                         /*
6175                          * The last valid checkpoint record required for a streaming
6176                          * recovery exists in neither standby nor the primary.
6177                          */
6178                         ereport(PANIC,
6179                                         (errmsg("could not locate a valid checkpoint record")));
6180                 }
6181                 else
6182                 {
6183                         checkPointLoc = ControlFile->prevCheckPoint;
6184                         record = ReadCheckpointRecord(checkPointLoc, 2);
6185                         if (record != NULL)
6186                         {
6187                                 ereport(LOG,
6188                                                 (errmsg("using previous checkpoint record at %X/%X",
6189                                                           checkPointLoc.xlogid, checkPointLoc.xrecoff)));
6190                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
6191                         }
6192                         else
6193                                 ereport(PANIC,
6194                                          (errmsg("could not locate a valid checkpoint record")));
6195                 }
6196                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6197                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6198         }
6199
6200         LastRec = RecPtr = checkPointLoc;
6201
6202         ereport(DEBUG1,
6203                         (errmsg("redo record is at %X/%X; shutdown %s",
6204                                         checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
6205                                         wasShutdown ? "TRUE" : "FALSE")));
6206         ereport(DEBUG1,
6207                         (errmsg("next transaction ID: %u/%u; next OID: %u",
6208                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
6209                                         checkPoint.nextOid)));
6210         ereport(DEBUG1,
6211                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
6212                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6213         ereport(DEBUG1,
6214                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
6215                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
6216         if (!TransactionIdIsNormal(checkPoint.nextXid))
6217                 ereport(PANIC,
6218                                 (errmsg("invalid next transaction ID")));
6219
6220         ShmemVariableCache->nextXid = checkPoint.nextXid;
6221         ShmemVariableCache->nextOid = checkPoint.nextOid;
6222         ShmemVariableCache->oidCount = 0;
6223         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6224         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6225
6226         /*
6227          * We must replay WAL entries using the same TimeLineID they were created
6228          * under, so temporarily adopt the TLI indicated by the checkpoint (see
6229          * also xlog_redo()).
6230          */
6231         ThisTimeLineID = checkPoint.ThisTimeLineID;
6232
6233         lastFullPageWrites = checkPoint.fullPageWrites;
6234
6235         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6236
6237         if (XLByteLT(RecPtr, checkPoint.redo))
6238                 ereport(PANIC,
6239                                 (errmsg("invalid redo in checkpoint record")));
6240
6241         /*
6242          * Check whether we need to force recovery from WAL.  If it appears to
6243          * have been a clean shutdown and we did not have a recovery.conf file,
6244          * then assume no recovery needed.
6245          */
6246         if (XLByteLT(checkPoint.redo, RecPtr))
6247         {
6248                 if (wasShutdown)
6249                         ereport(PANIC,
6250                                         (errmsg("invalid redo record in shutdown checkpoint")));
6251                 InRecovery = true;
6252         }
6253         else if (ControlFile->state != DB_SHUTDOWNED)
6254                 InRecovery = true;
6255         else if (InArchiveRecovery)
6256         {
6257                 /* force recovery due to presence of recovery.conf */
6258                 InRecovery = true;
6259         }
6260
6261         /* REDO */
6262         if (InRecovery)
6263         {
6264                 int                     rmid;
6265
6266                 /* use volatile pointer to prevent code rearrangement */
6267                 volatile XLogCtlData *xlogctl = XLogCtl;
6268
6269                 /*
6270                  * Update pg_control to show that we are recovering and to show the
6271                  * selected checkpoint as the place we are starting from. We also mark
6272                  * pg_control with any minimum recovery stop point obtained from a
6273                  * backup history file.
6274                  */
6275                 dbstate_at_startup = ControlFile->state;
6276                 if (InArchiveRecovery)
6277                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6278                 else
6279                 {
6280                         ereport(LOG,
6281                                         (errmsg("database system was not properly shut down; "
6282                                                         "automatic recovery in progress")));
6283                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6284                 }
6285                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6286                 ControlFile->checkPoint = checkPointLoc;
6287                 ControlFile->checkPointCopy = checkPoint;
6288                 if (InArchiveRecovery)
6289                 {
6290                         /* initialize minRecoveryPoint if not set yet */
6291                         if (XLByteLT(ControlFile->minRecoveryPoint, checkPoint.redo))
6292                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6293                 }
6294
6295                 /*
6296                  * Set backupStartPoint if we're starting recovery from a base backup.
6297                  *
6298                  * Set backupEndPoint and use minRecoveryPoint as the backup end location
6299                  * if we're starting recovery from a base backup which was taken from
6300                  * the standby. In this case, the database system status in pg_control must
6301                  * indicate DB_IN_ARCHIVE_RECOVERY. If it does not, the backup is
6302                  * corrupted, so we cancel recovery.
6303                  */
6304                 if (haveBackupLabel)
6305                 {
6306                         ControlFile->backupStartPoint = checkPoint.redo;
6307                         ControlFile->backupEndRequired = backupEndRequired;
6308
6309                         if (backupFromStandby)
6310                         {
6311                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
6312                                         ereport(FATAL,
6313                                                         (errmsg("backup_label contains inconsistent data with control file"),
6314                                                          errhint("This means that the backup is corrupted and you will "
6315                                                                          "have to use another backup for recovery.")));
6316                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6317                         }
6318                 }
6319                 ControlFile->time = (pg_time_t) time(NULL);
6320                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6321                 UpdateControlFile();
6322
6323                 /* initialize our local copy of minRecoveryPoint */
6324                 minRecoveryPoint = ControlFile->minRecoveryPoint;
6325
6326                 /*
6327                  * Reset pgstat data, because it may be invalid after recovery.
6328                  */
6329                 pgstat_reset_all();
6330
6331                 /*
6332                  * If there was a backup label file, it's done its job and the info
6333                  * has now been propagated into pg_control.  We must get rid of the
6334                  * label file so that if we crash during recovery, we'll pick up at
6335                  * the latest recovery restartpoint instead of going all the way back
6336                  * to the backup start point.  It seems prudent though to just rename
6337                  * the file out of the way rather than delete it completely.
6338                  */
6339                 if (haveBackupLabel)
6340                 {
6341                         unlink(BACKUP_LABEL_OLD);
6342                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
6343                                 ereport(FATAL,
6344                                                 (errcode_for_file_access(),
6345                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
6346                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
6347                 }
6348
6349                 /* Check that the GUCs used to generate the WAL allow recovery */
6350                 CheckRequiredParameterValues();
6351
6352                 /*
6353                  * We're in recovery, so unlogged relations may be trashed
6354                  * and must be reset.  This should be done BEFORE allowing Hot Standby
6355                  * connections, so that read-only backends don't try to read whatever
6356                  * garbage is left over from before.
6357                  */
6358                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6359
6360                 /*
6361                  * Likewise, delete any saved transaction snapshot files that got
6362                  * left behind by crashed backends.
6363                  */
6364                 DeleteAllExportedSnapshotFiles();
6365
6366                 /*
6367                  * Initialize for Hot Standby, if enabled. We won't let backends in
6368                  * yet, not until we've reached the min recovery point specified in
6369                  * control file and we've established a recovery snapshot from a
6370                  * running-xacts WAL record.
6371                  */
6372                 if (InArchiveRecovery && EnableHotStandby)
6373                 {
6374                         TransactionId *xids;
6375                         int                     nxids;
6376
6377                         ereport(DEBUG1,
6378                                         (errmsg("initializing for hot standby")));
6379
6380                         InitRecoveryTransactionEnvironment();
6381
6382                         if (wasShutdown)
6383                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6384                         else
6385                                 oldestActiveXID = checkPoint.oldestActiveXid;
6386                         Assert(TransactionIdIsValid(oldestActiveXID));
6387
6388                         /*
6389                          * Startup commit log and subtrans only. Other SLRUs are not
6390                          * maintained during recovery and need not be started yet.
6391                          */
6392                         StartupCLOG();
6393                         StartupSUBTRANS(oldestActiveXID);
6394
6395                         /*
6396                          * If we're beginning at a shutdown checkpoint, we know that
6397                          * nothing was running on the master at this point. So fake-up an
6398                          * empty running-xacts record and use that here and now. Recover
6399                          * additional standby state for prepared transactions.
6400                          */
6401                         if (wasShutdown)
6402                         {
6403                                 RunningTransactionsData running;
6404                                 TransactionId latestCompletedXid;
6405
6406                                 /*
6407                                  * Construct a RunningTransactions snapshot representing a
6408                                  * shut down server, with only prepared transactions still
6409                                  * alive. We're never overflowed at this point because all
6410                                  * subxids are listed with their parent prepared transactions.
6411                                  */
6412                                 running.xcnt = nxids;
6413                                 running.subxid_overflow = false;
6414                                 running.nextXid = checkPoint.nextXid;
6415                                 running.oldestRunningXid = oldestActiveXID;
6416                                 latestCompletedXid = checkPoint.nextXid;
6417                                 TransactionIdRetreat(latestCompletedXid);
6418                                 Assert(TransactionIdIsNormal(latestCompletedXid));
6419                                 running.latestCompletedXid = latestCompletedXid;
6420                                 running.xids = xids;
6421
6422                                 ProcArrayApplyRecoveryInfo(&running);
6423
6424                                 StandbyRecoverPreparedTransactions(false);
6425                         }
6426                 }
6427
6428                 /* Initialize resource managers */
6429                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6430                 {
6431                         if (RmgrTable[rmid].rm_startup != NULL)
6432                                 RmgrTable[rmid].rm_startup();
6433                 }
6434
6435                 /*
6436                  * Initialize shared replayEndRecPtr, recoveryLastRecPtr, and
6437                  * recoveryLastXTime.
6438                  *
6439                  * This is slightly confusing if we're starting from an online
6440                  * checkpoint; we've just read and replayed the checkpoint record, but
6441                  * we're going to start replay from its redo pointer, which precedes
6442                  * the location of the checkpoint record itself. So even though the
6443                  * last record we've replayed is indeed ReadRecPtr, we haven't
6444                  * replayed all the preceding records yet. That's OK for the current
6445                  * use of these variables.
6446                  */
6447                 SpinLockAcquire(&xlogctl->info_lck);
6448                 xlogctl->replayEndRecPtr = ReadRecPtr;
6449                 xlogctl->recoveryLastRecPtr = EndRecPtr;
6450                 xlogctl->recoveryLastXTime = 0;
6451                 xlogctl->currentChunkStartTime = 0;
6452                 xlogctl->recoveryPause = false;
6453                 SpinLockRelease(&xlogctl->info_lck);
6454
6455                 /* Also ensure XLogReceiptTime has a sane value */
6456                 XLogReceiptTime = GetCurrentTimestamp();
6457
6458                 /*
6459                  * Let postmaster know we've started redo now, so that it can launch
6460                  * checkpointer to perform restartpoints.  We don't bother during crash
6461                  * recovery as restartpoints can only be performed during archive
6462                  * recovery.  And we'd like to keep crash recovery simple, to avoid
6463                  * introducing bugs that could affect you when recovering after crash.
6464                  *
6465                  * After this point, we can no longer assume that we're the only
6466                  * process in addition to postmaster!  Also, fsync requests are
6467                  * subsequently to be handled by the checkpointer, not locally.
6468                  */
6469                 if (InArchiveRecovery && IsUnderPostmaster)
6470                 {
6471                         PublishStartupProcessInformation();
6472                         SetForwardFsyncRequests();
6473                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6474                         bgwriterLaunched = true;
6475                 }
6476
6477                 /*
6478                  * Allow read-only connections immediately if we're consistent
6479                  * already.
6480                  */
6481                 CheckRecoveryConsistency();
6482
6483                 /*
6484                  * Find the first record that logically follows the checkpoint --- it
6485                  * might physically precede it, though.
6486                  */
6487                 if (XLByteLT(checkPoint.redo, RecPtr))
6488                 {
6489                         /* back up to find the record */
6490                         record = ReadRecord(&(checkPoint.redo), PANIC, false);
6491                 }
6492                 else
6493                 {
6494                         /* just have to read next record after CheckPoint */
6495                         record = ReadRecord(NULL, LOG, false);
6496                 }
6497
6498                 if (record != NULL)
6499                 {
6500                         bool            recoveryContinue = true;
6501                         bool            recoveryApply = true;
6502                         bool            recoveryPause = false;
6503                         ErrorContextCallback errcontext;
6504                         TimestampTz xtime;
6505
6506                         InRedo = true;
6507
6508                         ereport(LOG,
6509                                         (errmsg("redo starts at %X/%X",
6510                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
6511
6512                         /*
6513                          * main redo apply loop
6514                          */
6515                         do
6516                         {
6517 #ifdef WAL_DEBUG
6518                                 if (XLOG_DEBUG ||
6519                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6520                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6521                                 {
6522                                         StringInfoData buf;
6523
6524                                         initStringInfo(&buf);
6525                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6526                                                                          ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
6527                                                                          EndRecPtr.xlogid, EndRecPtr.xrecoff);
6528                                         xlog_outrec(&buf, record);
6529                                         appendStringInfo(&buf, " - ");
6530                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
6531                                                                                                            record->xl_info,
6532                                                                                                          XLogRecGetData(record));
6533                                         elog(LOG, "%s", buf.data);
6534                                         pfree(buf.data);
6535                                 }
6536 #endif
6537
6538                                 /* Handle interrupt signals of startup process */
6539                                 HandleStartupProcInterrupts();
6540
6541                                 /* Allow read-only connections if we're consistent now */
6542                                 CheckRecoveryConsistency();
6543
6544                                 /*
6545                                  * Have we reached our recovery target?
6546                                  */
6547                                 if (recoveryStopsHere(record, &recoveryApply))
6548                                 {
6549                                         /*
6550                                          * Pause only if users can connect to send a resume
6551                                          * message
6552                                          */
6553                                         if (recoveryPauseAtTarget && standbyState == STANDBY_SNAPSHOT_READY)
6554                                         {
6555                                                 SetRecoveryPause(true);
6556                                                 recoveryPausesHere();
6557                                         }
6558                                         reachedStopPoint = true;        /* see below */
6559                                         recoveryContinue = false;
6560                                         if (!recoveryApply)
6561                                                 break;
6562                                 }
6563
6564                                 /* Setup error traceback support for ereport() */
6565                                 errcontext.callback = rm_redo_error_callback;
6566                                 errcontext.arg = (void *) record;
6567                                 errcontext.previous = error_context_stack;
6568                                 error_context_stack = &errcontext;
6569
6570                                 /*
6571                                  * ShmemVariableCache->nextXid must be beyond record's xid.
6572                                  *
6573                                  * We don't expect anyone else to modify nextXid, hence we
6574                                  * don't need to hold a lock while examining it.  We still
6575                                  * acquire the lock to modify it, though.
6576                                  */
6577                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
6578                                                                                                  ShmemVariableCache->nextXid))
6579                                 {
6580                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
6581                                         ShmemVariableCache->nextXid = record->xl_xid;
6582                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
6583                                         LWLockRelease(XidGenLock);
6584                                 }
6585
6586                                 /*
6587                                  * Update shared replayEndRecPtr before replaying this record,
6588                                  * so that XLogFlush will update minRecoveryPoint correctly.
6589                                  */
6590                                 SpinLockAcquire(&xlogctl->info_lck);
6591                                 xlogctl->replayEndRecPtr = EndRecPtr;
6592                                 recoveryPause = xlogctl->recoveryPause;
6593                                 SpinLockRelease(&xlogctl->info_lck);
6594
6595                                 /*
6596                                  * Pause only if users can connect to send a resume message
6597                                  */
6598                                 if (recoveryPause && standbyState == STANDBY_SNAPSHOT_READY)
6599                                         recoveryPausesHere();
6600
6601                                 /*
6602                                  * If we are attempting to enter Hot Standby mode, process
6603                                  * XIDs we see
6604                                  */
6605                                 if (standbyState >= STANDBY_INITIALIZED &&
6606                                         TransactionIdIsValid(record->xl_xid))
6607                                         RecordKnownAssignedTransactionIds(record->xl_xid);
6608
6609                                 /* Now apply the WAL record itself */
6610                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
6611
6612                                 /* Pop the error context stack */
6613                                 error_context_stack = errcontext.previous;
6614
6615                                 if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
6616                                         XLByteLE(ControlFile->backupEndPoint, EndRecPtr))
6617                                 {
6618                                         /*
6619                                          * We have reached the end of base backup, the point that
6620                                          * the minimum recovery point in pg_control indicates.
6621                                          * The data on disk is now consistent. Reset backupStartPoint
6622                                          * and backupEndPoint.
6623                                          */
6624                                         elog(DEBUG1, "end of backup reached");
6625
6626                                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6627
6628                                         MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
6629                                         MemSet(&ControlFile->backupEndPoint, 0, sizeof(XLogRecPtr));
6630                                         ControlFile->backupEndRequired = false;
6631                                         UpdateControlFile();
6632
6633                                         LWLockRelease(ControlFileLock);
6634                                 }
6635
6636                                 /*
6637                                  * Update shared recoveryLastRecPtr after this record has been
6638                                  * replayed.
6639                                  */
6640                                 SpinLockAcquire(&xlogctl->info_lck);
6641                                 xlogctl->recoveryLastRecPtr = EndRecPtr;
6642                                 SpinLockRelease(&xlogctl->info_lck);
6643
6644                                 LastRec = ReadRecPtr;
6645
6646                                 record = ReadRecord(NULL, LOG, false);
6647                         } while (record != NULL && recoveryContinue);
6648
6649                         /*
6650                          * end of main redo apply loop
6651                          */
6652
6653                         ereport(LOG,
6654                                         (errmsg("redo done at %X/%X",
6655                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
6656                         xtime = GetLatestXTime();
6657                         if (xtime)
6658                                 ereport(LOG,
6659                                          (errmsg("last completed transaction was at log time %s",
6660                                                          timestamptz_to_str(xtime))));
6661                         InRedo = false;
6662                 }
6663                 else
6664                 {
6665                         /* there are no WAL records following the checkpoint */
6666                         ereport(LOG,
6667                                         (errmsg("redo is not required")));
6668                 }
6669         }
6670
6671         /*
6672          * Kill WAL receiver, if it's still running, before we continue to write
6673          * the startup checkpoint record. It will trump over the checkpoint and
6674          * subsequent records if it's still alive when we start writing WAL.
6675          */
6676         ShutdownWalRcv();
6677
6678         /*
6679          * We don't need the latch anymore. It's not strictly necessary to disown
6680          * it, but let's do it for the sake of tidiness.
6681          */
6682         if (StandbyMode)
6683                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
6684
6685         /*
6686          * We are now done reading the xlog from stream. Turn off streaming
6687          * recovery to force fetching the files (which would be required at end of
6688          * recovery, e.g., timeline history file) from archive or pg_xlog.
6689          */
6690         StandbyMode = false;
6691
6692         /*
6693          * Re-fetch the last valid or last applied record, so we can identify the
6694          * exact endpoint of what we consider the valid portion of WAL.
6695          */
6696         record = ReadRecord(&LastRec, PANIC, false);
6697         EndOfLog = EndRecPtr;
6698         XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);
6699
6700         /*
6701          * Complain if we did not roll forward far enough to render the backup
6702          * dump consistent.  Note: it is indeed okay to look at the local variable
6703          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
6704          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
6705          * advanced beyond the WAL we processed.
6706          */
6707         if (InRecovery &&
6708                 (XLByteLT(EndOfLog, minRecoveryPoint) ||
6709                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
6710         {
6711                 if (reachedStopPoint)
6712                 {
6713                         /* stopped because of stop request */
6714                         ereport(FATAL,
6715                                         (errmsg("requested recovery stop point is before consistent recovery point")));
6716                 }
6717
6718                 /*
6719                  * Ran off end of WAL before reaching end-of-backup WAL record, or
6720                  * minRecoveryPoint. That's usually a bad sign, indicating that you
6721                  * tried to recover from an online backup but never called
6722                  * pg_stop_backup(), or you didn't archive all the WAL up to that
6723                  * point. However, this also happens in crash recovery, if the system
6724                  * crashes while an online backup is in progress. We must not treat
6725                  * that as an error, or the database will refuse to start up.
6726                  */
6727                 if (InArchiveRecovery || ControlFile->backupEndRequired)
6728                 {
6729                         if (ControlFile->backupEndRequired)
6730                                 ereport(FATAL,
6731                                                 (errmsg("WAL ends before end of online backup"),
6732                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
6733                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
6734                                 ereport(FATAL,
6735                                                 (errmsg("WAL ends before end of online backup"),
6736                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
6737                         else
6738                                 ereport(FATAL,
6739                                           (errmsg("WAL ends before consistent recovery point")));
6740                 }
6741         }
6742
6743         /*
6744          * Consider whether we need to assign a new timeline ID.
6745          *
6746          * If we are doing an archive recovery, we always assign a new ID.      This
6747          * handles a couple of issues.  If we stopped short of the end of WAL
6748          * during recovery, then we are clearly generating a new timeline and must
6749          * assign it a unique new ID.  Even if we ran to the end, modifying the
6750          * current last segment is problematic because it may result in trying to
6751          * overwrite an already-archived copy of that segment, and we encourage
6752          * DBAs to make their archive_commands reject that.  We can dodge the
6753          * problem by making the new active segment have a new timeline ID.
6754          *
6755          * In a normal crash recovery, we can just extend the timeline we were in.
6756          */
6757         if (InArchiveRecovery)
6758         {
6759                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
6760                 ereport(LOG,
6761                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
6762                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
6763                                                          curFileTLI, endLogId, endLogSeg);
6764         }
6765
6766         /* Save the selected TimeLineID in shared memory, too */
6767         XLogCtl->ThisTimeLineID = ThisTimeLineID;
6768
6769         /*
6770          * We are now done reading the old WAL.  Turn off archive fetching if it
6771          * was active, and make a writable copy of the last WAL segment. (Note
6772          * that we also have a copy of the last block of the old WAL in readBuf;
6773          * we will use that below.)
6774          */
6775         if (InArchiveRecovery)
6776                 exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
6777
6778         /*
6779          * Prepare to write WAL starting at EndOfLog position, and init xlog
6780          * buffer cache using the block containing the last record from the
6781          * previous incarnation.
6782          */
6783         openLogId = endLogId;
6784         openLogSeg = endLogSeg;
6785         openLogFile = XLogFileOpen(openLogId, openLogSeg);
6786         openLogOff = 0;
6787         Insert = &XLogCtl->Insert;
6788         Insert->PrevRecord = LastRec;
6789         XLogCtl->xlblocks[0].xlogid = openLogId;
6790         XLogCtl->xlblocks[0].xrecoff =
6791                 ((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
6792
6793         /*
6794          * Tricky point here: readBuf contains the *last* block that the LastRec
6795          * record spans, not the one it starts in.      The last block is indeed the
6796          * one we want to use.
6797          */
6798         Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
6799         memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
6800         Insert->currpos = (char *) Insert->currpage +
6801                 (EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
6802
6803         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
6804
6805         XLogCtl->LogwrtResult = LogwrtResult;
6806
6807         XLogCtl->LogwrtRqst.Write = EndOfLog;
6808         XLogCtl->LogwrtRqst.Flush = EndOfLog;
6809
6810         freespace = INSERT_FREESPACE(Insert);
6811         if (freespace > 0)
6812         {
6813                 /* Make sure rest of page is zero */
6814                 MemSet(Insert->currpos, 0, freespace);
6815                 XLogCtl->Write.curridx = 0;
6816         }
6817         else
6818         {
6819                 /*
6820                  * Whenever LogwrtResult points to exactly the end of a page,
6821                  * Write.curridx must point to the *next* page (see XLogWrite()).
6822                  *
6823                  * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
6824                  * this is sufficient.  The first actual attempt to insert a log
6825                  * record will advance the insert state.
6826                  */
6827                 XLogCtl->Write.curridx = NextBufIdx(0);
6828         }
6829
6830         /* Pre-scan prepared transactions to find out the range of XIDs present */
6831         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
6832
6833         /*
6834          * Update full_page_writes in shared memory and write an
6835          * XLOG_FPW_CHANGE record before resource manager writes cleanup
6836          * WAL records or checkpoint record is written.
6837          */
6838         Insert->fullPageWrites = lastFullPageWrites;
6839         LocalSetXLogInsertAllowed();
6840         UpdateFullPageWrites();
6841         LocalXLogInsertAllowed = -1;
6842
6843         if (InRecovery)
6844         {
6845                 int                     rmid;
6846
6847                 /*
6848                  * Resource managers might need to write WAL records, eg, to record
6849                  * index cleanup actions.  So temporarily enable XLogInsertAllowed in
6850                  * this process only.
6851                  */
6852                 LocalSetXLogInsertAllowed();
6853
6854                 /*
6855                  * Allow resource managers to do any required cleanup.
6856                  */
6857                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6858                 {
6859                         if (RmgrTable[rmid].rm_cleanup != NULL)
6860                                 RmgrTable[rmid].rm_cleanup();
6861                 }
6862
6863                 /* Disallow XLogInsert again */
6864                 LocalXLogInsertAllowed = -1;
6865
6866                 /*
6867                  * Perform a checkpoint to update all our recovery activity to disk.
6868                  *
6869                  * Note that we write a shutdown checkpoint rather than an on-line
6870                  * one. This is not particularly critical, but since we may be
6871                  * assigning a new TLI, using a shutdown checkpoint allows us to have
6872                  * the rule that TLI only changes in shutdown checkpoints, which
6873                  * allows some extra error checking in xlog_redo.
6874                  */
6875                 if (bgwriterLaunched)
6876                         RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
6877                                                           CHECKPOINT_IMMEDIATE |
6878                                                           CHECKPOINT_WAIT);
6879                 else
6880                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
6881
6882                 /*
6883                  * And finally, execute the recovery_end_command, if any.
6884                  */
6885                 if (recoveryEndCommand)
6886                         ExecuteRecoveryCommand(recoveryEndCommand,
6887                                                                    "recovery_end_command",
6888                                                                    true);
6889         }
6890
6891         /*
6892          * Preallocate additional log files, if wanted.
6893          */
6894         PreallocXlogFiles(EndOfLog);
6895
6896         /*
6897          * Reset initial contents of unlogged relations.  This has to be done
6898          * AFTER recovery is complete so that any unlogged relations created
6899          * during recovery also get picked up.
6900          */
6901         if (InRecovery)
6902                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
6903
6904         /*
6905          * Okay, we're officially UP.
6906          */
6907         InRecovery = false;
6908
6909         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6910         ControlFile->state = DB_IN_PRODUCTION;
6911         ControlFile->time = (pg_time_t) time(NULL);
6912         UpdateControlFile();
6913         LWLockRelease(ControlFileLock);
6914
6915         /* start the archive_timeout timer running */
6916         XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
6917
6918         /* initialize shared-memory copy of latest checkpoint XID/epoch */
6919         XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
6920         XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
6921
6922         /* also initialize latestCompletedXid, to nextXid - 1 */
6923         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
6924         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
6925         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
6926         LWLockRelease(ProcArrayLock);
6927
6928         /*
6929          * Start up the commit log and subtrans, if not already done for
6930          * hot standby.
6931          */
6932         if (standbyState == STANDBY_DISABLED)
6933         {
6934                 StartupCLOG();
6935                 StartupSUBTRANS(oldestActiveXID);
6936         }
6937
6938         /*
6939          * Perform end of recovery actions for any SLRUs that need it.
6940          */
6941         StartupMultiXact();
6942         TrimCLOG();
6943
6944         /* Reload shared-memory state for prepared transactions */
6945         RecoverPreparedTransactions();
6946
6947         /*
6948          * Shutdown the recovery environment. This must occur after
6949          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
6950          */
6951         if (standbyState != STANDBY_DISABLED)
6952                 ShutdownRecoveryTransactionEnvironment();
6953
6954         /* Shut down readFile facility, free space */
6955         if (readFile >= 0)
6956         {
6957                 close(readFile);
6958                 readFile = -1;
6959         }
6960         if (readBuf)
6961         {
6962                 free(readBuf);
6963                 readBuf = NULL;
6964         }
6965         if (readRecordBuf)
6966         {
6967                 free(readRecordBuf);
6968                 readRecordBuf = NULL;
6969                 readRecordBufSize = 0;
6970         }
6971
6972         /*
6973          * If any of the critical GUCs have changed, log them before we allow
6974          * backends to write WAL.
6975          */
6976         LocalSetXLogInsertAllowed();
6977         XLogReportParameters();
6978
6979         /*
6980          * All done.  Allow backends to write WAL.      (Although the bool flag is
6981          * probably atomic in itself, we use the info_lck here to ensure that
6982          * there are no race conditions concerning visibility of other recent
6983          * updates to shared memory.)
6984          */
6985         {
6986                 /* use volatile pointer to prevent code rearrangement */
6987                 volatile XLogCtlData *xlogctl = XLogCtl;
6988
6989                 SpinLockAcquire(&xlogctl->info_lck);
6990                 xlogctl->SharedRecoveryInProgress = false;
6991                 SpinLockRelease(&xlogctl->info_lck);
6992         }
6993 }
6994
6995 /*
6996  * Checks if recovery has reached a consistent state. When consistency is
6997  * reached and we have a valid starting standby snapshot, tell postmaster
6998  * that it can start accepting read-only connections.
6999  */
7000 static void
7001 CheckRecoveryConsistency(void)
7002 {
7003         /*
7004          * During crash recovery, we don't reach a consistent state until we've
7005          * replayed all the WAL.
7006          */
7007         if (XLogRecPtrIsInvalid(minRecoveryPoint))
7008                 return;
7009
7010         /*
7011          * Have we passed our safe starting point?
7012          */
7013         if (!reachedConsistency &&
7014                 XLByteLE(minRecoveryPoint, EndRecPtr) &&
7015                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7016         {
7017                 /*
7018                  * Check to see if the XLOG sequence contained any unresolved
7019                  * references to uninitialized pages.
7020                  */
7021                 XLogCheckInvalidPages();
7022
7023                 reachedConsistency = true;
7024                 ereport(LOG,
7025                                 (errmsg("consistent recovery state reached at %X/%X",
7026                                                 EndRecPtr.xlogid, EndRecPtr.xrecoff)));
7027         }
7028
7029         /*
7030          * Have we got a valid starting snapshot that will allow queries to be
7031          * run? If so, we can tell postmaster that the database is consistent now,
7032          * enabling connections.
7033          */
7034         if (standbyState == STANDBY_SNAPSHOT_READY &&
7035                 !LocalHotStandbyActive &&
7036                 reachedConsistency &&
7037                 IsUnderPostmaster)
7038         {
7039                 /* use volatile pointer to prevent code rearrangement */
7040                 volatile XLogCtlData *xlogctl = XLogCtl;
7041
7042                 SpinLockAcquire(&xlogctl->info_lck);
7043                 xlogctl->SharedHotStandbyActive = true;
7044                 SpinLockRelease(&xlogctl->info_lck);
7045
7046                 LocalHotStandbyActive = true;
7047
7048                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7049         }
7050 }
7051
7052 /*
7053  * Is the system still in recovery?
7054  *
7055  * Unlike testing InRecovery, this works in any process that's connected to
7056  * shared memory.
7057  *
7058  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7059  * variables the first time we see that recovery is finished.
7060  */
7061 bool
7062 RecoveryInProgress(void)
7063 {
7064         /*
7065          * We check shared state each time only until we leave recovery mode. We
7066          * can't re-enter recovery, so there's no need to keep checking after the
7067          * shared variable has once been seen false.
7068          */
7069         if (!LocalRecoveryInProgress)
7070                 return false;
7071         else
7072         {
7073                 /* use volatile pointer to prevent code rearrangement */
7074                 volatile XLogCtlData *xlogctl = XLogCtl;
7075
7076                 /* spinlock is essential on machines with weak memory ordering! */
7077                 SpinLockAcquire(&xlogctl->info_lck);
7078                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7079                 SpinLockRelease(&xlogctl->info_lck);
7080
7081                 /*
7082                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
7083                  * is finished. InitPostgres() relies upon this behaviour to ensure
7084                  * that InitXLOGAccess() is called at backend startup.  (If you change
7085                  * this, see also LocalSetXLogInsertAllowed.)
7086                  */
7087                 if (!LocalRecoveryInProgress)
7088                         InitXLOGAccess();
7089
7090                 return LocalRecoveryInProgress;
7091         }
7092 }
7093
7094 /*
7095  * Is HotStandby active yet? This is only important in special backends
7096  * since normal backends won't ever be able to connect until this returns
7097  * true. Postmaster knows this by way of signal, not via shared memory.
7098  *
7099  * Unlike testing standbyState, this works in any process that's connected to
7100  * shared memory.
7101  */
7102 bool
7103 HotStandbyActive(void)
7104 {
7105         /*
7106          * We check shared state each time only until Hot Standby is active. We
7107          * can't de-activate Hot Standby, so there's no need to keep checking
7108          * after the shared variable has once been seen true.
7109          */
7110         if (LocalHotStandbyActive)
7111                 return true;
7112         else
7113         {
7114                 /* use volatile pointer to prevent code rearrangement */
7115                 volatile XLogCtlData *xlogctl = XLogCtl;
7116
7117                 /* spinlock is essential on machines with weak memory ordering! */
7118                 SpinLockAcquire(&xlogctl->info_lck);
7119                 LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
7120                 SpinLockRelease(&xlogctl->info_lck);
7121
7122                 return LocalHotStandbyActive;
7123         }
7124 }
7125
7126 /*
7127  * Is this process allowed to insert new WAL records?
7128  *
7129  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7130  * But we also have provisions for forcing the result "true" or "false"
7131  * within specific processes regardless of the global state.
7132  */
7133 bool
7134 XLogInsertAllowed(void)
7135 {
7136         /*
7137          * If value is "unconditionally true" or "unconditionally false", just
7138          * return it.  This provides the normal fast path once recovery is known
7139          * done.
7140          */
7141         if (LocalXLogInsertAllowed >= 0)
7142                 return (bool) LocalXLogInsertAllowed;
7143
7144         /*
7145          * Else, must check to see if we're still in recovery.
7146          */
7147         if (RecoveryInProgress())
7148                 return false;
7149
7150         /*
7151          * On exit from recovery, reset to "unconditionally true", since there is
7152          * no need to keep checking.
7153          */
7154         LocalXLogInsertAllowed = 1;
7155         return true;
7156 }
7157
7158 /*
7159  * Make XLogInsertAllowed() return true in the current process only.
7160  *
7161  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
7162  * and even call LocalSetXLogInsertAllowed() again after that.
7163  */
7164 static void
7165 LocalSetXLogInsertAllowed(void)
7166 {
7167         Assert(LocalXLogInsertAllowed == -1);
7168         LocalXLogInsertAllowed = 1;
7169
7170         /* Initialize as RecoveryInProgress() would do when switching state */
7171         InitXLOGAccess();
7172 }
7173
7174 /*
7175  * Subroutine to try to fetch and validate a prior checkpoint record.
7176  *
7177  * whichChkpt identifies the checkpoint (merely for reporting purposes).
7178  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7179  */
7180 static XLogRecord *
7181 ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
7182 {
7183         XLogRecord *record;
7184
7185         if (!XRecOffIsValid(RecPtr.xrecoff))
7186         {
7187                 switch (whichChkpt)
7188                 {
7189                         case 1:
7190                                 ereport(LOG,
7191                                 (errmsg("invalid primary checkpoint link in control file")));
7192                                 break;
7193                         case 2:
7194                                 ereport(LOG,
7195                                                 (errmsg("invalid secondary checkpoint link in control file")));
7196                                 break;
7197                         default:
7198                                 ereport(LOG,
7199                                    (errmsg("invalid checkpoint link in backup_label file")));
7200                                 break;
7201                 }
7202                 return NULL;
7203         }
7204
7205         record = ReadRecord(&RecPtr, LOG, true);
7206
7207         if (record == NULL)
7208         {
7209                 switch (whichChkpt)
7210                 {
7211                         case 1:
7212                                 ereport(LOG,
7213                                                 (errmsg("invalid primary checkpoint record")));
7214                                 break;
7215                         case 2:
7216                                 ereport(LOG,
7217                                                 (errmsg("invalid secondary checkpoint record")));
7218                                 break;
7219                         default:
7220                                 ereport(LOG,
7221                                                 (errmsg("invalid checkpoint record")));
7222                                 break;
7223                 }
7224                 return NULL;
7225         }
7226         if (record->xl_rmid != RM_XLOG_ID)
7227         {
7228                 switch (whichChkpt)
7229                 {
7230                         case 1:
7231                                 ereport(LOG,
7232                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
7233                                 break;
7234                         case 2:
7235                                 ereport(LOG,
7236                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
7237                                 break;
7238                         default:
7239                                 ereport(LOG,
7240                                 (errmsg("invalid resource manager ID in checkpoint record")));
7241                                 break;
7242                 }
7243                 return NULL;
7244         }
7245         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
7246                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
7247         {
7248                 switch (whichChkpt)
7249                 {
7250                         case 1:
7251                                 ereport(LOG,
7252                                    (errmsg("invalid xl_info in primary checkpoint record")));
7253                                 break;
7254                         case 2:
7255                                 ereport(LOG,
7256                                  (errmsg("invalid xl_info in secondary checkpoint record")));
7257                                 break;
7258                         default:
7259                                 ereport(LOG,
7260                                                 (errmsg("invalid xl_info in checkpoint record")));
7261                                 break;
7262                 }
7263                 return NULL;
7264         }
7265         if (record->xl_len != sizeof(CheckPoint) ||
7266                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
7267         {
7268                 switch (whichChkpt)
7269                 {
7270                         case 1:
7271                                 ereport(LOG,
7272                                         (errmsg("invalid length of primary checkpoint record")));
7273                                 break;
7274                         case 2:
7275                                 ereport(LOG,
7276                                   (errmsg("invalid length of secondary checkpoint record")));
7277                                 break;
7278                         default:
7279                                 ereport(LOG,
7280                                                 (errmsg("invalid length of checkpoint record")));
7281                                 break;
7282                 }
7283                 return NULL;
7284         }
7285         return record;
7286 }
7287
7288 /*
7289  * This must be called during startup of a backend process, except that
7290  * it need not be called in a standalone backend (which does StartupXLOG
7291  * instead).  We need to initialize the local copies of ThisTimeLineID and
7292  * RedoRecPtr.
7293  *
7294  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
7295  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
7296  * unnecessary however, since the postmaster itself never touches XLOG anyway.
7297  */
7298 void
7299 InitXLOGAccess(void)
7300 {
7301         /* ThisTimeLineID doesn't change so we need no lock to copy it */
7302         ThisTimeLineID = XLogCtl->ThisTimeLineID;
7303         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
7304
7305         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
7306         (void) GetRedoRecPtr();
7307 }
7308
7309 /*
7310  * Once spawned, a backend may update its local RedoRecPtr from
7311  * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
7312  * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
7313  */
7314 XLogRecPtr
7315 GetRedoRecPtr(void)
7316 {
7317         /* use volatile pointer to prevent code rearrangement */
7318         volatile XLogCtlData *xlogctl = XLogCtl;
7319
7320         SpinLockAcquire(&xlogctl->info_lck);
7321         Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
7322         RedoRecPtr = xlogctl->Insert.RedoRecPtr;
7323         SpinLockRelease(&xlogctl->info_lck);
7324
7325         return RedoRecPtr;
7326 }
7327
7328 /*
7329  * GetInsertRecPtr -- Returns the current insert position.
7330  *
7331  * NOTE: The value *actually* returned is the position of the last full
7332  * xlog page. It lags behind the real insert position by at most 1 page.
7333  * For that, we don't need to acquire WALInsertLock which can be quite
7334  * heavily contended, and an approximation is enough for the current
7335  * usage of this function.
7336  */
7337 XLogRecPtr
7338 GetInsertRecPtr(void)
7339 {
7340         /* use volatile pointer to prevent code rearrangement */
7341         volatile XLogCtlData *xlogctl = XLogCtl;
7342         XLogRecPtr      recptr;
7343
7344         SpinLockAcquire(&xlogctl->info_lck);
7345         recptr = xlogctl->LogwrtRqst.Write;
7346         SpinLockRelease(&xlogctl->info_lck);
7347
7348         return recptr;
7349 }
7350
7351 /*
7352  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
7353  * position known to be fsync'd to disk.
7354  */
7355 XLogRecPtr
7356 GetFlushRecPtr(void)
7357 {
7358         /* use volatile pointer to prevent code rearrangement */
7359         volatile XLogCtlData *xlogctl = XLogCtl;
7360         XLogRecPtr      recptr;
7361
7362         SpinLockAcquire(&xlogctl->info_lck);
7363         recptr = xlogctl->LogwrtResult.Flush;
7364         SpinLockRelease(&xlogctl->info_lck);
7365
7366         return recptr;
7367 }
7368
7369 /*
7370  * Get the time of the last xlog segment switch
7371  */
7372 pg_time_t
7373 GetLastSegSwitchTime(void)
7374 {
7375         pg_time_t       result;
7376
7377         /* Need WALWriteLock, but shared lock is sufficient */
7378         LWLockAcquire(WALWriteLock, LW_SHARED);
7379         result = XLogCtl->Write.lastSegSwitchTime;
7380         LWLockRelease(WALWriteLock);
7381
7382         return result;
7383 }
7384
7385 /*
7386  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
7387  *
7388  * This is exported for use by code that would like to have 64-bit XIDs.
7389  * We don't really support such things, but all XIDs within the system
7390  * can be presumed "close to" the result, and thus the epoch associated
7391  * with them can be determined.
7392  */
7393 void
7394 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
7395 {
7396         uint32          ckptXidEpoch;
7397         TransactionId ckptXid;
7398         TransactionId nextXid;
7399
7400         /* Must read checkpoint info first, else have race condition */
7401         {
7402                 /* use volatile pointer to prevent code rearrangement */
7403                 volatile XLogCtlData *xlogctl = XLogCtl;
7404
7405                 SpinLockAcquire(&xlogctl->info_lck);
7406                 ckptXidEpoch = xlogctl->ckptXidEpoch;
7407                 ckptXid = xlogctl->ckptXid;
7408                 SpinLockRelease(&xlogctl->info_lck);
7409         }
7410
7411         /* Now fetch current nextXid */
7412         nextXid = ReadNewTransactionId();
7413
7414         /*
7415          * nextXid is certainly logically later than ckptXid.  So if it's
7416          * numerically less, it must have wrapped into the next epoch.
7417          */
7418         if (nextXid < ckptXid)
7419                 ckptXidEpoch++;
7420
7421         *xid = nextXid;
7422         *epoch = ckptXidEpoch;
7423 }
7424
7425 /*
7426  * GetRecoveryTargetTLI - get the recovery target timeline ID
7427  */
7428 TimeLineID
7429 GetRecoveryTargetTLI(void)
7430 {
7431         /* RecoveryTargetTLI doesn't change so we need no lock to copy it */
7432         return XLogCtl->RecoveryTargetTLI;
7433 }
7434
7435 /*
7436  * This must be called ONCE during postmaster or standalone-backend shutdown
7437  */
7438 void
7439 ShutdownXLOG(int code, Datum arg)
7440 {
7441         ereport(LOG,
7442                         (errmsg("shutting down")));
7443
7444         if (RecoveryInProgress())
7445                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7446         else
7447         {
7448                 /*
7449                  * If archiving is enabled, rotate the last XLOG file so that all the
7450                  * remaining records are archived (postmaster wakes up the archiver
7451                  * process one more time at the end of shutdown). The checkpoint
7452                  * record will go to the next XLOG file and won't be archived (yet).
7453                  */
7454                 if (XLogArchivingActive() && XLogArchiveCommandSet())
7455                         RequestXLogSwitch();
7456
7457                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7458         }
7459         ShutdownCLOG();
7460         ShutdownSUBTRANS();
7461         ShutdownMultiXact();
7462
7463         ereport(LOG,
7464                         (errmsg("database system is shut down")));
7465 }
7466
7467 /*
7468  * Log start of a checkpoint.
7469  */
7470 static void
7471 LogCheckpointStart(int flags, bool restartpoint)
7472 {
7473         const char *msg;
7474
7475         /*
7476          * XXX: This is hopelessly untranslatable. We could call gettext_noop for
7477          * the main message, but what about all the flags?
7478          */
7479         if (restartpoint)
7480                 msg = "restartpoint starting:%s%s%s%s%s%s%s";
7481         else
7482                 msg = "checkpoint starting:%s%s%s%s%s%s%s";
7483
7484         elog(LOG, msg,
7485                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
7486                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
7487                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
7488                  (flags & CHECKPOINT_FORCE) ? " force" : "",
7489                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
7490                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
7491                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
7492 }
7493
7494 /*
7495  * Log end of a checkpoint.
7496  */
7497 static void
7498 LogCheckpointEnd(bool restartpoint)
7499 {
7500         long            write_secs,
7501                                 sync_secs,
7502                                 total_secs,
7503                                 longest_secs,
7504                                 average_secs;
7505         int                     write_usecs,
7506                                 sync_usecs,
7507                                 total_usecs,
7508                                 longest_usecs,
7509                                 average_usecs;
7510         uint64          average_sync_time;
7511
7512         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
7513
7514         TimestampDifference(CheckpointStats.ckpt_start_t,
7515                                                 CheckpointStats.ckpt_end_t,
7516                                                 &total_secs, &total_usecs);
7517
7518         TimestampDifference(CheckpointStats.ckpt_write_t,
7519                                                 CheckpointStats.ckpt_sync_t,
7520                                                 &write_secs, &write_usecs);
7521
7522         TimestampDifference(CheckpointStats.ckpt_sync_t,
7523                                                 CheckpointStats.ckpt_sync_end_t,
7524                                                 &sync_secs, &sync_usecs);
7525
7526         /*
7527          * Timing values returned from CheckpointStats are in microseconds.
7528          * Convert to the second plus microsecond form that TimestampDifference
7529          * returns for homogeneous printing.
7530          */
7531         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
7532         longest_usecs = CheckpointStats.ckpt_longest_sync -
7533                 (uint64) longest_secs *1000000;
7534
7535         average_sync_time = 0;
7536         if (CheckpointStats.ckpt_sync_rels > 0)
7537                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
7538                         CheckpointStats.ckpt_sync_rels;
7539         average_secs = (long) (average_sync_time / 1000000);
7540         average_usecs = average_sync_time - (uint64) average_secs *1000000;
7541
7542         if (restartpoint)
7543                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
7544                          "%d transaction log file(s) added, %d removed, %d recycled; "
7545                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
7546                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
7547                          CheckpointStats.ckpt_bufs_written,
7548                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7549                          CheckpointStats.ckpt_segs_added,
7550                          CheckpointStats.ckpt_segs_removed,
7551                          CheckpointStats.ckpt_segs_recycled,
7552                          write_secs, write_usecs / 1000,
7553                          sync_secs, sync_usecs / 1000,
7554                          total_secs, total_usecs / 1000,
7555                          CheckpointStats.ckpt_sync_rels,
7556                          longest_secs, longest_usecs / 1000,
7557                          average_secs, average_usecs / 1000);
7558         else
7559                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
7560                          "%d transaction log file(s) added, %d removed, %d recycled; "
7561                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
7562                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
7563                          CheckpointStats.ckpt_bufs_written,
7564                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7565                          CheckpointStats.ckpt_segs_added,
7566                          CheckpointStats.ckpt_segs_removed,
7567                          CheckpointStats.ckpt_segs_recycled,
7568                          write_secs, write_usecs / 1000,
7569                          sync_secs, sync_usecs / 1000,
7570                          total_secs, total_usecs / 1000,
7571                          CheckpointStats.ckpt_sync_rels,
7572                          longest_secs, longest_usecs / 1000,
7573                          average_secs, average_usecs / 1000);
7574 }
7575
7576 /*
7577  * Perform a checkpoint --- either during shutdown, or on-the-fly
7578  *
7579  * flags is a bitwise OR of the following:
7580  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
7581  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
7582  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
7583  *              ignoring checkpoint_completion_target parameter.
7584  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
7585  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
7586  *              CHECKPOINT_END_OF_RECOVERY).
7587  *
7588  * Note: flags contains other bits, of interest here only for logging purposes.
7589  * In particular note that this routine is synchronous and does not pay
7590  * attention to CHECKPOINT_WAIT.
      *
      * If none of CHECKPOINT_IS_SHUTDOWN, CHECKPOINT_END_OF_RECOVERY and
      * CHECKPOINT_FORCE is set, and the current WAL insert position is still in
      * the same segment as the previous checkpoint's redo pointer, the function
      * returns without writing a checkpoint record or logging anything.
      *
      * Reports ERROR if recovery is in progress and CHECKPOINT_END_OF_RECOVERY
      * is not set.
7591  */
7592 void
7593 CreateCheckPoint(int flags)
7594 {
7595         bool            shutdown;
7596         CheckPoint      checkPoint;
7597         XLogRecPtr      recptr;
7598         XLogCtlInsert *Insert = &XLogCtl->Insert;
7599         XLogRecData rdata;
7600         uint32          freespace;
7601         uint32          _logId;
7602         uint32          _logSeg;
7603         uint32          redo_logId;
7604         uint32          redo_logSeg;
7605         uint32          insert_logId;
7606         uint32          insert_logSeg;
7607         TransactionId *inCommitXids;
7608         int                     nInCommit;
7609
7610         /*
7611          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
7612          * issued at a different time.
7613          */
7614         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
7615                 shutdown = true;
7616         else
7617                 shutdown = false;
7618
7619         /* sanity check */
7620         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
7621                 elog(ERROR, "can't create a checkpoint during recovery");
7622
7623         /*
7624          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
7625          * (This is just pro forma, since in the present system structure there is
7626          * only one process that is allowed to issue checkpoints at any given
7627          * time.)
7628          */
7629         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
7630
7631         /*
7632          * Prepare to accumulate statistics.
7633          *
7634          * Note: because it is possible for log_checkpoints to change while a
7635          * checkpoint proceeds, we always accumulate stats, even if
7636          * log_checkpoints is currently off.
7637          */
7638         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7639         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7640
7641         /*
7642          * Use a critical section to force system panic if we have trouble.
7643          */
7644         START_CRIT_SECTION();
7645
7646         if (shutdown)
7647         {
7648                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7649                 ControlFile->state = DB_SHUTDOWNING;
7650                 ControlFile->time = (pg_time_t) time(NULL);
7651                 UpdateControlFile();
7652                 LWLockRelease(ControlFileLock);
7653         }
7654
7655         /*
7656          * Let smgr prepare for checkpoint; this has to happen before we determine
7657          * the REDO pointer.  Note that smgr must not do anything that'd have to
7658          * be undone if we decide no checkpoint is needed.
7659          */
7660         smgrpreckpt();
7661
7662         /* Begin filling in the checkpoint WAL record */
7663         MemSet(&checkPoint, 0, sizeof(checkPoint));
7664         checkPoint.time = (pg_time_t) time(NULL);
7665
7666         /*
7667          * For Hot Standby, derive the oldestActiveXid before we fix the redo pointer.
7668          * This allows us to begin accumulating changes to assemble our starting
7669          * snapshot of locks and transactions.
7670          */
7671         if (!shutdown && XLogStandbyInfoActive())
7672                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
7673         else
7674                 checkPoint.oldestActiveXid = InvalidTransactionId;
7675
7676         /*
7677          * We must hold WALInsertLock while examining insert state to determine
7678          * the checkpoint REDO pointer.
7679          */
7680         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7681
7682         /*
7683          * If this isn't a shutdown or forced checkpoint, and we have not switched
7684          * to the next WAL file since the start of the last checkpoint, skip the
7685          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
7686          * when the system is idle. That wastes log space, and more importantly it
7687          * exposes us to possible loss of both current and previous checkpoint
7688          * records if the machine crashes just as we're writing the update.
7689          * (Perhaps it'd make even more sense to checkpoint only when the previous
7690          * checkpoint record is in a different xlog page?)
7691          *
7692          * While holding the WALInsertLock we find the current WAL insertion point
7693          * and compare that with the starting point of the last checkpoint, which
7694          * is the redo pointer. We use the redo pointer because the start and end
7695          * points of a checkpoint can be hundreds of files apart on large systems
7696          * when checkpoint writes are spread out over time.
              *
              * When we do skip, both locks taken so far (WALInsertLock and
              * CheckpointLock) are released and the critical section is exited
              * before returning.
7697          */
7698         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
7699                                   CHECKPOINT_FORCE)) == 0)
7700         {
7701                 XLogRecPtr      curInsert;
7702
7703                 INSERT_RECPTR(curInsert, Insert, Insert->curridx);
7704                 XLByteToSeg(curInsert, insert_logId, insert_logSeg);
7705                 XLByteToSeg(ControlFile->checkPointCopy.redo, redo_logId, redo_logSeg);
7706                 if (insert_logId == redo_logId &&
7707                         insert_logSeg == redo_logSeg)
7708                 {
7709                         LWLockRelease(WALInsertLock);
7710                         LWLockRelease(CheckpointLock);
7711                         END_CRIT_SECTION();
7712                         return;
7713                 }
7714         }
7715
7716         /*
7717          * An end-of-recovery checkpoint is created before anyone is allowed to
7718          * write WAL. To allow us to write the checkpoint record, temporarily
7719          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
7720          * initialized, which we need here and in AdvanceXLInsertBuffer.)
7721          */
7722         if (flags & CHECKPOINT_END_OF_RECOVERY)
7723                 LocalSetXLogInsertAllowed();
7724
7725         checkPoint.ThisTimeLineID = ThisTimeLineID;
7726         checkPoint.fullPageWrites = Insert->fullPageWrites;
7727
7728         /*
7729          * Compute new REDO record ptr = location of next XLOG record.
7730          *
7731          * NB: this is NOT necessarily where the checkpoint record itself will be,
7732          * since other backends may insert more XLOG records while we're off doing
7733          * the buffer flush work.  Those XLOG records are logically after the
7734          * checkpoint, even though physically before it.  Got that?
7735          */
7736         freespace = INSERT_FREESPACE(Insert);
7737         if (freespace < SizeOfXLogRecord)
7738         {
7739                 (void) AdvanceXLInsertBuffer(false);
7740                 /* OK to ignore update return flag, since we will do flush anyway */
7741                 freespace = INSERT_FREESPACE(Insert);
7742         }
7743         INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
7744
7745         /*
7746          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
7747          * must be done while holding the insert lock AND the info_lck.
7748          *
7749          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
7750          * pointing past where it really needs to point.  This is okay; the only
7751          * consequence is that XLogInsert might back up whole buffers that it
7752          * didn't really need to.  We can't postpone advancing RedoRecPtr because
7753          * XLogInserts that happen while we are dumping buffers must assume that
7754          * their buffer changes are not included in the checkpoint.
7755          */
7756         {
7757                 /* use volatile pointer to prevent code rearrangement */
7758                 volatile XLogCtlData *xlogctl = XLogCtl;
7759
7760                 SpinLockAcquire(&xlogctl->info_lck);
7761                 RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
7762                 SpinLockRelease(&xlogctl->info_lck);
7763         }
7764
7765         /*
7766          * Now we can release WAL insert lock, allowing other xacts to proceed
7767          * while we are flushing disk buffers.
7768          */
7769         LWLockRelease(WALInsertLock);
7770
7771         /*
7772          * If enabled, log checkpoint start.  We postpone this until now so as not
7773          * to log anything if we decided to skip the checkpoint.
7774          */
7775         if (log_checkpoints)
7776                 LogCheckpointStart(flags, false);
7777
7778         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
7779
7780         /*
7781          * Before flushing data, we must wait for any transactions that are
7782          * currently in their commit critical sections.  If an xact inserted its
7783          * commit record into XLOG just before the REDO point, then a crash
7784          * restart from the REDO point would not replay that record, which means
7785          * that our flushing had better include the xact's update of pg_clog.  So
7786          * we wait till he's out of his commit critical section before proceeding.
7787          * See notes in RecordTransactionCommit().
7788          *
7789          * Because we've already released WALInsertLock, this test is a bit fuzzy:
7790          * it is possible that we will wait for xacts we didn't really need to
7791          * wait for.  But the delay should be short and it seems better to make
7792          * checkpoint take a bit longer than to hold locks longer than necessary.
7793          * (In fact, the whole reason we have this issue is that xact.c does
7794          * commit record XLOG insertion and clog update as two separate steps
7795          * protected by different locks, but again that seems best on grounds of
7796          * minimizing lock contention.)
7797          *
7798          * A transaction that has not yet set inCommit when we look cannot be at
7799          * risk, since he's not inserted his commit record yet; and one that's
7800          * already cleared it is not at risk either, since he's done fixing clog
7801          * and we will correctly flush the update below.  So we cannot miss any
7802          * xacts we need to wait for.
              *
              * The wait is a simple poll: sleep 10 msec, then re-check the same set
              * of XIDs.  The XID array handed back by GetTransactionsInCommit is
              * freed afterwards whether or not nInCommit was zero.
7803          */
7804         nInCommit = GetTransactionsInCommit(&inCommitXids);
7805         if (nInCommit > 0)
7806         {
7807                 do
7808                 {
7809                         pg_usleep(10000L);      /* wait for 10 msec */
7810                 } while (HaveTransactionsInCommit(inCommitXids, nInCommit));
7811         }
7812         pfree(inCommitXids);
7813
7814         /*
7815          * Get the other info we need for the checkpoint record.
7816          */
7817         LWLockAcquire(XidGenLock, LW_SHARED);
7818         checkPoint.nextXid = ShmemVariableCache->nextXid;
7819         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
7820         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
7821         LWLockRelease(XidGenLock);
7822
7823         /* Increase XID epoch if we've wrapped around since last checkpoint */
7824         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
7825         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
7826                 checkPoint.nextXidEpoch++;
7827
             /*
              * For a non-shutdown checkpoint the recorded nextOid is advanced by
              * the current oidCount.  NOTE(review): presumably oidCount is the
              * number of OIDs still available from the last logged allocation;
              * confirm against the OID generator before relying on this.
              */
7828         LWLockAcquire(OidGenLock, LW_SHARED);
7829         checkPoint.nextOid = ShmemVariableCache->nextOid;
7830         if (!shutdown)
7831                 checkPoint.nextOid += ShmemVariableCache->oidCount;
7832         LWLockRelease(OidGenLock);
7833
7834         MultiXactGetCheckptMulti(shutdown,
7835                                                          &checkPoint.nextMulti,
7836                                                          &checkPoint.nextMultiOffset);
7837
7838         /*
7839          * Having constructed the checkpoint record, ensure all shmem disk buffers
7840          * and commit-log buffers are flushed to disk.
7841          *
7842          * This I/O could fail for various reasons.  If so, we will fail to
7843          * complete the checkpoint, but there is no reason to force a system
7844          * panic. Accordingly, exit critical section while doing it.
7845          */
7846         END_CRIT_SECTION();
7847
7848         CheckPointGuts(checkPoint.redo, flags);
7849
7850         /*
7851          * Take a snapshot of running transactions and write this to WAL. This
7852          * allows us to reconstruct the state of running transactions during
7853          * archive recovery, if required. Skip, if this info disabled.
7854          *
7855          * If we are shutting down, or Startup process is completing crash
7856          * recovery we don't need to write running xact data.
7857          *
7858          * Update checkPoint.nextXid since we have a later value
7859          */
7860         if (!shutdown && XLogStandbyInfoActive())
7861                 LogStandbySnapshot(&checkPoint.nextXid);
7862
7863         START_CRIT_SECTION();
7864
7865         /*
7866          * Now insert the checkpoint record into XLOG.
7867          */
7868         rdata.data = (char *) (&checkPoint);
7869         rdata.len = sizeof(checkPoint);
7870         rdata.buffer = InvalidBuffer;
7871         rdata.next = NULL;
7872
7873         recptr = XLogInsert(RM_XLOG_ID,
7874                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
7875                                                 XLOG_CHECKPOINT_ONLINE,
7876                                                 &rdata);
7877
7878         XLogFlush(recptr);
7879
7880         /*
7881          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
7882          * overwritten at next startup.  No-one should even try, this just allows
7883          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
7884          * to just temporarily disable writing until the system has exited
7885          * recovery.
7886          */
7887         if (shutdown)
7888         {
7889                 if (flags & CHECKPOINT_END_OF_RECOVERY)
7890                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
7891                 else
7892                         LocalXLogInsertAllowed = 0; /* never again write WAL */
7893         }
7894
7895         /*
7896          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
7897          * = end of actual checkpoint record.
              *
              * For a shutdown checkpoint the record must start exactly at the redo
              * pointer computed earlier; anything else means some other WAL was
              * inserted in between, which is treated as fatal.
7898          */
7899         if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
7900                 ereport(PANIC,
7901                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
7902
7903         /*
7904          * Select point at which we can truncate the log, which we base on the
7905          * prior checkpoint's earliest info.
              *
              * This has to be read here, before the control-file update below
              * replaces checkPointCopy with the new checkpoint.
7906          */
7907         XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
7908
7909         /*
7910          * Update the control file.
7911          */
7912         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7913         if (shutdown)
7914                 ControlFile->state = DB_SHUTDOWNED;
7915         ControlFile->prevCheckPoint = ControlFile->checkPoint;
7916         ControlFile->checkPoint = ProcLastRecPtr;
7917         ControlFile->checkPointCopy = checkPoint;
7918         ControlFile->time = (pg_time_t) time(NULL);
7919         /* crash recovery should always recover to the end of WAL */
7920         MemSet(&ControlFile->minRecoveryPoint, 0, sizeof(XLogRecPtr));
7921         UpdateControlFile();
7922         LWLockRelease(ControlFileLock);
7923
7924         /* Update shared-memory copy of checkpoint XID/epoch */
7925         {
7926                 /* use volatile pointer to prevent code rearrangement */
7927                 volatile XLogCtlData *xlogctl = XLogCtl;
7928
7929                 SpinLockAcquire(&xlogctl->info_lck);
7930                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
7931                 xlogctl->ckptXid = checkPoint.nextXid;
7932                 SpinLockRelease(&xlogctl->info_lck);
7933         }
7934
7935         /*
7936          * We are now done with critical updates; no need for system panic if we
7937          * have trouble while fooling with old log segments.
7938          */
7939         END_CRIT_SECTION();
7940
7941         /*
7942          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
7943          */
7944         smgrpostckpt();
7945
7946         /*
7947          * Delete old log files (those no longer needed even for previous
7948          * checkpoint or the standbys in XLOG streaming).
              *
              * This is skipped when the prior redo pointer maps to log 0, segment 0
              * (both _logId and _logSeg are zero).
7949          */
7950         if (_logId || _logSeg)
7951         {
7952                 KeepLogSeg(recptr, &_logId, &_logSeg);
7953                 PrevLogSeg(_logId, _logSeg);
7954                 RemoveOldXlogFiles(_logId, _logSeg, recptr);
7955         }
7956
7957         /*
7958          * Make more log segments if needed.  (Do this after recycling old log
7959          * segments, since that may supply some of the needed files.)
7960          */
7961         if (!shutdown)
7962                 PreallocXlogFiles(recptr);
7963
7964         /*
7965          * Truncate pg_subtrans if possible.  We can throw away all data before
7966          * the oldest XMIN of any running transaction.  No future transaction will
7967          * attempt to reference any pg_subtrans entry older than that (see Asserts
7968          * in subtrans.c).      During recovery, though, we mustn't do this because
7969          * StartupSUBTRANS hasn't been called yet.
7970          */
7971         if (!RecoveryInProgress())
7972                 TruncateSUBTRANS(GetOldestXmin(true, false));
7973
7974         /* All real work is done, but log before releasing lock. */
7975         if (log_checkpoints)
7976                 LogCheckpointEnd(false);
7977
7978         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
7979                                                                          NBuffers,
7980                                                                          CheckpointStats.ckpt_segs_added,
7981                                                                          CheckpointStats.ckpt_segs_removed,
7982                                                                          CheckpointStats.ckpt_segs_recycled);
7983
7984         LWLockRelease(CheckpointLock);
7985 }
7986
7987 /*
7988  * Flush all data in shared memory to disk, and fsync
7989  *
7990  * This is the common code shared between regular checkpoints and
7991  * recovery restartpoints.
      *
      * checkPointRedo is handed on to CheckPointTwoPhase only, and flags is
      * handed on to CheckPointBuffers only; the remaining steps take no
      * arguments.  The steps run in the fixed order written below, with the
      * two-phase step intentionally last.
7992  */
7993 static void
7994 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
7995 {
7996         CheckPointCLOG();
7997         CheckPointSUBTRANS();
7998         CheckPointMultiXact();
7999         CheckPointPredicate();
8000         CheckPointRelationMap();
8001         CheckPointBuffers(flags);       /* performs all required fsyncs */
8002         /* We deliberately delay 2PC checkpointing as long as possible */
8003         CheckPointTwoPhase(checkPointRedo);
8004 }
8005
8006 /*
8007  * Save a checkpoint for recovery restart if appropriate
8008  *
8009  * This function is called each time a checkpoint record is read from XLOG.
8010  * It must determine whether the checkpoint represents a safe restartpoint or
8011  * not.  If so, the checkpoint record is stashed in shared memory so that
8012  * CreateRestartPoint can consult it.  (Note that the latter function is
8013  * executed by the checkpointer, while this one will be executed by the
8014  * startup process.)
8015  */
8016 static void
8017 RecoveryRestartPoint(const CheckPoint *checkPoint)
8018 {
8019         int                     rmid;
8020
8021         /* use volatile pointer to prevent code rearrangement */
8022         volatile XLogCtlData *xlogctl = XLogCtl;
8023
8024         /*
8025          * Is it safe to restartpoint?  We must ask each of the resource managers
8026          * whether they have any partial state information that might prevent a
8027          * correct restart from this point.  If so, we skip this opportunity, but
8028          * return at the next checkpoint record for another try.
8029          */
8030         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
8031         {
8032                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
8033                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
8034                         {
8035                                 elog(trace_recovery(DEBUG2),
8036                                          "RM %d not safe to record restart point at %X/%X",
8037                                          rmid,
8038                                          checkPoint->redo.xlogid,
8039                                          checkPoint->redo.xrecoff);
8040                                 return;
8041                         }
8042         }
8043
8044         /*
8045          * Also refrain from creating a restartpoint if we have seen any references
8046          * to non-existent pages. Restarting recovery from the restartpoint would
8047          * not see the references, so we would lose the cross-check that the pages
8048          * belonged to a relation that was dropped later.
8049          */
8050         if (XLogHaveInvalidPages())
8051         {
8052                 elog(trace_recovery(DEBUG2),
8053                          "could not record restart point at %X/%X because there "
8054                          "are unresolved references to invalid pages",
8055                          checkPoint->redo.xlogid,
8056                          checkPoint->redo.xrecoff);
8057                 return;
8058         }
8059
8060         /*
8061          * Copy the checkpoint record to shared memory, so that checkpointer
8062          * can work out the next time it wants to perform a restartpoint.
8063          */
8064         SpinLockAcquire(&xlogctl->info_lck);
8065         XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
8066         memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
8067         SpinLockRelease(&xlogctl->info_lck);
8068 }
8069
8070 /*
8071  * Establish a restartpoint if possible.
8072  *
8073  * This is similar to CreateCheckPoint, but is used during WAL recovery
8074  * to establish a point from which recovery can roll forward without
8075  * replaying the entire recovery log.
8076  *
8077  * Returns true if a new restartpoint was established. We can only establish
8078  * a restartpoint if we have replayed a safe checkpoint record since last
8079  * restartpoint.
8080  */
8081 bool
8082 CreateRestartPoint(int flags)
8083 {
8084         XLogRecPtr      lastCheckPointRecPtr;
8085         CheckPoint      lastCheckPoint;
8086         uint32          _logId;
8087         uint32          _logSeg;
8088         TimestampTz xtime;
8089
8090         /* use volatile pointer to prevent code rearrangement */
8091         volatile XLogCtlData *xlogctl = XLogCtl;
8092
8093         /*
8094          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
8095          * happens at a time.
8096          */
8097         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8098
8099         /* Get a local copy of the last safe checkpoint record. */
8100         SpinLockAcquire(&xlogctl->info_lck);
8101         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
8102         memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
8103         SpinLockRelease(&xlogctl->info_lck);
8104
8105         /*
8106          * Check that we're still in recovery mode. It's ok if we exit recovery
8107          * mode after this check, the restart point is valid anyway.
8108          */
8109         if (!RecoveryInProgress())
8110         {
8111                 ereport(DEBUG2,
8112                           (errmsg("skipping restartpoint, recovery has already ended")));
8113                 LWLockRelease(CheckpointLock);
8114                 return false;
8115         }
8116
8117         /*
8118          * If the last checkpoint record we've replayed is already our last
8119          * restartpoint, we can't perform a new restart point. We still update
8120          * minRecoveryPoint in that case, so that if this is a shutdown restart
8121          * point, we won't start up earlier than before. That's not strictly
8122          * necessary, but when hot standby is enabled, it would be rather weird if
8123          * the database opened up for read-only connections at a point-in-time
8124          * before the last shutdown. Such time travel is still possible in case of
8125          * immediate shutdown, though.
8126          *
8127          * We don't explicitly advance minRecoveryPoint when we do create a
8128          * restartpoint. It's assumed that flushing the buffers will do that as a
8129          * side-effect.
8130          */
8131         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
8132                 XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
8133         {
8134                 ereport(DEBUG2,
8135                                 (errmsg("skipping restartpoint, already performed at %X/%X",
8136                                   lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
8137
8138                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
8139                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8140                 {
8141                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8142                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8143                         ControlFile->time = (pg_time_t) time(NULL);
8144                         UpdateControlFile();
8145                         LWLockRelease(ControlFileLock);
8146                 }
8147                 LWLockRelease(CheckpointLock);
8148                 return false;
8149         }
8150
8151         /*
8152          * Update the shared RedoRecPtr so that the startup process can calculate
8153          * the number of segments replayed since last restartpoint, and request a
8154          * restartpoint if it exceeds checkpoint_segments.
8155          *
8156          * You need to hold WALInsertLock and info_lck to update it, although
8157          * during recovery acquiring WALInsertLock is just pro forma, because
8158          * there is no other processes updating Insert.RedoRecPtr.
8159          */
8160         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8161         SpinLockAcquire(&xlogctl->info_lck);
8162         xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
8163         SpinLockRelease(&xlogctl->info_lck);
8164         LWLockRelease(WALInsertLock);
8165
8166         /*
8167          * Prepare to accumulate statistics.
8168          *
8169          * Note: because it is possible for log_checkpoints to change while a
8170          * checkpoint proceeds, we always accumulate stats, even if
8171          * log_checkpoints is currently off.
8172          */
8173         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8174         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8175
8176         if (log_checkpoints)
8177                 LogCheckpointStart(flags, true);
8178
8179         CheckPointGuts(lastCheckPoint.redo, flags);
8180
8181         /*
8182          * Select point at which we can truncate the xlog, which we base on the
8183          * prior checkpoint's earliest info.
8184          */
8185         XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
8186
8187         /*
8188          * Update pg_control, using current time.  Check that it still shows
8189          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
8190          * this is a quick hack to make sure nothing really bad happens if somehow
8191          * we get here after the end-of-recovery checkpoint.
8192          */
8193         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8194         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
8195                 XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo))
8196         {
8197                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
8198                 ControlFile->checkPoint = lastCheckPointRecPtr;
8199                 ControlFile->checkPointCopy = lastCheckPoint;
8200                 ControlFile->time = (pg_time_t) time(NULL);
8201                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8202                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8203                 UpdateControlFile();
8204         }
8205         LWLockRelease(ControlFileLock);
8206
8207         /*
8208          * Delete old log files (those no longer needed even for previous
8209          * checkpoint/restartpoint) to prevent the disk holding the xlog from
8210          * growing full.
8211          */
8212         if (_logId || _logSeg)
8213         {
8214                 XLogRecPtr      endptr;
8215
8216                 /* Get the current (or recent) end of xlog */
8217                 endptr = GetStandbyFlushRecPtr();
8218
8219                 KeepLogSeg(endptr, &_logId, &_logSeg);
8220                 PrevLogSeg(_logId, _logSeg);
8221                 RemoveOldXlogFiles(_logId, _logSeg, endptr);
8222
8223                 /*
8224                  * Make more log segments if needed.  (Do this after recycling old log
8225                  * segments, since that may supply some of the needed files.)
8226                  */
8227                 PreallocXlogFiles(endptr);
8228         }
8229
8230         /*
8231          * Truncate pg_subtrans if possible.  We can throw away all data before
8232          * the oldest XMIN of any running transaction.  No future transaction will
8233          * attempt to reference any pg_subtrans entry older than that (see Asserts
8234          * in subtrans.c).      When hot standby is disabled, though, we mustn't do
8235          * this because StartupSUBTRANS hasn't been called yet.
8236          */
8237         if (EnableHotStandby)
8238                 TruncateSUBTRANS(GetOldestXmin(true, false));
8239
8240         /* All real work is done, but log before releasing lock. */
8241         if (log_checkpoints)
8242                 LogCheckpointEnd(true);
8243
8244         xtime = GetLatestXTime();
8245         ereport((log_checkpoints ? LOG : DEBUG2),
8246                         (errmsg("recovery restart point at %X/%X",
8247                                         lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff),
8248                    xtime ? errdetail("last completed transaction was at log time %s",
8249                                                          timestamptz_to_str(xtime)) : 0));
8250
8251         LWLockRelease(CheckpointLock);
8252
8253         /*
8254          * Finally, execute archive_cleanup_command, if any.
8255          */
8256         if (XLogCtl->archiveCleanupCommand[0])
8257                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
8258                                                            "archive_cleanup_command",
8259                                                            false);
8260
8261         return true;
8262 }
8263
8264 /*
8265  * Calculate the last segment that we need to retain because of
8266  * wal_keep_segments, by subtracting wal_keep_segments from
8267  * the given xlog location, recptr.
8268  */
8269 static void
8270 KeepLogSeg(XLogRecPtr recptr, uint32 *logId, uint32 *logSeg)
8271 {
8272         uint32          log;
8273         uint32          seg;
8274         int                     d_log;
8275         int                     d_seg;
8276
8277         if (wal_keep_segments == 0)
8278                 return;
8279
8280         XLByteToSeg(recptr, log, seg);
8281
8282         d_seg = wal_keep_segments % XLogSegsPerFile;
8283         d_log = wal_keep_segments / XLogSegsPerFile;
8284         if (seg < d_seg)
8285         {
8286                 d_log += 1;
8287                 seg = seg - d_seg + XLogSegsPerFile;
8288         }
8289         else
8290                 seg = seg - d_seg;
8291         /* avoid underflow, don't go below (0,1) */
8292         if (log < d_log || (log == d_log && seg == 0))
8293         {
8294                 log = 0;
8295                 seg = 1;
8296         }
8297         else
8298                 log = log - d_log;
8299
8300         /* don't delete WAL segments newer than the calculated segment */
8301         if (log < *logId || (log == *logId && seg < *logSeg))
8302         {
8303                 *logId = log;
8304                 *logSeg = seg;
8305         }
8306 }
8307
8308 /*
8309  * Write a NEXTOID log record
8310  */
8311 void
8312 XLogPutNextOid(Oid nextOid)
8313 {
8314         XLogRecData rdata;
8315
8316         rdata.data = (char *) (&nextOid);
8317         rdata.len = sizeof(Oid);
8318         rdata.buffer = InvalidBuffer;
8319         rdata.next = NULL;
8320         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
8321
8322         /*
8323          * We need not flush the NEXTOID record immediately, because any of the
8324          * just-allocated OIDs could only reach disk as part of a tuple insert or
8325          * update that would have its own XLOG record that must follow the NEXTOID
8326          * record.      Therefore, the standard buffer LSN interlock applied to those
8327          * records will ensure no such OID reaches disk before the NEXTOID record
8328          * does.
8329          *
8330          * Note, however, that the above statement only covers state "within" the
8331          * database.  When we use a generated OID as a file or directory name, we
8332          * are in a sense violating the basic WAL rule, because that filesystem
8333          * change may reach disk before the NEXTOID WAL record does.  The impact
8334          * of this is that if a database crash occurs immediately afterward, we
8335          * might after restart re-generate the same OID and find that it conflicts
8336          * with the leftover file or directory.  But since for safety's sake we
8337          * always loop until finding a nonconflicting filename, this poses no real
8338          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
8339          */
8340 }
8341
8342 /*
8343  * Write an XLOG SWITCH record.
8344  *
8345  * Here we just blindly issue an XLogInsert request for the record.
8346  * All the magic happens inside XLogInsert.
8347  *
8348  * The return value is either the end+1 address of the switch record,
8349  * or the end+1 address of the prior segment if we did not need to
8350  * write a switch record because we are already at segment start.
8351  */
8352 XLogRecPtr
8353 RequestXLogSwitch(void)
8354 {
8355         XLogRecPtr      RecPtr;
8356         XLogRecData rdata;
8357
8358         /* XLOG SWITCH, alone among xlog record types, has no data */
8359         rdata.buffer = InvalidBuffer;
8360         rdata.data = NULL;
8361         rdata.len = 0;
8362         rdata.next = NULL;
8363
8364         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
8365
8366         return RecPtr;
8367 }
8368
8369 /*
8370  * Write a RESTORE POINT record
8371  */
8372 XLogRecPtr
8373 XLogRestorePoint(const char *rpName)
8374 {
8375         XLogRecPtr      RecPtr;
8376         XLogRecData rdata;
8377         xl_restore_point xlrec;
8378
8379         xlrec.rp_time = GetCurrentTimestamp();
8380         strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
8381
8382         rdata.buffer = InvalidBuffer;
8383         rdata.data = (char *) &xlrec;
8384         rdata.len = sizeof(xl_restore_point);
8385         rdata.next = NULL;
8386
8387         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
8388
8389         ereport(LOG,
8390                         (errmsg("restore point \"%s\" created at %X/%X",
8391                                         rpName, RecPtr.xlogid, RecPtr.xrecoff)));
8392
8393         return RecPtr;
8394 }
8395
8396 /*
8397  * Check if any of the GUC parameters that are critical for hot standby
8398  * have changed, and update the value in pg_control file if necessary.
8399  */
8400 static void
8401 XLogReportParameters(void)
8402 {
8403         if (wal_level != ControlFile->wal_level ||
8404                 MaxConnections != ControlFile->MaxConnections ||
8405                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
8406                 max_locks_per_xact != ControlFile->max_locks_per_xact)
8407         {
8408                 /*
8409                  * The change in number of backend slots doesn't need to be WAL-logged
8410                  * if archiving is not enabled, as you can't start archive recovery
8411                  * with wal_level=minimal anyway. We don't really care about the
8412                  * values in pg_control either if wal_level=minimal, but seems better
8413                  * to keep them up-to-date to avoid confusion.
8414                  */
8415                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
8416                 {
8417                         XLogRecData rdata;
8418                         xl_parameter_change xlrec;
8419
8420                         xlrec.MaxConnections = MaxConnections;
8421                         xlrec.max_prepared_xacts = max_prepared_xacts;
8422                         xlrec.max_locks_per_xact = max_locks_per_xact;
8423                         xlrec.wal_level = wal_level;
8424
8425                         rdata.buffer = InvalidBuffer;
8426                         rdata.data = (char *) &xlrec;
8427                         rdata.len = sizeof(xlrec);
8428                         rdata.next = NULL;
8429
8430                         XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
8431                 }
8432
8433                 ControlFile->MaxConnections = MaxConnections;
8434                 ControlFile->max_prepared_xacts = max_prepared_xacts;
8435                 ControlFile->max_locks_per_xact = max_locks_per_xact;
8436                 ControlFile->wal_level = wal_level;
8437                 UpdateControlFile();
8438         }
8439 }
8440
8441 /*
8442  * Update full_page_writes in shared memory, and write an
8443  * XLOG_FPW_CHANGE record if necessary.
8444  *
8445  * Note: this function assumes there is no other process running
8446  * concurrently that could update it.
8447  */
8448 void
8449 UpdateFullPageWrites(void)
8450 {
8451         XLogCtlInsert *Insert = &XLogCtl->Insert;
8452
8453         /*
8454          * Do nothing if full_page_writes has not been changed.
8455          *
8456          * It's safe to check the shared full_page_writes without the lock,
8457          * because we assume that there is no concurrently running process
8458          * which can update it.
8459          */
8460         if (fullPageWrites == Insert->fullPageWrites)
8461                 return;
8462
8463         START_CRIT_SECTION();
8464
8465         /*
8466          * It's always safe to take full page images, even when not strictly
8467          * required, but not the other round. So if we're setting full_page_writes
8468          * to true, first set it true and then write the WAL record. If we're
8469          * setting it to false, first write the WAL record and then set the
8470          * global flag.
8471          */
8472         if (fullPageWrites)
8473         {
8474                 LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8475                 Insert->fullPageWrites = true;
8476                 LWLockRelease(WALInsertLock);
8477         }
8478
8479         /*
8480          * Write an XLOG_FPW_CHANGE record. This allows us to keep
8481          * track of full_page_writes during archive recovery, if required.
8482          */
8483         if (XLogStandbyInfoActive() && !RecoveryInProgress())
8484         {
8485                 XLogRecData     rdata;
8486
8487                 rdata.data = (char *) (&fullPageWrites);
8488                 rdata.len = sizeof(bool);
8489                 rdata.buffer = InvalidBuffer;
8490                 rdata.next = NULL;
8491
8492                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
8493         }
8494
8495         if (!fullPageWrites)
8496         {
8497                 LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8498                 Insert->fullPageWrites = false;
8499                 LWLockRelease(WALInsertLock);
8500         }
8501         END_CRIT_SECTION();
8502 }
8503
8504 /*
8505  * XLOG resource manager's routines
8506  *
8507  * Definitions of info values are in include/catalog/pg_control.h, though
8508  * not all record types are related to control file updates.
8509  */
8510 void
8511 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
8512 {
8513         uint8           info = record->xl_info & ~XLR_INFO_MASK;
8514
8515         /* Backup blocks are not used in xlog records */
8516         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
8517
8518         if (info == XLOG_NEXTOID)
8519         {
8520                 Oid                     nextOid;
8521
8522                 /*
8523                  * We used to try to take the maximum of ShmemVariableCache->nextOid
8524                  * and the recorded nextOid, but that fails if the OID counter wraps
8525                  * around.  Since no OID allocation should be happening during replay
8526                  * anyway, better to just believe the record exactly.  We still take
8527                  * OidGenLock while setting the variable, just in case.
8528                  */
8529                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
8530                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8531                 ShmemVariableCache->nextOid = nextOid;
8532                 ShmemVariableCache->oidCount = 0;
8533                 LWLockRelease(OidGenLock);
8534         }
8535         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
8536         {
8537                 CheckPoint      checkPoint;
8538
8539                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8540                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
8541                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8542                 ShmemVariableCache->nextXid = checkPoint.nextXid;
8543                 LWLockRelease(XidGenLock);
8544                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8545                 ShmemVariableCache->nextOid = checkPoint.nextOid;
8546                 ShmemVariableCache->oidCount = 0;
8547                 LWLockRelease(OidGenLock);
8548                 MultiXactSetNextMXact(checkPoint.nextMulti,
8549                                                           checkPoint.nextMultiOffset);
8550                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
8551
8552                 /*
8553                  * If we see a shutdown checkpoint while waiting for an end-of-backup
8554                  * record, the backup was canceled and the end-of-backup record will
8555                  * never arrive.
8556                  */
8557                 if (InArchiveRecovery &&
8558                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
8559                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
8560                         ereport(PANIC,
8561                                         (errmsg("online backup was canceled, recovery cannot continue")));
8562
8563                 /*
8564                  * If we see a shutdown checkpoint, we know that nothing was running
8565                  * on the master at this point. So fake-up an empty running-xacts
8566                  * record and use that here and now. Recover additional standby state
8567                  * for prepared transactions.
8568                  */
8569                 if (standbyState >= STANDBY_INITIALIZED)
8570                 {
8571                         TransactionId *xids;
8572                         int                     nxids;
8573                         TransactionId oldestActiveXID;
8574                         TransactionId latestCompletedXid;
8575                         RunningTransactionsData running;
8576
8577                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
8578
8579                         /*
8580                          * Construct a RunningTransactions snapshot representing a shut
8581                          * down server, with only prepared transactions still alive. We're
8582                          * never overflowed at this point because all subxids are listed
8583                          * with their parent prepared transactions.
8584                          */
8585                         running.xcnt = nxids;
8586                         running.subxid_overflow = false;
8587                         running.nextXid = checkPoint.nextXid;
8588                         running.oldestRunningXid = oldestActiveXID;
8589                         latestCompletedXid = checkPoint.nextXid;
8590                         TransactionIdRetreat(latestCompletedXid);
8591                         Assert(TransactionIdIsNormal(latestCompletedXid));
8592                         running.latestCompletedXid = latestCompletedXid;
8593                         running.xids = xids;
8594
8595                         ProcArrayApplyRecoveryInfo(&running);
8596
8597                         StandbyRecoverPreparedTransactions(true);
8598                 }
8599
8600                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8601                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
8602                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8603
8604                 /*
8605                  * TLI may change in a shutdown checkpoint, but it shouldn't decrease
8606                  */
8607                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
8608                 {
8609                         if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
8610                                 !list_member_int(expectedTLIs,
8611                                                                  (int) checkPoint.ThisTimeLineID))
8612                                 ereport(PANIC,
8613                                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
8614                                                                 checkPoint.ThisTimeLineID, ThisTimeLineID)));
8615                         /* Following WAL records should be run with new TLI */
8616                         ThisTimeLineID = checkPoint.ThisTimeLineID;
8617                 }
8618
8619                 RecoveryRestartPoint(&checkPoint);
8620         }
8621         else if (info == XLOG_CHECKPOINT_ONLINE)
8622         {
8623                 CheckPoint      checkPoint;
8624
8625                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8626                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
8627                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8628                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
8629                                                                   checkPoint.nextXid))
8630                         ShmemVariableCache->nextXid = checkPoint.nextXid;
8631                 LWLockRelease(XidGenLock);
8632                 /* ... but still treat OID counter as exact */
8633                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8634                 ShmemVariableCache->nextOid = checkPoint.nextOid;
8635                 ShmemVariableCache->oidCount = 0;
8636                 LWLockRelease(OidGenLock);
8637                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
8638                                                                   checkPoint.nextMultiOffset);
8639                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
8640                                                                   checkPoint.oldestXid))
8641                         SetTransactionIdLimit(checkPoint.oldestXid,
8642                                                                   checkPoint.oldestXidDB);
8643
8644                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8645                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
8646                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8647
8648                 /* TLI should not change in an on-line checkpoint */
8649                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
8650                         ereport(PANIC,
8651                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
8652                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
8653
8654                 RecoveryRestartPoint(&checkPoint);
8655         }
8656         else if (info == XLOG_NOOP)
8657         {
8658                 /* nothing to do here */
8659         }
8660         else if (info == XLOG_SWITCH)
8661         {
8662                 /* nothing to do here */
8663         }
8664         else if (info == XLOG_RESTORE_POINT)
8665         {
8666                 /* nothing to do here */
8667         }
8668         else if (info == XLOG_BACKUP_END)
8669         {
8670                 XLogRecPtr      startpoint;
8671
8672                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
8673
8674                 if (XLByteEQ(ControlFile->backupStartPoint, startpoint))
8675                 {
8676                         /*
8677                          * We have reached the end of base backup, the point where
8678                          * pg_stop_backup() was done. The data on disk is now consistent.
8679                          * Reset backupStartPoint, and update minRecoveryPoint to make
8680                          * sure we don't allow starting up at an earlier point even if
8681                          * recovery is stopped and restarted soon after this.
8682                          */
8683                         elog(DEBUG1, "end of backup reached");
8684
8685                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8686
8687                         if (XLByteLT(ControlFile->minRecoveryPoint, lsn))
8688                                 ControlFile->minRecoveryPoint = lsn;
8689                         MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
8690                         ControlFile->backupEndRequired = false;
8691                         UpdateControlFile();
8692
8693                         LWLockRelease(ControlFileLock);
8694                 }
8695         }
8696         else if (info == XLOG_PARAMETER_CHANGE)
8697         {
8698                 xl_parameter_change xlrec;
8699
8700                 /* Update our copy of the parameters in pg_control */
8701                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
8702
8703                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8704                 ControlFile->MaxConnections = xlrec.MaxConnections;
8705                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
8706                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
8707                 ControlFile->wal_level = xlrec.wal_level;
8708
8709                 /*
8710                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
8711                  * recover back up to this point before allowing hot standby again.
8712                  * This is particularly important if wal_level was set to 'archive'
8713                  * before, and is now 'hot_standby', to ensure you don't run queries
8714                  * against the WAL preceding the wal_level change. Same applies to
8715                  * decreasing max_* settings.
8716                  */
8717                 minRecoveryPoint = ControlFile->minRecoveryPoint;
8718                 if ((minRecoveryPoint.xlogid != 0 || minRecoveryPoint.xrecoff != 0)
8719                         && XLByteLT(minRecoveryPoint, lsn))
8720                 {
8721                         ControlFile->minRecoveryPoint = lsn;
8722                 }
8723
8724                 UpdateControlFile();
8725                 LWLockRelease(ControlFileLock);
8726
8727                 /* Check to see if any changes to max_connections give problems */
8728                 CheckRequiredParameterValues();
8729         }
8730         else if (info == XLOG_FPW_CHANGE)
8731         {
8732                 /* use volatile pointer to prevent code rearrangement */
8733                 volatile XLogCtlData *xlogctl = XLogCtl;
8734                 bool            fpw;
8735
8736                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
8737
8738                 /*
8739                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record
8740                  * so that do_pg_start_backup() and do_pg_stop_backup() can check
8741                  * whether full_page_writes has been disabled during online backup.
8742                  */
8743                 if (!fpw)
8744                 {
8745                         SpinLockAcquire(&xlogctl->info_lck);
8746                         if (XLByteLT(xlogctl->lastFpwDisableRecPtr, ReadRecPtr))
8747                                 xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
8748                         SpinLockRelease(&xlogctl->info_lck);
8749                 }
8750
8751                 /* Keep track of full_page_writes */
8752                 lastFullPageWrites = fpw;
8753         }
8754 }
8755
8756 void
8757 xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
8758 {
8759         uint8           info = xl_info & ~XLR_INFO_MASK;
8760
8761         if (info == XLOG_CHECKPOINT_SHUTDOWN ||
8762                 info == XLOG_CHECKPOINT_ONLINE)
8763         {
8764                 CheckPoint *checkpoint = (CheckPoint *) rec;
8765
8766                 appendStringInfo(buf, "checkpoint: redo %X/%X; "
8767                                                  "tli %u; fpw %s; xid %u/%u; oid %u; multi %u; offset %u; "
8768                                                  "oldest xid %u in DB %u; oldest running xid %u; %s",
8769                                                  checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
8770                                                  checkpoint->ThisTimeLineID,
8771                                                  checkpoint->fullPageWrites ? "true" : "false",
8772                                                  checkpoint->nextXidEpoch, checkpoint->nextXid,
8773                                                  checkpoint->nextOid,
8774                                                  checkpoint->nextMulti,
8775                                                  checkpoint->nextMultiOffset,
8776                                                  checkpoint->oldestXid,
8777                                                  checkpoint->oldestXidDB,
8778                                                  checkpoint->oldestActiveXid,
8779                                  (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
8780         }
8781         else if (info == XLOG_NOOP)
8782         {
8783                 appendStringInfo(buf, "xlog no-op");
8784         }
8785         else if (info == XLOG_NEXTOID)
8786         {
8787                 Oid                     nextOid;
8788
8789                 memcpy(&nextOid, rec, sizeof(Oid));
8790                 appendStringInfo(buf, "nextOid: %u", nextOid);
8791         }
8792         else if (info == XLOG_SWITCH)
8793         {
8794                 appendStringInfo(buf, "xlog switch");
8795         }
8796         else if (info == XLOG_RESTORE_POINT)
8797         {
8798                 xl_restore_point *xlrec = (xl_restore_point *) rec;
8799
8800                 appendStringInfo(buf, "restore point: %s", xlrec->rp_name);
8801
8802         }
8803         else if (info == XLOG_BACKUP_END)
8804         {
8805                 XLogRecPtr      startpoint;
8806
8807                 memcpy(&startpoint, rec, sizeof(XLogRecPtr));
8808                 appendStringInfo(buf, "backup end: %X/%X",
8809                                                  startpoint.xlogid, startpoint.xrecoff);
8810         }
8811         else if (info == XLOG_PARAMETER_CHANGE)
8812         {
8813                 xl_parameter_change xlrec;
8814                 const char *wal_level_str;
8815                 const struct config_enum_entry *entry;
8816
8817                 memcpy(&xlrec, rec, sizeof(xl_parameter_change));
8818
8819                 /* Find a string representation for wal_level */
8820                 wal_level_str = "?";
8821                 for (entry = wal_level_options; entry->name; entry++)
8822                 {
8823                         if (entry->val == xlrec.wal_level)
8824                         {
8825                                 wal_level_str = entry->name;
8826                                 break;
8827                         }
8828                 }
8829
8830                 appendStringInfo(buf, "parameter change: max_connections=%d max_prepared_xacts=%d max_locks_per_xact=%d wal_level=%s",
8831                                                  xlrec.MaxConnections,
8832                                                  xlrec.max_prepared_xacts,
8833                                                  xlrec.max_locks_per_xact,
8834                                                  wal_level_str);
8835         }
8836         else if (info == XLOG_FPW_CHANGE)
8837         {
8838                 bool            fpw;
8839
8840                 memcpy(&fpw, rec, sizeof(bool));
8841                 appendStringInfo(buf, "full_page_writes: %s", fpw ? "true" : "false");
8842         }
8843         else
8844                 appendStringInfo(buf, "UNKNOWN");
8845 }
8846
8847 #ifdef WAL_DEBUG
8848
8849 static void
8850 xlog_outrec(StringInfo buf, XLogRecord *record)
8851 {
8852         int                     i;
8853
8854         appendStringInfo(buf, "prev %X/%X; xid %u",
8855                                          record->xl_prev.xlogid, record->xl_prev.xrecoff,
8856                                          record->xl_xid);
8857
8858         appendStringInfo(buf, "; len %u",
8859                                          record->xl_len);
8860
8861         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
8862         {
8863                 if (record->xl_info & XLR_SET_BKP_BLOCK(i))
8864                         appendStringInfo(buf, "; bkpb%d", i + 1);
8865         }
8866
8867         appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
8868 }
8869 #endif   /* WAL_DEBUG */
8870
8871
8872 /*
8873  * Return the (possible) sync flag used for opening a file, depending on the
8874  * value of the GUC wal_sync_method.
8875  */
8876 static int
8877 get_sync_bit(int method)
8878 {
8879         int                     o_direct_flag = 0;
8880
8881         /* If fsync is disabled, never open in sync mode */
8882         if (!enableFsync)
8883                 return 0;
8884
8885         /*
8886          * Optimize writes by bypassing kernel cache with O_DIRECT when using
8887          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
8888          * disabled, otherwise the archive command or walsender process will read
8889          * the WAL soon after writing it, which is guaranteed to cause a physical
8890          * read if we bypassed the kernel cache. We also skip the
8891          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
8892          * reason.
8893          *
8894          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
8895          * written by walreceiver is normally read by the startup process soon
8896          * after its written. Also, walreceiver performs unaligned writes, which
8897          * don't work with O_DIRECT, so it is required for correctness too.
8898          */
8899         if (!XLogIsNeeded() && !am_walreceiver)
8900                 o_direct_flag = PG_O_DIRECT;
8901
8902         switch (method)
8903         {
8904                         /*
8905                          * enum values for all sync options are defined even if they are
8906                          * not supported on the current platform.  But if not, they are
8907                          * not included in the enum option array, and therefore will never
8908                          * be seen here.
8909                          */
8910                 case SYNC_METHOD_FSYNC:
8911                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
8912                 case SYNC_METHOD_FDATASYNC:
8913                         return 0;
8914 #ifdef OPEN_SYNC_FLAG
8915                 case SYNC_METHOD_OPEN:
8916                         return OPEN_SYNC_FLAG | o_direct_flag;
8917 #endif
8918 #ifdef OPEN_DATASYNC_FLAG
8919                 case SYNC_METHOD_OPEN_DSYNC:
8920                         return OPEN_DATASYNC_FLAG | o_direct_flag;
8921 #endif
8922                 default:
8923                         /* can't happen (unless we are out of sync with option array) */
8924                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
8925                         return 0;                       /* silence warning */
8926         }
8927 }
8928
8929 /*
8930  * GUC support
8931  */
8932 void
8933 assign_xlog_sync_method(int new_sync_method, void *extra)
8934 {
8935         if (sync_method != new_sync_method)
8936         {
8937                 /*
8938                  * To ensure that no blocks escape unsynced, force an fsync on the
8939                  * currently open log segment (if any).  Also, if the open flag is
8940                  * changing, close the log file so it will be reopened (with new flag
8941                  * bit) at next use.
8942                  */
8943                 if (openLogFile >= 0)
8944                 {
8945                         if (pg_fsync(openLogFile) != 0)
8946                                 ereport(PANIC,
8947                                                 (errcode_for_file_access(),
8948                                                  errmsg("could not fsync log file %u, segment %u: %m",
8949                                                                 openLogId, openLogSeg)));
8950                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
8951                                 XLogFileClose();
8952                 }
8953         }
8954 }
8955
8956
8957 /*
8958  * Issue appropriate kind of fsync (if any) for an XLOG output file.
8959  *
8960  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
8961  * 'log' and 'seg' are for error reporting purposes.
8962  */
8963 void
8964 issue_xlog_fsync(int fd, uint32 log, uint32 seg)
8965 {
8966         switch (sync_method)
8967         {
8968                 case SYNC_METHOD_FSYNC:
8969                         if (pg_fsync_no_writethrough(fd) != 0)
8970                                 ereport(PANIC,
8971                                                 (errcode_for_file_access(),
8972                                                  errmsg("could not fsync log file %u, segment %u: %m",
8973                                                                 log, seg)));
8974                         break;
8975 #ifdef HAVE_FSYNC_WRITETHROUGH
8976                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
8977                         if (pg_fsync_writethrough(fd) != 0)
8978                                 ereport(PANIC,
8979                                                 (errcode_for_file_access(),
8980                                                  errmsg("could not fsync write-through log file %u, segment %u: %m",
8981                                                                 log, seg)));
8982                         break;
8983 #endif
8984 #ifdef HAVE_FDATASYNC
8985                 case SYNC_METHOD_FDATASYNC:
8986                         if (pg_fdatasync(fd) != 0)
8987                                 ereport(PANIC,
8988                                                 (errcode_for_file_access(),
8989                                         errmsg("could not fdatasync log file %u, segment %u: %m",
8990                                                    log, seg)));
8991                         break;
8992 #endif
8993                 case SYNC_METHOD_OPEN:
8994                 case SYNC_METHOD_OPEN_DSYNC:
8995                         /* write synced it already */
8996                         break;
8997                 default:
8998                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
8999                         break;
9000         }
9001 }
9002
9003 /*
9004  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
9005  * function. It creates the necessary starting checkpoint and constructs the
9006  * backup label file.
9007  *
9008  * There are two kind of backups: exclusive and non-exclusive. An exclusive
9009  * backup is started with pg_start_backup(), and there can be only one active
9010  * at a time. The backup label file of an exclusive backup is written to
9011  * $PGDATA/backup_label, and it is removed by pg_stop_backup().
9012  *
9013  * A non-exclusive backup is used for the streaming base backups (see
9014  * src/backend/replication/basebackup.c). The difference to exclusive backups
9015  * is that the backup label file is not written to disk. Instead, its would-be
9016  * contents are returned in *labelfile, and the caller is responsible for
9017  * including it in the backup archive as 'backup_label'. There can be many
9018  * non-exclusive backups active at the same time, and they don't conflict
9019  * with an exclusive backup either.
9020  *
9021  * Every successfully started non-exclusive backup must be stopped by calling
9022  * do_pg_stop_backup() or do_pg_abort_backup().
9023  */
9024 XLogRecPtr
9025 do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
9026 {
9027         bool            exclusive = (labelfile == NULL);
9028         bool            backup_started_in_recovery = false;
9029         XLogRecPtr      checkpointloc;
9030         XLogRecPtr      startpoint;
9031         pg_time_t       stamp_time;
9032         char            strfbuf[128];
9033         char            xlogfilename[MAXFNAMELEN];
9034         uint32          _logId;
9035         uint32          _logSeg;
9036         struct stat stat_buf;
9037         FILE       *fp;
9038         StringInfoData labelfbuf;
9039
9040         backup_started_in_recovery = RecoveryInProgress();
9041
9042         if (!superuser() && !is_authenticated_user_replication_role())
9043                 ereport(ERROR,
9044                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
9045                    errmsg("must be superuser or replication role to run a backup")));
9046
9047         /*
9048          * Currently only non-exclusive backup can be taken during recovery.
9049          */
9050         if (backup_started_in_recovery && exclusive)
9051                 ereport(ERROR,
9052                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9053                                  errmsg("recovery is in progress"),
9054                                  errhint("WAL control functions cannot be executed during recovery.")));
9055
9056         /*
9057          * During recovery, we don't need to check WAL level. Because, if WAL level
9058          * is not sufficient, it's impossible to get here during recovery.
9059          */
9060         if (!backup_started_in_recovery && !XLogIsNeeded())
9061                 ereport(ERROR,
9062                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9063                           errmsg("WAL level not sufficient for making an online backup"),
9064                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
9065
9066         if (strlen(backupidstr) > MAXPGPATH)
9067                 ereport(ERROR,
9068                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
9069                                  errmsg("backup label too long (max %d bytes)",
9070                                                 MAXPGPATH)));
9071
9072         /*
9073          * Mark backup active in shared memory.  We must do full-page WAL writes
9074          * during an on-line backup even if not doing so at other times, because
9075          * it's quite possible for the backup dump to obtain a "torn" (partially
9076          * written) copy of a database page if it reads the page concurrently with
9077          * our write to the same page.  This can be fixed as long as the first
9078          * write to the page in the WAL sequence is a full-page write. Hence, we
9079          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
9080          * are no dirty pages in shared memory that might get dumped while the
9081          * backup is in progress without having a corresponding WAL record.  (Once
9082          * the backup is complete, we need not force full-page writes anymore,
9083          * since we expect that any pages not modified during the backup interval
9084          * must have been correctly captured by the backup.)
9085          *
9086          * Note that forcePageWrites has no effect during an online backup from
9087          * the standby.
9088          *
9089          * We must hold WALInsertLock to change the value of forcePageWrites, to
9090          * ensure adequate interlocking against XLogInsert().
9091          */
9092         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9093         if (exclusive)
9094         {
9095                 if (XLogCtl->Insert.exclusiveBackup)
9096                 {
9097                         LWLockRelease(WALInsertLock);
9098                         ereport(ERROR,
9099                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9100                                          errmsg("a backup is already in progress"),
9101                                          errhint("Run pg_stop_backup() and try again.")));
9102                 }
9103                 XLogCtl->Insert.exclusiveBackup = true;
9104         }
9105         else
9106                 XLogCtl->Insert.nonExclusiveBackups++;
9107         XLogCtl->Insert.forcePageWrites = true;
9108         LWLockRelease(WALInsertLock);
9109
9110         /* Ensure we release forcePageWrites if fail below */
9111         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9112         {
9113                 bool            gotUniqueStartpoint = false;
9114
9115                 /*
9116                  * Force an XLOG file switch before the checkpoint, to ensure that the
9117                  * WAL segment the checkpoint is written to doesn't contain pages with
9118                  * old timeline IDs.  That would otherwise happen if you called
9119                  * pg_start_backup() right after restoring from a PITR archive: the
9120                  * first WAL segment containing the startup checkpoint has pages in
9121                  * the beginning with the old timeline ID.  That can cause trouble at
9122                  * recovery: we won't have a history file covering the old timeline if
9123                  * pg_xlog directory was not included in the base backup and the WAL
9124                  * archive was cleared too before starting the backup.
9125                  *
9126                  * This also ensures that we have emitted a WAL page header that has
9127                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
9128                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
9129                  * compress out removable backup blocks, it won't remove any that
9130                  * occur after this point.
9131                  *
9132                  * During recovery, we skip forcing XLOG file switch, which means that
9133                  * the backup taken during recovery is not available for the special
9134                  * recovery case described above.
9135                  */
9136                 if (!backup_started_in_recovery)
9137                         RequestXLogSwitch();
9138
9139                 do
9140                 {
9141                         bool            checkpointfpw;
9142
9143                         /*
9144                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
9145                          * page problems, this guarantees that two successive backup runs
9146                          * will have different checkpoint positions and hence different
9147                          * history file names, even if nothing happened in between.
9148                          *
9149                          * During recovery, establish a restartpoint if possible. We use the last
9150                          * restartpoint as the backup starting checkpoint. This means that two
9151                          * successive backup runs can have same checkpoint positions.
9152                          *
9153                          * Since the fact that we are executing do_pg_start_backup() during
9154                          * recovery means that checkpointer is running, we can use
9155                          * RequestCheckpoint() to establish a restartpoint.
9156                          *
9157                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
9158                          * passing fast = true).  Otherwise this can take awhile.
9159                          */
9160                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
9161                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
9162
9163                         /*
9164                          * Now we need to fetch the checkpoint record location, and also
9165                          * its REDO pointer.  The oldest point in WAL that would be needed
9166                          * to restore starting from the checkpoint is precisely the REDO
9167                          * pointer.
9168                          */
9169                         LWLockAcquire(ControlFileLock, LW_SHARED);
9170                         checkpointloc = ControlFile->checkPoint;
9171                         startpoint = ControlFile->checkPointCopy.redo;
9172                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
9173                         LWLockRelease(ControlFileLock);
9174
9175                         if (backup_started_in_recovery)
9176                         {
9177                                 /* use volatile pointer to prevent code rearrangement */
9178                                 volatile XLogCtlData *xlogctl = XLogCtl;
9179                                 XLogRecPtr              recptr;
9180
9181                                 /*
9182                                  * Check to see if all WAL replayed during online backup (i.e.,
9183                                  * since last restartpoint used as backup starting checkpoint)
9184                                  * contain full-page writes.
9185                                  */
9186                                 SpinLockAcquire(&xlogctl->info_lck);
9187                                 recptr = xlogctl->lastFpwDisableRecPtr;
9188                                 SpinLockRelease(&xlogctl->info_lck);
9189
9190                                 if (!checkpointfpw || XLByteLE(startpoint, recptr))
9191                                         ereport(ERROR,
9192                                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9193                                                          errmsg("WAL generated with full_page_writes=off was replayed "
9194                                                                         "since last restartpoint"),
9195                                                          errhint("This means that the backup being taken on standby "
9196                                                                          "is corrupt and should not be used. "
9197                                                                          "Enable full_page_writes and run CHECKPOINT on the master, "
9198                                                                          "and then try an online backup again.")));
9199
9200                                 /*
9201                                  * During recovery, since we don't use the end-of-backup WAL
9202                                  * record and don't write the backup history file, the starting WAL
9203                                  * location doesn't need to be unique. This means that two base
9204                                  * backups started at the same time might use the same checkpoint
9205                                  * as starting locations.
9206                                  */
9207                                 gotUniqueStartpoint = true;
9208                         }
9209
9210                         /*
9211                          * If two base backups are started at the same time (in WAL sender
9212                          * processes), we need to make sure that they use different
9213                          * checkpoints as starting locations, because we use the starting
9214                          * WAL location as a unique identifier for the base backup in the
9215                          * end-of-backup WAL record and when we write the backup history
9216                          * file. Perhaps it would be better generate a separate unique ID
9217                          * for each backup instead of forcing another checkpoint, but
9218                          * taking a checkpoint right after another is not that expensive
9219                          * either because only few buffers have been dirtied yet.
9220                          */
9221                         LWLockAcquire(WALInsertLock, LW_SHARED);
9222                         if (XLByteLT(XLogCtl->Insert.lastBackupStart, startpoint))
9223                         {
9224                                 XLogCtl->Insert.lastBackupStart = startpoint;
9225                                 gotUniqueStartpoint = true;
9226                         }
9227                         LWLockRelease(WALInsertLock);
9228                 } while (!gotUniqueStartpoint);
9229
9230                 XLByteToSeg(startpoint, _logId, _logSeg);
9231                 XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
9232
9233                 /*
9234                  * Construct backup label file
9235                  */
9236                 initStringInfo(&labelfbuf);
9237
9238                 /* Use the log timezone here, not the session timezone */
9239                 stamp_time = (pg_time_t) time(NULL);
9240                 pg_strftime(strfbuf, sizeof(strfbuf),
9241                                         "%Y-%m-%d %H:%M:%S %Z",
9242                                         pg_localtime(&stamp_time, log_timezone));
9243                 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
9244                                                  startpoint.xlogid, startpoint.xrecoff, xlogfilename);
9245                 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
9246                                                  checkpointloc.xlogid, checkpointloc.xrecoff);
9247                 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
9248                                                  exclusive ? "pg_start_backup" : "streamed");
9249                 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
9250                                                  backup_started_in_recovery ? "standby" : "master");
9251                 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
9252                 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
9253
9254                 /*
9255                  * Okay, write the file, or return its contents to caller.
9256                  */
9257                 if (exclusive)
9258                 {
9259                         /*
9260                          * Check for existing backup label --- implies a backup is already
9261                          * running.  (XXX given that we checked exclusiveBackup above,
9262                          * maybe it would be OK to just unlink any such label file?)
9263                          */
9264                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
9265                         {
9266                                 if (errno != ENOENT)
9267                                         ereport(ERROR,
9268                                                         (errcode_for_file_access(),
9269                                                          errmsg("could not stat file \"%s\": %m",
9270                                                                         BACKUP_LABEL_FILE)));
9271                         }
9272                         else
9273                                 ereport(ERROR,
9274                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9275                                                  errmsg("a backup is already in progress"),
9276                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
9277                                                                  BACKUP_LABEL_FILE)));
9278
9279                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
9280
9281                         if (!fp)
9282                                 ereport(ERROR,
9283                                                 (errcode_for_file_access(),
9284                                                  errmsg("could not create file \"%s\": %m",
9285                                                                 BACKUP_LABEL_FILE)));
9286                         if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
9287                                 fflush(fp) != 0 ||
9288                                 ferror(fp) ||
9289                                 FreeFile(fp))
9290                                 ereport(ERROR,
9291                                                 (errcode_for_file_access(),
9292                                                  errmsg("could not write file \"%s\": %m",
9293                                                                 BACKUP_LABEL_FILE)));
9294                         pfree(labelfbuf.data);
9295                 }
9296                 else
9297                         *labelfile = labelfbuf.data;
9298         }
9299         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9300
9301         /*
9302          * We're done.  As a convenience, return the starting WAL location.
9303          */
9304         return startpoint;
9305 }
9306
9307 /* Error cleanup callback for pg_start_backup */
9308 static void
9309 pg_start_backup_callback(int code, Datum arg)
9310 {
9311         bool            exclusive = DatumGetBool(arg);
9312
9313         /* Update backup counters and forcePageWrites on failure */
9314         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9315         if (exclusive)
9316         {
9317                 Assert(XLogCtl->Insert.exclusiveBackup);
9318                 XLogCtl->Insert.exclusiveBackup = false;
9319         }
9320         else
9321         {
9322                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9323                 XLogCtl->Insert.nonExclusiveBackups--;
9324         }
9325
9326         if (!XLogCtl->Insert.exclusiveBackup &&
9327                 XLogCtl->Insert.nonExclusiveBackups == 0)
9328         {
9329                 XLogCtl->Insert.forcePageWrites = false;
9330         }
9331         LWLockRelease(WALInsertLock);
9332 }
9333
9334 /*
9335  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
9336  * function.
9337
9338  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
9339  * the non-exclusive backup specified by 'labelfile'.
9340  */
9341 XLogRecPtr
9342 do_pg_stop_backup(char *labelfile, bool waitforarchive)
9343 {
9344         bool            exclusive = (labelfile == NULL);
9345         bool            backup_started_in_recovery = false;
9346         XLogRecPtr      startpoint;
9347         XLogRecPtr      stoppoint;
9348         XLogRecData rdata;
9349         pg_time_t       stamp_time;
9350         char            strfbuf[128];
9351         char            histfilepath[MAXPGPATH];
9352         char            startxlogfilename[MAXFNAMELEN];
9353         char            stopxlogfilename[MAXFNAMELEN];
9354         char            lastxlogfilename[MAXFNAMELEN];
9355         char            histfilename[MAXFNAMELEN];
9356         char            backupfrom[20];
9357         uint32          _logId;
9358         uint32          _logSeg;
9359         FILE       *lfp;
9360         FILE       *fp;
9361         char            ch;
9362         int                     seconds_before_warning;
9363         int                     waits = 0;
9364         bool            reported_waiting = false;
9365         char       *remaining;
9366         char       *ptr;
9367
9368         backup_started_in_recovery = RecoveryInProgress();
9369
9370         if (!superuser() && !is_authenticated_user_replication_role())
9371                 ereport(ERROR,
9372                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
9373                  (errmsg("must be superuser or replication role to run a backup"))));
9374
9375         /*
9376          * Currently only non-exclusive backup can be taken during recovery.
9377          */
9378         if (backup_started_in_recovery && exclusive)
9379                 ereport(ERROR,
9380                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9381                                  errmsg("recovery is in progress"),
9382                                  errhint("WAL control functions cannot be executed during recovery.")));
9383
9384         /*
9385          * During recovery, we don't need to check WAL level. Because, if WAL level
9386          * is not sufficient, it's impossible to get here during recovery.
9387          */
9388         if (!backup_started_in_recovery && !XLogIsNeeded())
9389                 ereport(ERROR,
9390                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9391                           errmsg("WAL level not sufficient for making an online backup"),
9392                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
9393
9394         /*
9395          * OK to update backup counters and forcePageWrites
9396          */
9397         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9398         if (exclusive)
9399                 XLogCtl->Insert.exclusiveBackup = false;
9400         else
9401         {
9402                 /*
9403                  * The user-visible pg_start/stop_backup() functions that operate on
9404                  * exclusive backups can be called at any time, but for non-exclusive
9405                  * backups, it is expected that each do_pg_start_backup() call is
9406                  * matched by exactly one do_pg_stop_backup() call.
9407                  */
9408                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9409                 XLogCtl->Insert.nonExclusiveBackups--;
9410         }
9411
9412         if (!XLogCtl->Insert.exclusiveBackup &&
9413                 XLogCtl->Insert.nonExclusiveBackups == 0)
9414         {
9415                 XLogCtl->Insert.forcePageWrites = false;
9416         }
9417         LWLockRelease(WALInsertLock);
9418
9419         if (exclusive)
9420         {
9421                 /*
9422                  * Read the existing label file into memory.
9423                  */
9424                 struct stat statbuf;
9425                 int                     r;
9426
9427                 if (stat(BACKUP_LABEL_FILE, &statbuf))
9428                 {
9429                         if (errno != ENOENT)
9430                                 ereport(ERROR,
9431                                                 (errcode_for_file_access(),
9432                                                  errmsg("could not stat file \"%s\": %m",
9433                                                                 BACKUP_LABEL_FILE)));
9434                         ereport(ERROR,
9435                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9436                                          errmsg("a backup is not in progress")));
9437                 }
9438
9439                 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9440                 if (!lfp)
9441                 {
9442                         ereport(ERROR,
9443                                         (errcode_for_file_access(),
9444                                          errmsg("could not read file \"%s\": %m",
9445                                                         BACKUP_LABEL_FILE)));
9446                 }
9447                 labelfile = palloc(statbuf.st_size + 1);
9448                 r = fread(labelfile, statbuf.st_size, 1, lfp);
9449                 labelfile[statbuf.st_size] = '\0';
9450
9451                 /*
9452                  * Close and remove the backup label file
9453                  */
9454                 if (r != 1 || ferror(lfp) || FreeFile(lfp))
9455                         ereport(ERROR,
9456                                         (errcode_for_file_access(),
9457                                          errmsg("could not read file \"%s\": %m",
9458                                                         BACKUP_LABEL_FILE)));
9459                 if (unlink(BACKUP_LABEL_FILE) != 0)
9460                         ereport(ERROR,
9461                                         (errcode_for_file_access(),
9462                                          errmsg("could not remove file \"%s\": %m",
9463                                                         BACKUP_LABEL_FILE)));
9464         }
9465
9466         /*
9467          * Read and parse the START WAL LOCATION line (this code is pretty crude,
9468          * but we are not expecting any variability in the file format).
9469          */
9470         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
9471                            &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
9472                            &ch) != 4 || ch != '\n')
9473                 ereport(ERROR,
9474                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9475                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9476         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
9477
9478         /*
9479          * Parse the BACKUP FROM line. If we are taking an online backup from
9480          * the standby, we confirm that the standby has not been promoted
9481          * during the backup.
9482          */
9483         ptr = strstr(remaining, "BACKUP FROM:");
9484         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
9485                 ereport(ERROR,
9486                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9487                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9488         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
9489                 ereport(ERROR,
9490                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9491                                  errmsg("the standby was promoted during online backup"),
9492                                  errhint("This means that the backup being taken is corrupt "
9493                                                  "and should not be used. "
9494                                                  "Try taking another online backup.")));
9495
9496         /*
9497          * During recovery, we don't write an end-of-backup record. We assume
9498          * that pg_control was backed up last and its minimum recovery
9499          * point can be available as the backup end location. Since we don't
9500          * have an end-of-backup record, we use the pg_control value to check
9501          * whether we've reached the end of backup when starting recovery from
9502          * this backup. We have no way of checking if pg_control wasn't backed
9503          * up last however.
9504          *
9505          * We don't force a switch to new WAL file and wait for all the required
9506          * files to be archived. This is okay if we use the backup to start
9507          * the standby. But, if it's for an archive recovery, to ensure all the
9508          * required files are available, a user should wait for them to be archived,
9509          * or include them into the backup.
9510          *
9511          * We return the current minimum recovery point as the backup end
9512          * location. Note that it would be bigger than the exact backup end
9513          * location if the minimum recovery point is updated since the backup
9514          * of pg_control. This is harmless for current uses.
9515          *
9516          * XXX currently a backup history file is for informational and debug
9517          * purposes only. It's not essential for an online backup. Furthermore,
9518          * even if it's created, it will not be archived during recovery because
9519          * an archiver is not invoked. So it doesn't seem worthwhile to write
9520          * a backup history file during recovery.
9521          */
9522         if (backup_started_in_recovery)
9523         {
9524                 /* use volatile pointer to prevent code rearrangement */
9525                 volatile XLogCtlData *xlogctl = XLogCtl;
9526                 XLogRecPtr      recptr;
9527
9528                 /*
9529                  * Check to see if all WAL replayed during online backup contain
9530                  * full-page writes.
9531                  */
9532                 SpinLockAcquire(&xlogctl->info_lck);
9533                 recptr = xlogctl->lastFpwDisableRecPtr;
9534                 SpinLockRelease(&xlogctl->info_lck);
9535
9536                 if (XLByteLE(startpoint, recptr))
9537                         ereport(ERROR,
9538                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9539                                          errmsg("WAL generated with full_page_writes=off was replayed "
9540                                                         "during online backup"),
9541                                          errhint("This means that the backup being taken on standby "
9542                                                          "is corrupt and should not be used. "
9543                                                          "Enable full_page_writes and run CHECKPOINT on the master, "
9544                                                          "and then try an online backup again.")));
9545
9546
9547                 LWLockAcquire(ControlFileLock, LW_SHARED);
9548                 stoppoint = ControlFile->minRecoveryPoint;
9549                 LWLockRelease(ControlFileLock);
9550
9551                 return stoppoint;
9552         }
9553
9554         /*
9555          * Write the backup-end xlog record
9556          */
9557         rdata.data = (char *) (&startpoint);
9558         rdata.len = sizeof(startpoint);
9559         rdata.buffer = InvalidBuffer;
9560         rdata.next = NULL;
9561         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
9562
9563         /*
9564          * Force a switch to a new xlog segment file, so that the backup is valid
9565          * as soon as archiver moves out the current segment file.
9566          */
9567         RequestXLogSwitch();
9568
9569         XLByteToPrevSeg(stoppoint, _logId, _logSeg);
9570         XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);
9571
9572         /* Use the log timezone here, not the session timezone */
9573         stamp_time = (pg_time_t) time(NULL);
9574         pg_strftime(strfbuf, sizeof(strfbuf),
9575                                 "%Y-%m-%d %H:%M:%S %Z",
9576                                 pg_localtime(&stamp_time, log_timezone));
9577
9578         /*
9579          * Write the backup history file
9580          */
9581         XLByteToSeg(startpoint, _logId, _logSeg);
9582         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
9583                                                   startpoint.xrecoff % XLogSegSize);
9584         fp = AllocateFile(histfilepath, "w");
9585         if (!fp)
9586                 ereport(ERROR,
9587                                 (errcode_for_file_access(),
9588                                  errmsg("could not create file \"%s\": %m",
9589                                                 histfilepath)));
9590         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
9591                         startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
9592         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
9593                         stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
9594         /* transfer remaining lines from label to history file */
9595         fprintf(fp, "%s", remaining);
9596         fprintf(fp, "STOP TIME: %s\n", strfbuf);
9597         if (fflush(fp) || ferror(fp) || FreeFile(fp))
9598                 ereport(ERROR,
9599                                 (errcode_for_file_access(),
9600                                  errmsg("could not write file \"%s\": %m",
9601                                                 histfilepath)));
9602
9603         /*
9604          * Clean out any no-longer-needed history files.  As a side effect, this
9605          * will post a .ready file for the newly created history file, notifying
9606          * the archiver that history file may be archived immediately.
9607          */
9608         CleanupBackupHistory();
9609
9610         /*
9611          * If archiving is enabled, wait for all the required WAL files to be
9612          * archived before returning. If archiving isn't enabled, the required WAL
9613          * needs to be transported via streaming replication (hopefully with
9614          * wal_keep_segments set high enough), or some more exotic mechanism like
9615          * polling and copying files from pg_xlog with script. We have no
9616          * knowledge of those mechanisms, so it's up to the user to ensure that he
9617          * gets all the required WAL.
9618          *
9619          * We wait until both the last WAL file filled during backup and the
9620          * history file have been archived, and assume that the alphabetic sorting
9621          * property of the WAL files ensures any earlier WAL files are safely
9622          * archived as well.
9623          *
9624          * We wait forever, since archive_command is supposed to work and we
9625          * assume the admin wanted his backup to work completely. If you don't
9626          * wish to wait, you can set statement_timeout.  Also, some notices are
9627          * issued to clue in anyone who might be doing this interactively.
9628          */
9629         if (waitforarchive && XLogArchivingActive())
9630         {
9631                 XLByteToPrevSeg(stoppoint, _logId, _logSeg);
9632                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);
9633
9634                 XLByteToSeg(startpoint, _logId, _logSeg);
9635                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logId, _logSeg,
9636                                                           startpoint.xrecoff % XLogSegSize);
9637
9638                 seconds_before_warning = 60;
9639                 waits = 0;
9640
9641                 while (XLogArchiveIsBusy(lastxlogfilename) ||
9642                            XLogArchiveIsBusy(histfilename))
9643                 {
9644                         CHECK_FOR_INTERRUPTS();
9645
9646                         if (!reported_waiting && waits > 5)
9647                         {
9648                                 ereport(NOTICE,
9649                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
9650                                 reported_waiting = true;
9651                         }
9652
9653                         pg_usleep(1000000L);
9654
9655                         if (++waits >= seconds_before_warning)
9656                         {
9657                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
9658                                 ereport(WARNING,
9659                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
9660                                                                 waits),
9661                                                  errhint("Check that your archive_command is executing properly.  "
9662                                                                  "pg_stop_backup can be canceled safely, "
9663                                                                  "but the database backup will not be usable without all the WAL segments.")));
9664                         }
9665                 }
9666
9667                 ereport(NOTICE,
9668                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
9669         }
9670         else if (waitforarchive)
9671                 ereport(NOTICE,
9672                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
9673
9674         /*
9675          * We're done.  As a convenience, return the ending WAL location.
9676          */
9677         return stoppoint;
9678 }
9679
9680
9681 /*
9682  * do_pg_abort_backup: abort a running backup
9683  *
9684  * This does just the most basic steps of do_pg_stop_backup(), by taking the
9685  * system out of backup mode, thus making it a lot more safe to call from
9686  * an error handler.
9687  *
9688  * NB: This is only for aborting a non-exclusive backup that doesn't write
9689  * backup_label. A backup started with pg_stop_backup() needs to be finished
9690  * with pg_stop_backup().
9691  */
9692 void
9693 do_pg_abort_backup(void)
9694 {
9695         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9696         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9697         XLogCtl->Insert.nonExclusiveBackups--;
9698
9699         if (!XLogCtl->Insert.exclusiveBackup &&
9700                 XLogCtl->Insert.nonExclusiveBackups == 0)
9701         {
9702                 XLogCtl->Insert.forcePageWrites = false;
9703         }
9704         LWLockRelease(WALInsertLock);
9705 }
9706
9707 /*
9708  * Get latest redo apply position.
9709  *
9710  * Optionally, returns the end byte position of the last restored
9711  * WAL segment. Callers not interested in that value may pass
9712  * NULL for restoreLastRecPtr.
9713  *
9714  * Exported to allow WALReceiver to read the pointer directly.
9715  */
9716 XLogRecPtr
9717 GetXLogReplayRecPtr(XLogRecPtr *restoreLastRecPtr)
9718 {
9719         /* use volatile pointer to prevent code rearrangement */
9720         volatile XLogCtlData *xlogctl = XLogCtl;
9721         XLogRecPtr      recptr;
9722
9723         SpinLockAcquire(&xlogctl->info_lck);
9724         recptr = xlogctl->recoveryLastRecPtr;
9725         if (restoreLastRecPtr)
9726                 *restoreLastRecPtr = xlogctl->restoreLastRecPtr;
9727         SpinLockRelease(&xlogctl->info_lck);
9728
9729         return recptr;
9730 }
9731
9732 /*
9733  * Get current standby flush position, ie, the last WAL position
9734  * known to be fsync'd to disk in standby.
9735  */
9736 XLogRecPtr
9737 GetStandbyFlushRecPtr(void)
9738 {
9739         XLogRecPtr      receivePtr;
9740         XLogRecPtr      replayPtr;
9741         XLogRecPtr      restorePtr;
9742
9743         receivePtr = GetWalRcvWriteRecPtr(NULL);
9744         replayPtr = GetXLogReplayRecPtr(&restorePtr);
9745
9746         if (XLByteLT(receivePtr, replayPtr))
9747                 return XLByteLT(replayPtr, restorePtr) ? restorePtr : replayPtr;
9748         else
9749                 return XLByteLT(receivePtr, restorePtr) ? restorePtr : receivePtr;
9750 }
9751
9752 /*
9753  * Get latest WAL insert pointer
9754  */
9755 XLogRecPtr
9756 GetXLogInsertRecPtr(void)
9757 {
9758         XLogCtlInsert *Insert = &XLogCtl->Insert;
9759         XLogRecPtr      current_recptr;
9760
9761         LWLockAcquire(WALInsertLock, LW_SHARED);
9762         INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
9763         LWLockRelease(WALInsertLock);
9764
9765         return current_recptr;
9766 }
9767
9768 /*
9769  * Get latest WAL write pointer
9770  */
9771 XLogRecPtr
9772 GetXLogWriteRecPtr(void)
9773 {
9774         {
9775                 /* use volatile pointer to prevent code rearrangement */
9776                 volatile XLogCtlData *xlogctl = XLogCtl;
9777
9778                 SpinLockAcquire(&xlogctl->info_lck);
9779                 LogwrtResult = xlogctl->LogwrtResult;
9780                 SpinLockRelease(&xlogctl->info_lck);
9781         }
9782
9783         return LogwrtResult.Write;
9784 }
9785
9786 /*
9787  * read_backup_label: check to see if a backup_label file is present
9788  *
9789  * If we see a backup_label during recovery, we assume that we are recovering
9790  * from a backup dump file, and we therefore roll forward from the checkpoint
9791  * identified by the label file, NOT what pg_control says.      This avoids the
9792  * problem that pg_control might have been archived one or more checkpoints
9793  * later than the start of the dump, and so if we rely on it as the start
9794  * point, we will fail to restore a consistent database state.
9795  *
9796  * Returns TRUE if a backup_label was found (and fills the checkpoint
9797  * location and its REDO location into *checkPointLoc and RedoStartLSN,
9798  * respectively); returns FALSE if not. If this backup_label came from a
9799  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
9800  * was created during recovery, *backupFromStandby is set to TRUE.
9801  */
9802 static bool
9803 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
9804                                   bool *backupFromStandby)
9805 {
9806         char            startxlogfilename[MAXFNAMELEN];
9807         TimeLineID      tli;
9808         FILE       *lfp;
9809         char            ch;
9810         char            backuptype[20];
9811         char            backupfrom[20];
9812
9813         *backupEndRequired = false;
9814         *backupFromStandby = false;
9815
9816         /*
9817          * See if label file is present
9818          */
9819         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9820         if (!lfp)
9821         {
9822                 if (errno != ENOENT)
9823                         ereport(FATAL,
9824                                         (errcode_for_file_access(),
9825                                          errmsg("could not read file \"%s\": %m",
9826                                                         BACKUP_LABEL_FILE)));
9827                 return false;                   /* it's not there, all is fine */
9828         }
9829
9830         /*
9831          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
9832          * is pretty crude, but we are not expecting any variability in the file
9833          * format).
9834          */
9835         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
9836                            &RedoStartLSN.xlogid, &RedoStartLSN.xrecoff, &tli,
9837                            startxlogfilename, &ch) != 5 || ch != '\n')
9838                 ereport(FATAL,
9839                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9840                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9841         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
9842                            &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
9843                            &ch) != 3 || ch != '\n')
9844                 ereport(FATAL,
9845                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9846                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9847         /*
9848          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't
9849          * restore from an older backup anyway, but since the information on it
9850          * is not strictly required, don't error out if it's missing for some reason.
9851          */
9852         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
9853         {
9854                 if (strcmp(backuptype, "streamed") == 0)
9855                         *backupEndRequired = true;
9856         }
9857
9858         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
9859         {
9860                 if (strcmp(backupfrom, "standby") == 0)
9861                         *backupFromStandby = true;
9862         }
9863
9864         if (ferror(lfp) || FreeFile(lfp))
9865                 ereport(FATAL,
9866                                 (errcode_for_file_access(),
9867                                  errmsg("could not read file \"%s\": %m",
9868                                                 BACKUP_LABEL_FILE)));
9869
9870         return true;
9871 }
9872
9873 /*
9874  * Error context callback for errors occurring during rm_redo().
9875  */
9876 static void
9877 rm_redo_error_callback(void *arg)
9878 {
9879         XLogRecord *record = (XLogRecord *) arg;
9880         StringInfoData buf;
9881
9882         initStringInfo(&buf);
9883         RmgrTable[record->xl_rmid].rm_desc(&buf,
9884                                                                            record->xl_info,
9885                                                                            XLogRecGetData(record));
9886
9887         /* don't bother emitting empty description */
9888         if (buf.len > 0)
9889                 errcontext("xlog redo %s", buf.data);
9890
9891         pfree(buf.data);
9892 }
9893
9894 /*
9895  * BackupInProgress: check if online backup mode is active
9896  *
9897  * This is done by checking for existence of the "backup_label" file.
9898  */
9899 bool
9900 BackupInProgress(void)
9901 {
9902         struct stat stat_buf;
9903
9904         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
9905 }
9906
9907 /*
9908  * CancelBackup: rename the "backup_label" file to cancel backup mode
9909  *
9910  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
9911  * Note that this will render an online backup in progress useless.
9912  * To correctly finish an online backup, pg_stop_backup must be called.
9913  */
9914 void
9915 CancelBackup(void)
9916 {
9917         struct stat stat_buf;
9918
9919         /* if the file is not there, return */
9920         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
9921                 return;
9922
9923         /* remove leftover file from previously canceled backup if it exists */
9924         unlink(BACKUP_LABEL_OLD);
9925
9926         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
9927         {
9928                 ereport(LOG,
9929                                 (errmsg("online backup mode canceled"),
9930                                  errdetail("\"%s\" was renamed to \"%s\".",
9931                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
9932         }
9933         else
9934         {
9935                 ereport(WARNING,
9936                                 (errcode_for_file_access(),
9937                                  errmsg("online backup mode was not canceled"),
9938                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
9939                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
9940         }
9941 }
9942
9943 /*
9944  * Read the XLOG page containing RecPtr into readBuf (if not read already).
9945  * Returns true if the page is read successfully.
9946  *
9947  * This is responsible for restoring files from archive as needed, as well
9948  * as for waiting for the requested WAL record to arrive in standby mode.
9949  *
9950  * 'emode' specifies the log level used for reporting "file not found" or
9951  * "end of WAL" situations in archive recovery, or in standby mode when a
9952  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
9953  * false in those situations, on higher log levels the ereport() won't
9954  * return.
9955  *
9956  * In standby mode, if after a successful return of XLogPageRead() the
9957  * caller finds the record it's interested in to be broken, it should
9958  * ereport the error with the level determined by
9959  * emode_for_corrupt_record(), and then set "failedSources |= readSource"
9960  * and call XLogPageRead() again with the same arguments. This lets
9961  * XLogPageRead() try fetching the record from another source, or
9962  * sleep and retry.
9963  */
9964 static bool
9965 XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
9966                          bool randAccess)
9967 {
9968         static XLogRecPtr receivedUpto = {0, 0};
9969         bool            switched_segment = false;
9970         uint32          targetPageOff;
9971         uint32          targetRecOff;
9972         uint32          targetId;
9973         uint32          targetSeg;
9974         static pg_time_t last_fail_time = 0;
9975
9976         XLByteToSeg(*RecPtr, targetId, targetSeg);
9977         targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
9978         targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
9979
9980         /* Fast exit if we have read the record in the current buffer already */
9981         if (failedSources == 0 && targetId == readId && targetSeg == readSeg &&
9982                 targetPageOff == readOff && targetRecOff < readLen)
9983                 return true;
9984
9985         /*
9986          * See if we need to switch to a new segment because the requested record
9987          * is not in the currently open one.
9988          */
9989         if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
9990         {
9991                 /*
9992                  * Request a restartpoint if we've replayed too much
9993                  * xlog since the last one.
9994                  */
9995                 if (StandbyMode && bgwriterLaunched)
9996                 {
9997                         if (XLogCheckpointNeeded(readId, readSeg))
9998                         {
9999                                 (void) GetRedoRecPtr();
10000                                 if (XLogCheckpointNeeded(readId, readSeg))
10001                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
10002                         }
10003                 }
10004
10005                 close(readFile);
10006                 readFile = -1;
10007                 readSource = 0;
10008         }
10009
10010         XLByteToSeg(*RecPtr, readId, readSeg);
10011
10012 retry:
10013         /* See if we need to retrieve more data */
10014         if (readFile < 0 ||
10015                 (readSource == XLOG_FROM_STREAM && !XLByteLT(*RecPtr, receivedUpto)))
10016         {
10017                 if (StandbyMode)
10018                 {
10019                         /*
10020                          * In standby mode, wait for the requested record to become
10021                          * available, either via restore_command succeeding to restore the
10022                          * segment, or via walreceiver having streamed the record.
10023                          */
10024                         for (;;)
10025                         {
10026                                 if (WalRcvInProgress())
10027                                 {
10028                                         bool            havedata;
10029
10030                                         /*
10031                                          * If we find an invalid record in the WAL streamed from
10032                                          * master, something is seriously wrong. There's little
10033                                          * chance that the problem will just go away, but PANIC is
10034                                          * not good for availability either, especially in hot
10035                                          * standby mode. Disconnect, and retry from
10036                                          * archive/pg_xlog again. The WAL in the archive should be
10037                                          * identical to what was streamed, so it's unlikely that
10038                                          * it helps, but one can hope...
10039                                          */
10040                                         if (failedSources & XLOG_FROM_STREAM)
10041                                         {
10042                                                 ShutdownWalRcv();
10043                                                 continue;
10044                                         }
10045
10046                                         /*
10047                                          * Walreceiver is active, so see if new data has arrived.
10048                                          *
10049                                          * We only advance XLogReceiptTime when we obtain fresh
10050                                          * WAL from walreceiver and observe that we had already
10051                                          * processed everything before the most recent "chunk"
10052                                          * that it flushed to disk.  In steady state where we are
10053                                          * keeping up with the incoming data, XLogReceiptTime will
10054                                          * be updated on each cycle.  When we are behind,
10055                                          * XLogReceiptTime will not advance, so the grace time
10056                                          * allotted to conflicting queries will decrease.
10057                                          */
10058                                         if (XLByteLT(*RecPtr, receivedUpto))
10059                                                 havedata = true;
10060                                         else
10061                                         {
10062                                                 XLogRecPtr      latestChunkStart;
10063
10064                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
10065                                                 if (XLByteLT(*RecPtr, receivedUpto))
10066                                                 {
10067                                                         havedata = true;
10068                                                         if (!XLByteLT(*RecPtr, latestChunkStart))
10069                                                         {
10070                                                                 XLogReceiptTime = GetCurrentTimestamp();
10071                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
10072                                                         }
10073                                                 }
10074                                                 else
10075                                                         havedata = false;
10076                                         }
10077                                         if (havedata)
10078                                         {
10079                                                 /*
10080                                                  * Great, streamed far enough. Open the file if it's
10081                                                  * not open already.  Use XLOG_FROM_STREAM so that
10082                                                  * source info is set correctly and XLogReceiptTime
10083                                                  * isn't changed.
10084                                                  */
10085                                                 if (readFile < 0)
10086                                                 {
10087                                                         readFile =
10088                                                                 XLogFileRead(readId, readSeg, PANIC,
10089                                                                                          recoveryTargetTLI,
10090                                                                                          XLOG_FROM_STREAM, false);
10091                                                         Assert(readFile >= 0);
10092                                                         switched_segment = true;
10093                                                 }
10094                                                 else
10095                                                 {
10096                                                         /* just make sure source info is correct... */
10097                                                         readSource = XLOG_FROM_STREAM;
10098                                                         XLogReceiptSource = XLOG_FROM_STREAM;
10099                                                 }
10100                                                 break;
10101                                         }
10102
10103                                         /*
10104                                          * Data not here yet, so check for trigger then sleep for
10105                                          * five seconds like in the WAL file polling case below.
10106                                          */
10107                                         if (CheckForStandbyTrigger())
10108                                                 goto retry;
10109
10110                                         /*
10111                                          * Wait for more WAL to arrive, or timeout to be reached
10112                                          */
10113                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
10114                                                           WL_LATCH_SET | WL_TIMEOUT,
10115                                                           5000L);
10116                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
10117                                 }
10118                                 else
10119                                 {
10120                                         int                     sources;
10121                                         pg_time_t       now;
10122
10123                                         /*
10124                                          * Until walreceiver manages to reconnect, poll the
10125                                          * archive.
10126                                          */
10127                                         if (readFile >= 0)
10128                                         {
10129                                                 close(readFile);
10130                                                 readFile = -1;
10131                                         }
10132                                         /* Reset curFileTLI if random fetch. */
10133                                         if (randAccess)
10134                                                 curFileTLI = 0;
10135
10136                                         /*
10137                                          * Try to restore the file from archive, or read an
10138                                          * existing file from pg_xlog.
10139                                          */
10140                                         sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG;
10141                                         if (!(sources & ~failedSources))
10142                                         {
10143                                                 /*
10144                                                  * We've exhausted all options for retrieving the
10145                                                  * file. Retry.
10146                                                  */
10147                                                 failedSources = 0;
10148
10149                                                 /*
10150                                                  * Before we sleep, re-scan for possible new timelines
10151                                                  * if we were requested to recover to the latest
10152                                                  * timeline.
10153                                                  */
10154                                                 if (recoveryTargetIsLatest)
10155                                                 {
10156                                                         if (rescanLatestTimeLine())
10157                                                                 continue;
10158                                                 }
10159
10160                                                 /*
10161                                                  * If it hasn't been long since last attempt, sleep to
10162                                                  * avoid busy-waiting.
10163                                                  */
10164                                                 now = (pg_time_t) time(NULL);
10165                                                 if ((now - last_fail_time) < 5)
10166                                                 {
10167                                                         pg_usleep(1000000L * (5 - (now - last_fail_time)));
10168                                                         now = (pg_time_t) time(NULL);
10169                                                 }
10170                                                 last_fail_time = now;
10171
10172                                                 /*
10173                                                  * If primary_conninfo is set, launch walreceiver to
10174                                                  * try to stream the missing WAL, before retrying to
10175                                                  * restore from archive/pg_xlog.
10176                                                  *
10177                                                  * If fetching_ckpt is TRUE, RecPtr points to the
10178                                                  * initial checkpoint location. In that case, we use
10179                                                  * RedoStartLSN as the streaming start position
10180                                                  * instead of RecPtr, so that when we later jump
10181                                                  * backwards to start redo at RedoStartLSN, we will
10182                                                  * have the logs streamed already.
10183                                                  */
10184                                                 if (PrimaryConnInfo)
10185                                                 {
10186                                                         RequestXLogStreaming(
10187                                                                           fetching_ckpt ? RedoStartLSN : *RecPtr,
10188                                                                                                  PrimaryConnInfo);
10189                                                         continue;
10190                                                 }
10191                                         }
10192                                         /* Don't try to read from a source that just failed */
10193                                         sources &= ~failedSources;
10194                                         readFile = XLogFileReadAnyTLI(readId, readSeg, DEBUG2,
10195                                                                                                   sources);
10196                                         switched_segment = true;
10197                                         if (readFile >= 0)
10198                                                 break;
10199
10200                                         /*
10201                                          * Nope, not found in archive and/or pg_xlog.
10202                                          */
10203                                         failedSources |= sources;
10204
10205                                         /*
10206                                          * Check to see if the trigger file exists. Note that we
10207                                          * do this only after failure, so when you create the
10208                                          * trigger file, we still finish replaying as much as we
10209                                          * can from archive and pg_xlog before failover.
10210                                          */
10211                                         if (CheckForStandbyTrigger())
10212                                                 goto triggered;
10213                                 }
10214
10215                                 /*
10216                                  * This possibly-long loop needs to handle interrupts of
10217                                  * startup process.
10218                                  */
10219                                 HandleStartupProcInterrupts();
10220                         }
10221                 }
10222                 else
10223                 {
10224                         /* In archive or crash recovery. */
10225                         if (readFile < 0)
10226                         {
10227                                 int                     sources;
10228
10229                                 /* Reset curFileTLI if random fetch. */
10230                                 if (randAccess)
10231                                         curFileTLI = 0;
10232
10233                                 sources = XLOG_FROM_PG_XLOG;
10234                                 if (InArchiveRecovery)
10235                                         sources |= XLOG_FROM_ARCHIVE;
10236
10237                                 readFile = XLogFileReadAnyTLI(readId, readSeg, emode,
10238                                                                                           sources);
10239                                 switched_segment = true;
10240                                 if (readFile < 0)
10241                                         return false;
10242                         }
10243                 }
10244         }
10245
10246         /*
10247          * At this point, we have the right segment open and if we're streaming we
10248          * know the requested record is in it.
10249          */
10250         Assert(readFile != -1);
10251
10252         /*
10253          * If the current segment is being streamed from master, calculate how
10254          * much of the current page we have received already. We know the
10255          * requested record has been received, but this is for the benefit of
10256          * future calls, to allow quick exit at the top of this function.
10257          */
10258         if (readSource == XLOG_FROM_STREAM)
10259         {
10260                 if (RecPtr->xlogid != receivedUpto.xlogid ||
10261                         (RecPtr->xrecoff / XLOG_BLCKSZ) != (receivedUpto.xrecoff / XLOG_BLCKSZ))
10262                 {
10263                         readLen = XLOG_BLCKSZ;
10264                 }
10265                 else
10266                         readLen = receivedUpto.xrecoff % XLogSegSize - targetPageOff;
10267         }
10268         else
10269                 readLen = XLOG_BLCKSZ;
10270
10271         if (switched_segment && targetPageOff != 0)
10272         {
10273                 /*
10274                  * Whenever switching to a new WAL segment, we read the first page of
10275                  * the file and validate its header, even if that's not where the
10276                  * target record is.  This is so that we can check the additional
10277                  * identification info that is present in the first page's "long"
10278                  * header.
10279                  */
10280                 readOff = 0;
10281                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
10282                 {
10283                         ereport(emode_for_corrupt_record(emode, *RecPtr),
10284                                         (errcode_for_file_access(),
10285                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
10286                                                         readId, readSeg, readOff)));
10287                         goto next_record_is_invalid;
10288                 }
10289                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
10290                         goto next_record_is_invalid;
10291         }
10292
10293         /* Read the requested page */
10294         readOff = targetPageOff;
10295         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
10296         {
10297                 ereport(emode_for_corrupt_record(emode, *RecPtr),
10298                                 (errcode_for_file_access(),
10299                  errmsg("could not seek in log file %u, segment %u to offset %u: %m",
10300                                 readId, readSeg, readOff)));
10301                 goto next_record_is_invalid;
10302         }
10303         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
10304         {
10305                 ereport(emode_for_corrupt_record(emode, *RecPtr),
10306                                 (errcode_for_file_access(),
10307                  errmsg("could not read from log file %u, segment %u, offset %u: %m",
10308                                 readId, readSeg, readOff)));
10309                 goto next_record_is_invalid;
10310         }
10311         if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
10312                 goto next_record_is_invalid;
10313
10314         Assert(targetId == readId);
10315         Assert(targetSeg == readSeg);
10316         Assert(targetPageOff == readOff);
10317         Assert(targetRecOff < readLen);
10318
10319         return true;
10320
10321 next_record_is_invalid:
10322         failedSources |= readSource;
10323
10324         if (readFile >= 0)
10325                 close(readFile);
10326         readFile = -1;
10327         readLen = 0;
10328         readSource = 0;
10329
10330         /* In standby-mode, keep trying */
10331         if (StandbyMode)
10332                 goto retry;
10333         else
10334                 return false;
10335
10336 triggered:
10337         if (readFile >= 0)
10338                 close(readFile);
10339         readFile = -1;
10340         readLen = 0;
10341         readSource = 0;
10342
10343         return false;
10344 }
10345
10346 /*
10347  * Determine what log level should be used to report a corrupt WAL record
10348  * in the current WAL page, previously read by XLogPageRead().
10349  *
10350  * 'emode' is the error mode that would be used to report a file-not-found
10351  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
10352  * we're retrying the exact same record that we've tried previously, only
10353  * complain the first time to keep the noise down.      However, we only do when
10354  * reading from pg_xlog, because we don't expect any invalid records in archive
10355  * or in records streamed from master. Files in the archive should be complete,
10356  * and we should never hit the end of WAL because we stop and wait for more WAL
10357  * to arrive before replaying it.
10358  *
10359  * NOTE: This function remembers the RecPtr value it was last called with,
10360  * to suppress repeated messages about the same record. Only call this when
10361  * you are about to ereport(), or you might cause a later message to be
10362  * erroneously suppressed.
10363  */
10364 static int
10365 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
10366 {
10367         static XLogRecPtr lastComplaint = {0, 0};
10368
10369         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
10370         {
10371                 if (XLByteEQ(RecPtr, lastComplaint))
10372                         emode = DEBUG1;
10373                 else
10374                         lastComplaint = RecPtr;
10375         }
10376         return emode;
10377 }
10378
10379 /*
10380  * Check to see whether the user-specified trigger file exists and whether a
10381  * promote request has arrived.  If either condition holds, request postmaster
10382  * to shut down walreceiver, wait for it to exit, and return true.
10383  */
10384 static bool
10385 CheckForStandbyTrigger(void)
10386 {
10387         struct stat stat_buf;
10388         static bool triggered = false;
10389
10390         if (triggered)
10391                 return true;
10392
10393         if (IsPromoteTriggered())
10394         {
10395                 ereport(LOG,
10396                                 (errmsg("received promote request")));
10397                 ShutdownWalRcv();
10398                 ResetPromoteTriggered();
10399                 triggered = true;
10400                 return true;
10401         }
10402
10403         if (TriggerFile == NULL)
10404                 return false;
10405
10406         if (stat(TriggerFile, &stat_buf) == 0)
10407         {
10408                 ereport(LOG,
10409                                 (errmsg("trigger file found: %s", TriggerFile)));
10410                 ShutdownWalRcv();
10411                 unlink(TriggerFile);
10412                 triggered = true;
10413                 return true;
10414         }
10415         return false;
10416 }
10417
10418 /*
10419  * Check to see if a promote request has arrived. Should be
10420  * called by postmaster after receiving SIGUSR1.
10421  */
10422 bool
10423 CheckPromoteSignal(void)
10424 {
10425         struct stat stat_buf;
10426
10427         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
10428         {
10429                 /*
10430                  * Since we are in a signal handler, it's not safe to elog. We
10431                  * silently ignore any error from unlink.
10432                  */
10433                 unlink(PROMOTE_SIGNAL_FILE);
10434                 return true;
10435         }
10436         return false;
10437 }
10438
10439 /*
10440  * Wake up startup process to replay newly arrived WAL, or to notice that
10441  * failover has been requested.
10442  */
10443 void
10444 WakeupRecovery(void)
10445 {
10446         SetLatch(&XLogCtl->recoveryWakeupLatch);
10447 }
10448
10449 /*
10450  * Manage the WALWriterLatch
10451  */
10452 Latch *
10453 WALWriterLatch(void)
10454 {
10455         return &XLogCtl->WALWriterLatch;
10456 }